diff --git a/src/common/snippets/include/snippets/pass/buffer_identification.hpp b/src/common/snippets/include/snippets/pass/buffer_identification.hpp index 170fadd79b3688..ecc058e8812b69 100644 --- a/src/common/snippets/include/snippets/pass/buffer_identification.hpp +++ b/src/common/snippets/include/snippets/pass/buffer_identification.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,22 +7,39 @@ #include #include +#include "snippets/snippets_isa.hpp" + namespace ngraph { namespace snippets { namespace pass { /** * @interface BufferIdentification - * @brief The pass set identifiers for Buffers in common Buffer system + * @brief The pass sets identifiers for Buffers in common Buffer system. + * The buffers with the same identifier have the same data register. + * The pass uses greedy graph coloring algorithm using adjacency matrix: + * - Buffers - are vertices of graph + * - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges). + * The buffers that are connected to the same Loop are adjacent in the graph sense. + * - The vertices (buffers) are adjacent if they are connected to the same Loop and + * their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes. 
+ * - Firstly, create adjacency matrix using the definition above + * - Secondly, color vertices of graph (buffers) using adjacency matrix * Note: should be called before ResetBuffer() pass to have correct offsets * @ingroup snippets */ class BufferIdentification: public ngraph::pass::FunctionPass { public: - OPENVINO_RTTI("InsertLoops", "0"); + OPENVINO_RTTI("BufferIdentification", "0"); BufferIdentification() = default; bool run_on_model(const std::shared_ptr& m) override; + +private: + using BufferSet = std::vector>; + + std::vector create_adjacency_matrix(const BufferSet& buffers); + std::map coloring(BufferSet& buffers, std::vector& adj); }; } // namespace pass diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 4b040f9be62899..dba0f139fda495 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -10,7 +10,6 @@ #include "snippets/op/subgraph.hpp" #include "snippets/op/kernel.hpp" #include -#include #include #include diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index ef25391bb705ea..73395bb9e2fafc 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -77,12 +77,19 @@ void snippets::op::Subgraph::init_config() { auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t { // The count of potential unique Buffers - it's hidden virtual ports as well // We should go through Subgraph and calculate potential non-inplace Buffers count. - // These Buffers can be only around Loops (for example, around MatMul they may be inplace). So we should - // check for element type size of nodes which are used Buffer to get rating from above for uniqe Buffer count. + // These Buffers can be only around Loops (for example, around MatMul without blocking (Loops around) they may be inplace). 
+ // So we should check for element type size of nodes which are used Buffer to get rating from above for unique Buffer count. // The count is estimated because when we calculate this number we have only original graph representation // and where will be Loops - we can just predict. // Note: The ops that create Buffers: MatMul, Transpose and Softmax (always FP32) std::vector used_precision_size; + + auto push_prc_size = [&used_precision_size](size_t precision_size) { + if (used_precision_size.empty() || used_precision_size.back() != precision_size) { + used_precision_size.push_back(precision_size); + } + }; + for (const auto& op : ops) { if (const auto transpose = ov::as_type_ptr(op)) { // At the moment Transposes are supported only on Results and Parameters but @@ -96,34 +103,23 @@ auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& op }) || !ov::is_type(transpose->get_input_node_shared_ptr(0)); if (are_prev_or_next_ops) { - const auto prc_size = transpose->get_element_type().size(); - if (used_precision_size.empty() || used_precision_size.back() != prc_size) { - used_precision_size.push_back(prc_size); - } + push_prc_size(transpose->get_element_type().size()); } } else if (ov::is_type(op) || ov::is_type(op)) { - // Softmax always uses 2 FP32 Buffers - const auto prc_size = ov::element::f32.size(); - if (used_precision_size.empty() || used_precision_size.back() != prc_size) { - used_precision_size.push_back(prc_size); - } + // Softmax always uses 2 FP32 Buffers after decomposition. 
+ // They are inplace and the same so we can push precision size only once + push_prc_size(ov::element::f32.size()); } else if (const auto matmul = ov::as_type_ptr(op)) { // First input check is enough because MatMul requires the same prc size on inputs if (!ov::is_type(matmul->get_input_node_shared_ptr(0)) || !ov::is_type(matmul->get_input_node_shared_ptr(1))) { - const auto prc_size = matmul->get_input_element_type(0).size(); - if (used_precision_size.empty() || used_precision_size.back() != prc_size) { - used_precision_size.push_back(prc_size); - } + push_prc_size(matmul->get_input_element_type(0).size()); } const auto consumers = matmul->get_output_target_inputs(0); if (std::none_of(consumers.begin(), consumers.end(), [](const ov::Input& in) { return ov::is_type(in.get_node()); })) { - const auto prc_size = matmul->get_element_type().size(); - if (used_precision_size.empty() || used_precision_size.back() != prc_size) { - used_precision_size.push_back(prc_size); - } + push_prc_size(matmul->get_element_type().size()); } } } diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 0f2de57a94bc22..0b8046a552560b 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -99,8 +99,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::map manually_assigned_gprs, manually_assigned_vecs; manual_assigning(f, ops, manually_assigned_gprs, manually_assigned_vecs); - const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; - auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, + const auto IS_MANUALLY_ASSIGNED_REG = SIZE_MAX; + auto enumerate_out_tensors = [IS_MANUALLY_ASSIGNED_REG] (const std::shared_ptr& op, decltype(regs_vec)& reg_map, const std::map& manually_assigned_regs, size_t& counter) { @@ -109,7 +109,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const 
std::shared_ptr // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) // so we have to check that the tensor has not been enumerated already if (reg_map.count(t) == 0) { - reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG; + reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ASSIGNED_REG; } } }; @@ -131,13 +131,13 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::vector> used_vec(ops.size(), std::set()); std::vector> defined_vec(ops.size(), std::set()); - auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector& tensors, const std::map& reg_map) { + auto tensor2reg = [IS_MANUALLY_ASSIGNED_REG] (const std::vector& tensors, const std::map& reg_map) { std::set result; for (const auto& t : tensors) { if (reg_map.count(t) == 0) throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor"); Reg reg_id = reg_map.at(t); - if (reg_id != IS_MANUALLY_ALLOCATED_REG) + if (reg_id != IS_MANUALLY_ASSIGNED_REG) result.insert(reg_id); } return result; @@ -298,10 +298,10 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::map assigned_regs(std::move(manually_assigned_gprs)); assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); - auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, + auto register_assigned_regs = [IS_MANUALLY_ASSIGNED_REG, &assigned_regs](const std::map& unique_regs, const std::map& unique2reused) { for (const auto& reg : unique_regs) { - if (reg.second == IS_MANUALLY_ALLOCATED_REG) + if (reg.second == IS_MANUALLY_ASSIGNED_REG) continue; if (unique2reused.count(reg.second) == 0) throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor"); diff --git a/src/common/snippets/src/pass/buffer_identification.cpp 
b/src/common/snippets/src/pass/buffer_identification.cpp index 1c7ea6363f480c..a6215308bf2dab 100644 --- a/src/common/snippets/src/pass/buffer_identification.cpp +++ b/src/common/snippets/src/pass/buffer_identification.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,24 +13,24 @@ namespace snippets { namespace pass { namespace { -using BufferSet = std::vector>; - -auto is_intermediate_buffer(const std::shared_ptr& op) -> std::shared_ptr { +std::shared_ptr is_intermediate_buffer(const std::shared_ptr& op) { const auto buffer = ov::as_type_ptr(op); return buffer && buffer->is_intermediate_memory() ? buffer : nullptr; } - +inline size_t index(size_t size, size_t row, size_t col) { + return col + row * size; +} } // namespace -auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector { +std::vector BufferIdentification::create_adjacency_matrix(const BufferIdentification::BufferSet& buffers) { // The sync point to check for adjency is Loop because only in Loop we increment pointers. 
// So if some Buffers in the one Loop have conflict (cannot be inplace: the same ptr increment and finalization offset) // they are called as adjacent const auto size = buffers.size(); std::vector adj(size * size, false); for (size_t i = 0; i < size; ++i) - adj[i + i * size] = true; + adj[index(size, i, i)] = true; auto update_adj_matrix = [&](const std::shared_ptr& buffer, size_t buffer_index, const std::shared_ptr& neighbour_buffer) { @@ -41,19 +41,19 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector { NGRAPH_CHECK(iter != buffers.cend(), "Buffer wasn't find in Buffer system of Subgraph"); const size_t adj_idx = std::distance(buffers.cbegin(), iter); - adj[buffer_index + adj_idx * size] = true; + adj[index(size, adj_idx, buffer_index)] = adj[index(size, buffer_index, adj_idx)] = true; } } }; for (size_t i = 0; i < buffers.size(); ++i) { - const auto buffer = buffers[i]; + const auto& buffer = buffers[i]; auto port = buffer->input_value(0).get_index(); auto parent = buffer->get_input_node_shared_ptr(0); // We iterate in While cycle to check nested Loops while (const auto loop_end = ov::as_type_ptr(parent)) { - const auto loop_begin = loop_end->get_loop_begin(); + const auto& loop_begin = loop_end->get_loop_begin(); for (const auto& input_value : loop_begin->input_values()) { auto loop_in = input_value.get_node_shared_ptr(); auto port_idx = input_value.get_index(); @@ -69,7 +69,7 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector { } for (const auto& output : loop_end->outputs()) { // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op - const auto target_inputs = output.get_target_inputs(); + const auto& target_inputs = output.get_target_inputs(); auto consumer_in = *target_inputs.begin(); auto port_idx = consumer_in.get_index(); auto consumer = consumer_in.get_node()->shared_from_this(); @@ -95,21 +95,35 @@ auto 
create_adjacency_matrix(const BufferSet& buffers) -> std::vector { return adj; } -auto coloring(BufferSet& buffers, std::vector& adj) -> std::map { +std::map BufferIdentification::coloring(BufferIdentification::BufferSet& buffers, std::vector& adj) { size_t color = 0; - std::map color_groups; + std::map color_groups; const auto size = buffers.size(); + // If the count of adjacent Buffers is equal to the count of all Buffers, + // it means that Buffers aren't adjacent to each other (they just have loops) + if (static_cast(std::count(adj.begin(), adj.end(), true)) == size) { + color_groups[color] = buffers; + return color_groups; + } + for (size_t i = 0; i < size; i++) { + // The Buffer is already colored (visited) - skip if (!buffers[i]) continue; - auto buffer = buffers[i]; + const auto& buffer = buffers[i]; color_groups[color].push_back(buffer); // Add to Color Group buffers[i] = nullptr; // Remove from graph vertices // while Buffer i has not coloured non-neighbours // (row i contains 0) while (!std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and())) { + // Find first non-adjacent and non-visited (non-colored) Buffer to color it the same color + // NOTE: At the moment Snippets don't guarantee that Buffer pointer won't be reset after Loop execution. + // So we cannot reuse Buffer pointer at second time and don't allow the following case: + // Buffer[0] -> ... -> Buffer[1] -> ... -> Buffer[0] + // To cover this case, we force a break when we find the first adjacent non-visited `vertex` + // Notice, this case will be supported in new infrastructure with Linear IR size_t j = i + 1; bool force_break = false; for (; j < size; ++j) { @@ -121,12 +135,19 @@ auto coloring(BufferSet& buffers, std::vector& adj) -> std::map()); } @@ -142,7 +163,7 @@ bool BufferIdentification::run_on_model(const std::shared_ptr& model) // Unite Buffers using Graph coloring algorithm. 
// Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case // so these Buffers are always IntermediateBuffer nonadjacent - BufferSet buffers; + BufferIdentification::BufferSet buffers; const auto ops = model->get_ordered_ops(); for (const auto& op : ops) { @@ -157,10 +178,9 @@ // Graph coloring algorithm const auto color_groups = coloring(buffers, adj); - // FIXME: use const auto& [color, united_buffers] when C++17 is available for (const auto& pair : color_groups) { const auto color = pair.first; - const auto united_buffers = pair.second; + const auto& united_buffers = pair.second; for (const auto& buffer : united_buffers) { buffer->set_id(color); } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index ebe5d162e7ba0a..4d90a97f79c5b2 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -529,15 +529,18 @@ TokenizeSnippets::TokenizeSnippets() { ResultVector body_results; std::vector>> subgraph_result_inputs; - ov::NodeVector new_body_ops; + ov::NodeVector ops_for_buffer_count; for (auto subgraph : input_subgraphs) { // we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs // because we will collapse them with our node and we should get total count const auto subgraph_ptr = ov::as_type_ptr(subgraph); hidden_data_count += subgraph_ptr->get_virtual_port_count(); + // Buffers can exist only in Subgraphs with domain sensitive ops which + // require intermediate memory for data repacking + // To avoid load time regressions, we verify only those Subgraphs with domain sensitive ops if (subgraph_ptr->has_domain_sensitive_ops()) { const auto ops = subgraph_ptr->body_ptr()->get_ordered_ops(); - new_body_ops.insert(new_body_ops.end(), ops.begin(), 
ops.end()); + ops_for_buffer_count.insert(ops_for_buffer_count.end(), ops.begin(), ops.end()); } for (auto output : subgraph->outputs()) { @@ -570,7 +573,7 @@ TokenizeSnippets::TokenizeSnippets() { } if (op::Subgraph::is_domain_sensitive_op(node)) { - new_body_ops.push_back(node); + ops_for_buffer_count.push_back(node); } for (auto output : node->outputs()) { @@ -583,7 +586,7 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(new_body_ops); + const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(ops_for_buffer_count); if (body_parameters.size() + body_results.size() + hidden_data_count + unique_buffer_count > 12) { const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index 25e2cc397985c0..1126029b042a8e 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -18,11 +18,9 @@ namespace { auto is_supported_tensor(const ngraph::descriptor::Tensor& t) -> bool { - // TODO: Add support of non-4D tensors return t.get_partial_shape().is_static() && t.get_shape().size() == 4; } -// TODO: Add support of Reshape? 
auto is_supported_intermediate_op(const std::shared_ptr& node) -> bool { const auto is_intermediate_op = [](const std::shared_ptr& node) { return ngraph::is_type(node) || @@ -128,7 +126,6 @@ auto get_potential_body_params(const std::shared_ptr& op) -> size_t { auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ngraph::NodeVector& ordered_ops, size_t& hidden_virtual_ports_count, size_t& potential_body_params_count) -> bool { - // TODO: Add Reshape support while (is_supported_intermediate_op(interm_op)) { // All supported intermediate ops have only one output port if (interm_op->get_output_target_inputs(0).size() != 1) @@ -334,8 +331,7 @@ ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { } potential_body_params_count += get_potential_body_params(parent); ordered_ops.insert(ordered_ops.begin(), parent); - // We think that sequence of ops goes through input port 0 - // But can be Select here? If it can be, parent shouldn't be on input port 0. Need another way? + // [107731] To go always through 0-th port - is it safe? 
parent = parent->get_input_node_shared_ptr(0); } @@ -369,8 +365,6 @@ ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { } } - // TODO: Add Reshape Support for all Transposes - // Add 3D support for all Transposes const auto transpose0 = ngraph::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); if (is_valid_transpose(transpose0, {0, 2, 1, 3})) { ordered_ops.insert(ordered_ops.begin(), transpose0); @@ -399,8 +393,8 @@ ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { } potential_body_params_count += get_potential_body_params(child); - // TODO: move this plugin-specific constraint to the plugin callback - // We cannot collapse op to Subgraph if count of potential Parameter and Result count is higher 12 + // [75567]: move this plugin-specific constraint to the plugin callback + // We cannot collapse op to Subgraph if count of potential Parameter and Result count is higher 12 if (potential_body_params_count + child->get_output_target_inputs(0).size() + hidden_virtual_ports_count + buffer_count > 12) { break; } @@ -409,7 +403,7 @@ ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { child = child->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); } - // TODO: Add full support of Transpose to cover cases where there are nodes between MatMul2 and Transpose3: + // At the moment Snippets don't support nodes between MatMul2 and Transpose3 due to Loop and strided calculations limitations // MatMul2 // // Transpose3 @@ -427,7 +421,7 @@ ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { /* ====== Subgraph creation ======= */ - // TODO: move this plugin-specific constraint to the plugin callback + // [75567]: move this plugin-specific constraint to the plugin callback const auto last_node = ordered_ops.back(); if (potential_body_params_count + last_node->get_output_size() + hidden_virtual_ports_count + buffer_count > 12) { return false; diff --git 
a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index a9169d0b52bc7d..43fda9bf83e4d1 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -144,7 +144,7 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: if (const auto buffer = ov::as_type_ptr(op)) unique_buffers.insert(buffer->get_id()); } - num_unqiue_buffer = unique_buffers.size(); + num_unique_buffers = unique_buffers.size(); NodeVector io_nodes; std::copy(params.begin(), params.end(), std::back_inserter(io_nodes)); std::copy(results.begin(), results.end(), std::back_inserter(io_nodes)); @@ -220,16 +220,16 @@ void KernelEmitter::validate_arguments(const std::vector &in, IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 0, got " << in.size(); if (!out.empty()) IE_THROW() << "KernelEmitter got invalid number of outputs. 
Expected 0, got " << out.size(); - const auto num_params = num_inputs + num_outputs + num_unqiue_buffer; + const auto num_params = num_inputs + num_outputs + num_unique_buffers; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount if (data_ptr_regs_idx.size() != num_params) IE_THROW() << "KernelEmitter: number of inputs and outputs is inconsisnent with the number of allocated registers " << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, size_t num_unqiue_buffer, - const Xbyak::Reg64& reg_indexes, const Xbyak::Reg64& reg_const_params, +void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xbyak::Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + const auto num_params = num_inputs + num_outputs; // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter const size_t offset_rank = jcp.master_shape.size() - 1; //const size_t tile_rank = jcp.tile_rank; @@ -292,7 +292,9 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, siz // Vector "data_ptr_regs" is sorted by abstract regs. // It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" - for (size_t i = 0; i < num_unqiue_buffer; ++i) { + // NOTE: Snippets Buffer Scratchpad has the common data pointer for all Buffers (even with different ID). 
+ // The accessing memory is covered by correct offsets in each Buffer and the corresponding MemoryAccess ops + for (size_t i = 0; i < num_unique_buffers; ++i) { h->mov(data_ptr_regs[num_params + i], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad_ptr)]); } size_t i = 0; @@ -324,7 +326,7 @@ void KernelEmitter::emit_impl(const std::vector& in, std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, num_unqiue_buffer, reg_indexes, reg_const_params, data_ptr_regs); + init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index 9befa79eda2470..f7ca20a8314d62 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -84,13 +84,13 @@ class KernelEmitter : public jit_container_emitter { void validate_arguments(const std::vector &in, const std::vector &out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; - void init_data_pointers(size_t, size_t, size_t, const Xbyak::Reg64&, const Xbyak::Reg64&, const std::vector&) const; + void init_data_pointers(const Xbyak::Reg64&, const Xbyak::Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; size_t num_inputs; size_t num_outputs; - size_t num_unqiue_buffer; + size_t num_unique_buffers; // Vector of indices (lenght = input tensor rank) per every input and output that describes in which order // corresponding tensor dimensions are accessed (default: consecutive dense, e.g. 0,1,2,3 for 4D tensor). // Needed to calc i/o offsets. 
diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 7afcd4f36650a8..011a77a0311350 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ -427,7 +427,7 @@ void MarkSubgraphOpAsSkipped(const std::shared_ptr &node) { bool isSuitableConvert(const std::shared_ptr& node) { if (!ov::is_type(node)) return false; - auto hasResult = [](const std::shared_ptr &node) { + auto hasResult = [](const std::shared_ptr& node) { auto consumers = node->output(0).get_target_inputs(); bool findResult = false; if (consumers.size() == 1) { diff --git a/src/plugins/intel_cpu/src/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformation_pipeline.cpp index 67d3152ec21703..2cbcfe9703e5d6 100644 --- a/src/plugins/intel_cpu/src/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformation_pipeline.cpp @@ -571,7 +571,8 @@ void Transformations::MainSnippets(void) { snippetsManager.get_pass_config()->disable(); } - auto is_supported_matmul = [](const std::shared_ptr& matmul) { + auto is_supported_matmul = [](const std::shared_ptr& n) { + const auto matmul = ov::as_type_ptr(n); if (!matmul) return false; if (matmul->get_input_element_type(1) == ov::element::i8) @@ -584,10 +585,10 @@ void Transformations::MainSnippets(void) { if (snippetsMode != Config::SnippetsMode::IgnoreCallback) { snippetsManager.get_pass_config()->set_callback( - [this, is_supported_matmul](const std::shared_ptr& n) -> bool { + [&, is_supported_matmul](const std::shared_ptr& n) -> bool { if (this->enableLpt) { // Tranformation callback is called on MatMul1 - if (!is_supported_matmul(ov::as_type_ptr(n))) + if (!is_supported_matmul(n)) return true; // Search for MatMul0 auto parent = n->get_input_node_shared_ptr(0); diff --git 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp index 755dbf9aa81b4e..10860be56dcfb4 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp @@ -26,9 +26,9 @@ class MatMulFunction : public SnippetsFunctionBase { explicit MatMulFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); - verify_precisions(precisions); + validate_precisions(precisions); } - static void verify_precisions(const std::vector& precisions) { + static void validate_precisions(const std::vector& precisions) { NGRAPH_CHECK(precisions.size() == 2, "Got invalid number of input element types"); const bool is_f32 = ngraph::snippets::utils::everyone_is(element::f32, precisions[0], precisions[1]); const bool is_int8 = ngraph::snippets::utils::one_of(precisions[0], element::i8, element::u8) && precisions[1] == element::i8; @@ -62,7 +62,7 @@ class MatMulBiasFunction : public SnippetsFunctionBase { explicit MatMulBiasFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); - MatMulFunction::verify_precisions(precisions); + MatMulFunction::validate_precisions(precisions); } protected: std::shared_ptr initOriginal() const override; @@ -78,7 +78,7 @@ class MatMulBiasQuantizedFunction : public SnippetsFunctionBase { explicit MatMulBiasQuantizedFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); - 
MatMulFunction::verify_precisions(precisions); + MatMulFunction::validate_precisions(precisions); } protected: std::shared_ptr initOriginal() const override; @@ -96,7 +96,7 @@ class MatMulsQuantizedFunction : public SnippetsFunctionBase { explicit MatMulsQuantizedFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); - MatMulFunction::verify_precisions(precisions); + MatMulFunction::validate_precisions(precisions); } protected: std::shared_ptr initOriginal() const override; @@ -120,7 +120,7 @@ class Transpose0213MatMulFunction : public SnippetsFunctionBase { NGRAPH_CHECK(input_shapes[0].rank().get_length() == 4 && input_shapes[1].rank().get_length() == 4, "Only rank 4 input shapes are supported by this test"); NGRAPH_CHECK(transpose_position >=0 && transpose_position <= 2, "Got invalid transpose position"); - MatMulFunction::verify_precisions(precisions); + MatMulFunction::validate_precisions(precisions); } protected: std::shared_ptr initOriginal() const override; @@ -165,7 +165,7 @@ class MatMulsQuantizedSoftmaxFunction : public SnippetsFunctionBase { explicit MatMulsQuantizedSoftmaxFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); - MatMulFunction::verify_precisions(precisions); + MatMulFunction::validate_precisions(precisions); } protected: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp index ba078ced4cf0a3..63baf19710172d 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp @@ -46,6 +46,7 @@ class MHAFunction : public SnippetsFunctionBase { explicit MHAFunction(const std::vector& inputShapes, const std::vector& precisions, bool with_mul = true) : SnippetsFunctionBase(inputShapes), with_mul(with_mul), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + NGRAPH_CHECK(precisions.size() == 4, "Got invalid number of input precisions"); } protected: std::shared_ptr initOriginal() const override; @@ -75,6 +76,7 @@ class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + NGRAPH_CHECK(precisions.size() == 4, "Got invalid number of input precisions"); } protected: std::shared_ptr initOriginal() const override; @@ -103,6 +105,7 @@ class MHASelectFunction : public SnippetsFunctionBase { explicit MHASelectFunction(const std::vector& inputShapes, const std::vector& precisions) : SnippetsFunctionBase(inputShapes), precisions(precisions) { NGRAPH_CHECK(input_shapes.size() == 6, "Got invalid number of input shapes"); + NGRAPH_CHECK(precisions.size() == 6, "Got invalid number of input precisions"); } protected: std::shared_ptr initOriginal() const override;