diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp
index a1e0575631b890..1963c61c1944c7 100644
--- a/src/common/snippets/include/snippets/op/buffer.hpp
+++ b/src/common/snippets/include/snippets/op/buffer.hpp
@@ -13,15 +13,15 @@ namespace op {
 /**
  * @interface Buffer
  * @brief The operation is for intermediate data storage
- *        Note: All buffers in a graph have the memory pointer. So if we have a few buffers,
- *        each buffer should have an own offset for common memory
+ *        Note: All buffers in a graph share the same memory pointer. So if we have a few buffers,
+ *        each buffer should have its own offset into the common memory
  * @ingroup snippets
  */
 class Buffer : public ngraph::op::Op {
 public:
     OPENVINO_OP("Buffer", "SnippetsOpset");

-    Buffer(const Output<Node>& x, const size_t offset = 0);
+    Buffer(const Output<Node>& x);
     Buffer() = default;

     size_t get_offset() const { return m_offset; }
@@ -36,7 +36,7 @@ class Buffer : public ngraph::op::Op {
     void validate_and_infer_types() override;

 private:
-    size_t m_offset;
+    size_t m_offset = 0lu;
 };

 }  // namespace op
diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp
index be38d1c94d43da..91aceb49d1e360 100644
--- a/src/common/snippets/include/snippets/op/fill.hpp
+++ b/src/common/snippets/include/snippets/op/fill.hpp
@@ -15,28 +15,31 @@ namespace op {
 /**
  * @interface Fill
  * @brief Generated in Tail Loop vector representation in code generation step for cases when we should
- *        refill regsiters by special numbers.
+ *        refill registers by special numbers.
  *        For example, for cases with ReduceMax or ReduceSum in Softmax
+ *        Where:
+ *          - offset - the element index starting from which the filling is performed
+ *          - fill_value - the filling value as a 32-bit hexadecimal bit pattern
  * @ingroup snippets
  */
 class Fill : public ngraph::op::Op {
 public:
     OPENVINO_OP("Fill", "SnippetsOpset");

-    Fill(const Output<Node>& x, const int64_t offset, const std::string fill_value = "zero");
+    Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value = 0x0);
     Fill() = default;

-    int64_t get_offset() const { return m_offset; }
-    std::string get_fill_value() const { return m_fill_value; }
+    size_t get_offset() const { return m_offset; }
+    uint32_t get_fill_value() const { return m_fill_value; }

     void set_offset(const size_t offset) { m_offset = offset; }
-    void set_fill_value(const std::string fill_value) { m_fill_value = fill_value; }
+    void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; }

     bool visit_attributes(AttributeVisitor& visitor) override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
     void validate_and_infer_types() override;

 protected:
-    int64_t m_offset = 0lu;
-    std::string m_fill_value = "zero";
+    size_t m_offset = 0lu;
+    uint32_t m_fill_value = 0x0;
 };

 }  // namespace op
diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp
index fc51e856ecda60..f8d009e9c4270c 100644
--- a/src/common/snippets/include/snippets/op/load.hpp
+++ b/src/common/snippets/include/snippets/op/load.hpp
@@ -12,9 +12,9 @@ namespace op {
 /**
  * @interface Load
- * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading
+ * @brief Generated during the Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading
  *        where number of elements to load is determined by "count" (Default value is "1" - to load one element)
- *        and memeory offset for loading is determined by "offset" (Default value is "0" - to load starting first element)
+ *        and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element)
  * @ingroup snippets
  */
 class Load : public ngraph::op::Op {
diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp
index e92141af89e5a1..3a4573df272fa5 100644
--- a/src/common/snippets/include/snippets/op/loop.hpp
+++ b/src/common/snippets/include/snippets/op/loop.hpp
@@ -71,10 +71,10 @@ class LoopEnd : public LoopBase {
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
     const std::vector<int64_t>& get_finalization_offsets() const;
     const std::vector<bool>& get_apply_increment() const;
-    const std::vector<bool>& get_forse_finalization_offsets() const;
+    const std::vector<bool>& get_force_finalization_offsets() const;
     void set_finalization_offsets(std::vector<int64_t> offsets);
     void set_apply_increment(std::vector<bool> apply_increment);
-    void set_forse_finalization_offsets(std::vector<bool> forse_finalizations_offsets);
+    void set_force_finalization_offsets(std::vector<bool> force_finalization_offsets);
     void set_work_amount(size_t new_work_amount);
     void set_increment(size_t new_increment);
     void set_evaluate_once(bool once);
@@ -86,7 +86,7 @@ class LoopEnd : public LoopBase {
     // pointer always should be reverted (for example, for buffers: we should store to buffer in first loop
     // and we should load the same data from this buffer in the next loop)
     // true by default, the optimizations enabled if it's false.
-    std::vector<bool> forse_finalization_offsets;
+    std::vector<bool> force_finalization_offsets;
     std::vector<int64_t> finalization_offsets;
     size_t loop_io_size;
 };
diff --git a/src/common/snippets/include/snippets/op/loop_helpers.hpp b/src/common/snippets/include/snippets/op/loop_helpers.hpp
index 92d8b50e3d5bf9..b8e6f7abaee9f8 100644
--- a/src/common/snippets/include/snippets/op/loop_helpers.hpp
+++ b/src/common/snippets/include/snippets/op/loop_helpers.hpp
@@ -42,7 +42,7 @@ std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
                                                    size_t dimension, size_t work_amount, size_t increment,
                                                    std::vector<bool> apply_increment = {},
                                                    std::vector<int64_t> finalization_offsets = {},
-                                                   std::vector<bool> forse_finalization_offsets = {});
+                                                   std::vector<bool> force_finalization_offsets = {});

 template <typename T, typename ...Args>
 std::shared_ptr<LoopEnd> insertLoopEnd(const T& beforeTheseNodes, Args ...args) {
diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp
index ae04d7c769a358..c4bf0185b6a58e 100644
--- a/src/common/snippets/include/snippets/op/store.hpp
+++ b/src/common/snippets/include/snippets/op/store.hpp
@@ -12,9 +12,9 @@ namespace op {
 /**
  * @interface Store
- * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing
+ * @brief Generated during the Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing
  *        where number of elements to store is determined by "count" (Default value is "1" - to store one element)
- *        and memeory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr)
+ *        and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at the start memory ptr)
  * @ingroup snippets
  */
 class Store : public ngraph::op::Op {
diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp
index 700d36b9e7ad78..9d93e4c01577bf 100644
--- a/src/common/snippets/include/snippets/op/vector_buffer.hpp
+++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp
@@ -19,11 +19,14 @@ class VectorBuffer : public ngraph::op::Op {
 public:
     OPENVINO_OP("VectorBuffer", "SnippetsOpset");

-    VectorBuffer();
+    VectorBuffer(const ov::element::Type element_type = ov::element::f32);

     bool visit_attributes(AttributeVisitor& visitor) override { return true;}
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
     void validate_and_infer_types() override;
+
+private:
+    ov::element::Type m_element_type;
 };

 }  // namespace op
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index 702e2ff84737f3..107df29d284b65 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -53,7 +53,7 @@ auto tail_transformations(NodeVector& tail, const size_t tail_size, const bool n
             auto& rt = input.get_rt_info();
             auto fill_rt = rt.find("set_fill");
             if (fill_rt != rt.end()) {
-                const std::string fill_value = fill_rt->second.as<std::string>();
+                const auto fill_value = fill_rt->second.as<uint32_t>();
                 fill = std::make_shared<ngraph::snippets::op::Fill>(input.get_source_output(), tail_size, fill_value);
                 input.get_node()->set_argument(input.get_index(), fill);
                 // we should explicitly copy reg info because we insert Fill after assign register
@@ -141,10 +141,10 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov::Model>& m,
             loop->set_evaluate_once(true);
             const auto increment = loop->get_increment();
             const auto& apply_increments = loop->get_apply_increment();
-            const auto& forse_finalization_offsets = loop->get_forse_finalization_offsets();
+            const auto& force_finalization_offsets = loop->get_force_finalization_offsets();
             std::vector<int64_t> new_finalization_offsets(loop->get_finalization_offsets());
             for (auto i = 0; i < new_finalization_offsets.size(); i++) {
-                new_finalization_offsets[i] += increment * apply_increments[i] * (forse_finalization_offsets[i] || force_ptr_increment);
+                new_finalization_offsets[i] += increment * apply_increments[i] * (force_finalization_offsets[i] || force_ptr_increment);
             }
             loop->set_finalization_offsets(new_finalization_offsets);
             return true;
diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp
index e34c43b39dd18c..1c7a8f37920e7b 100644
--- a/src/common/snippets/src/op/buffer.cpp
+++ b/src/common/snippets/src/op/buffer.cpp
@@ -12,8 +12,7 @@
 using namespace std;
 using namespace ngraph;

-snippets::op::Buffer::Buffer(const Output<Node>& x, const size_t offset) :
-    Op({x}), m_offset(offset) {
+snippets::op::Buffer::Buffer(const Output<Node>& x) : Op({x}) {
     constructor_validate_and_infer_types();
 }

@@ -26,7 +25,9 @@ bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) {
 std::shared_ptr<Node> snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const {
     INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs);
     check_new_args_count(this, new_args);
-    return std::make_shared<Buffer>(new_args.at(0), m_offset);
+    auto new_buffer = std::make_shared<Buffer>(new_args.at(0));
+    new_buffer->set_offset(m_offset);
+    return new_buffer;
 }

 void snippets::op::Buffer::validate_and_infer_types() {
diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp
index 880b3ba2254ff9..a4be641f34e5a1 100644
--- a/src/common/snippets/src/op/fill.cpp
+++ b/src/common/snippets/src/op/fill.cpp
@@ -11,7 +11,7 @@
 using namespace std;
 using namespace ngraph;

-snippets::op::Fill::Fill(const Output<Node>& x, const int64_t offset, const std::string fill_value)
+snippets::op::Fill::Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value)
     : Op({x}), m_offset(offset), m_fill_value(fill_value) {
     constructor_validate_and_infer_types();
 }
diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp
index 6f4617fa7d672a..f705a4a751fe42 100644
--- a/src/common/snippets/src/op/loop.cpp
+++ b/src/common/snippets/src/op/loop.cpp
@@ -86,14 +86,14 @@ std::shared_ptr<LoopEnd> LoopBegin::get_loop_end() {
 }

 LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment,
-                 std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets, std::vector<bool> forse_finalization_offsets)
+                 std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets, std::vector<bool> force_finalization_offsets)
         : LoopBase(args, dimension, work_amount, increment), apply_increment(std::move(apply_increment)),
-          finalization_offsets(std::move(finalization_offsets)), forse_finalization_offsets(std::move(forse_finalization_offsets)) {
+          finalization_offsets(std::move(finalization_offsets)), force_finalization_offsets(std::move(force_finalization_offsets)) {
     constructor_validate_and_infer_types();
 }

 std::shared_ptr<Node> LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const {
-    return std::make_shared<LoopEnd>(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets, forse_finalization_offsets);
+    return std::make_shared<LoopEnd>(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets, force_finalization_offsets);
 }

 std::shared_ptr<LoopBegin> LoopEnd::get_loop_begin() {
@@ -111,8 +111,8 @@ const std::vector<bool>& LoopEnd::get_apply_increment() const {
     return apply_increment;
 }

-const std::vector<bool>& LoopEnd::get_forse_finalization_offsets() const {
-    return forse_finalization_offsets;
+const std::vector<bool>& LoopEnd::get_force_finalization_offsets() const {
+    return force_finalization_offsets;
 }

 void LoopEnd::set_finalization_offsets(std::vector<int64_t> offsets) {
@@ -127,10 +127,10 @@ void LoopEnd::set_apply_increment(std::vector<bool> allow_increment) {
     apply_increment = std::move(allow_increment);
 }

-void LoopEnd::set_forse_finalization_offsets(std::vector<bool> forse_finalization_offsets) {
-    if (forse_finalization_offsets.size() != loop_io_size)
-        throw std::invalid_argument("LoopEnd set_forse_finalization_offsets is called with inconsistent forse_finalization_offsets.size()");
-    forse_finalization_offsets = std::move(forse_finalization_offsets);
+void LoopEnd::set_force_finalization_offsets(std::vector<bool> force_finalization_offsets) {
+    if (force_finalization_offsets.size() != loop_io_size)
+        throw std::invalid_argument("LoopEnd set_force_finalization_offsets is called with inconsistent force_finalization_offsets.size()");
+    this->force_finalization_offsets = std::move(force_finalization_offsets);
 }

 void LoopEnd::set_work_amount(size_t new_work_amount) {
@@ -163,15 +163,15 @@ void LoopEnd::validate_and_infer_types() {
     NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size,
                           "finalization_offsets must be either empty or defined per every input & output of joined Loop. Expected size: ",
                           loop_io_size, " got ", finalization_offsets.size());
-    NODE_VALIDATION_CHECK(this, forse_finalization_offsets.empty() || forse_finalization_offsets.size() == loop_io_size,
-                          "forse_finalization_offsets must be either empty or defined per every input & output of joined Loop. Expected size: ",
-                          loop_io_size, " got ", forse_finalization_offsets.size());
+    NODE_VALIDATION_CHECK(this, force_finalization_offsets.empty() || force_finalization_offsets.size() == loop_io_size,
+                          "force_finalization_offsets must be either empty or defined per every input & output of joined Loop. Expected size: ",
+                          loop_io_size, " got ", force_finalization_offsets.size());
     if (apply_increment.empty())
         apply_increment.resize(loop_io_size, true);
     if (finalization_offsets.empty())
         finalization_offsets.resize(loop_io_size, 0);
-    if (forse_finalization_offsets.empty())
-        forse_finalization_offsets.resize(loop_io_size, true);
+    if (force_finalization_offsets.empty())
+        force_finalization_offsets.resize(loop_io_size, true);
     set_output_size(num_inputs - 1);
     const auto& ins = inputs();
     // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd
diff --git a/src/common/snippets/src/op/loop_helpers.cpp b/src/common/snippets/src/op/loop_helpers.cpp
index 64258e0bbd724e..79d14a4854a488 100644
--- a/src/common/snippets/src/op/loop_helpers.cpp
+++ b/src/common/snippets/src/op/loop_helpers.cpp
@@ -29,14 +29,14 @@ std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
                                                    size_t dimension, size_t work_amount, size_t increment,
                                                    std::vector<bool> apply_increment,
                                                    std::vector<int64_t> finalization_offsets,
-                                                   std::vector<bool> forse_finalization_offsets) {
+                                                   std::vector<bool> force_finalization_offsets) {
     OutputVector originalParentOutputs;
     for (const auto& in : originalInputs) {
         originalParentOutputs.push_back(in.get_source_output());
     }
     originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1));
     auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, dimension, work_amount, increment,
-                                              std::move(apply_increment), std::move(finalization_offsets), std::move(forse_finalization_offsets));
+                                              std::move(apply_increment), std::move(finalization_offsets), std::move(force_finalization_offsets));

     for (int i = 0; i < originalInputs.size(); i++) {
         originalInputs[i].replace_source_output(loop_end->output(i));
diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp
index 82cd60e55aec16..1be69a6d9ad678 100644
--- a/src/common/snippets/src/op/vector_buffer.cpp
+++ b/src/common/snippets/src/op/vector_buffer.cpp
@@ -11,17 +11,17 @@
 using namespace std;
 using namespace ngraph;

-snippets::op::VectorBuffer::VectorBuffer() : Op() {
+snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) {
     constructor_validate_and_infer_types();
 }

 std::shared_ptr<Node> snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const {
     INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs);
     check_new_args_count(this, new_args);
-    return std::make_shared<VectorBuffer>();
+    return std::make_shared<VectorBuffer>(m_element_type);
 }

 void snippets::op::VectorBuffer::validate_and_infer_types() {
     INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types);
-    set_output_type(0, ov::element::f32, Shape{1lu});
+    set_output_type(0, m_element_type, Shape{1lu});
 }
diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp
index f490b2f469fd73..136a861a584833 100644
--- a/src/common/snippets/src/pass/insert_buffer.cpp
+++ b/src/common/snippets/src/pass/insert_buffer.cpp
@@ -25,7 +25,7 @@ ngraph::snippets::pass::InsertBuffer::InsertBuffer() {
         bool rewritten = false;

         // check if already has Buffer, Parameter or Constant as an input
-        for (auto input : root->inputs()) {
+        for (const auto& input : root->inputs()) {
             const auto input_node = input.get_source_output().get_node()->shared_from_this();
             if (!ov::is_type<ngraph::snippets::op::Buffer>(input_node) &&
                 !ov::is_type<ngraph::op::Parameter>(input_node) &&
@@ -38,10 +38,10 @@ ngraph::snippets::pass::InsertBuffer::InsertBuffer() {
         }

         // check if already has Buffer or outputs is Result
-        for (auto output : root->outputs()) {
+        for (const auto& output : root->outputs()) {
             const auto target_inputs = output.get_target_inputs();
             if (target_inputs.size() > 1) {
-                for (auto consumer : target_inputs) {
+                for (const auto& consumer : target_inputs) {
                     const auto output_node = consumer.get_node()->shared_from_this();
                     if (ov::is_type<ngraph::snippets::op::Buffer>(output_node)) {
                         // If some of children from one common port are different Buffers,
@@ -63,7 +63,7 @@ ngraph::snippets::pass::InsertBuffer::InsertBuffer() {
             }

             const auto buffer = std::make_shared<ngraph::snippets::op::Buffer>(output);
-            for (auto consumer : output.get_target_inputs()) {
+            for (const auto& consumer : output.get_target_inputs()) {
                 const auto output_node = consumer.get_node()->shared_from_this();
                 if (output_node != buffer &&
                     !ov::is_type<ngraph::op::Result>(output_node) &&
diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp
index 439e482fe21814..8f6158da7e0623 100644
--- a/src/common/snippets/src/pass/insert_loops.cpp
+++ b/src/common/snippets/src/pass/insert_loops.cpp
@@ -32,7 +32,7 @@ std::vector<int64_t> calculate_finalization_offsets(const size_t outer_dim, const size_t inner_dim,
 }

-// Forse finalization offsets increment if there is outer dimensions to enable scalar vs vector loop optimizations
-std::vector<bool> calculate_forse_finalization_offsets(size_t outer_work_amount, size_t io_count) {
+// Force finalization offsets increment if there are outer dimensions to enable scalar vs vector loop optimizations
+std::vector<bool> calculate_force_finalization_offsets(size_t outer_work_amount, size_t io_count) {
     return std::vector<bool>(io_count, outer_work_amount > 1);
 }

@@ -61,7 +61,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
     if (outer_work_amount > 1) {
         inner_finalization_offsets = calculate_finalization_offsets(outer_dim, inner_dim, master_shape, body_shapes);
     }
-    auto inner_forse_finalization_offsets = calculate_forse_finalization_offsets(outer_work_amount, count_io);
+    auto inner_force_finalization_offsets = calculate_force_finalization_offsets(outer_work_amount, count_io);
     // We should reset Buffer ptr after data storing
     // If there isn't outer_work_amount for buffer, we should reset this ptr for inner loop
    // otherwise we should reset it for outer loop
@@ -70,7 +70,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
             const auto idx = count_io - 1 - i;
             inner_finalization_offsets[idx] = outer_work_amount == 1 && body_shapes[idx].get_shape().back() != 1 ?
                 -static_cast<int64_t>(inner_work_amount) : inner_finalization_offsets[idx];
-            inner_forse_finalization_offsets[idx] = true;
+            inner_force_finalization_offsets[idx] = true;
         }
     }
     // Moreover, if there are many Buffers on I/O we should remember that all Buffer have the register
@@ -89,7 +89,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
         if (there_is_buffer) {
             apply_increments[i] = false;
             inner_finalization_offsets[i] = 0;
-            inner_forse_finalization_offsets[i] = false;
+            inner_force_finalization_offsets[i] = false;
         } else {
             there_is_buffer = true;
         }
@@ -99,7 +99,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
     const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBeginAfterOutputs(body_parameters);
     const auto& inner_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs(
         reverse_body_results, inner_loop_begin, inner_dim, inner_work_amount, vector_size,
-        apply_increments, inner_finalization_offsets, inner_forse_finalization_offsets);
+        apply_increments, inner_finalization_offsets, inner_force_finalization_offsets);
     // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
     // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
     // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
@@ -123,11 +123,11 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
     // If there isn't outer_work_amount for buffer, we should reset this ptr for inner loop
     // otherwise we should reset it for outer loop
     std::vector<int64_t> outer_finalization_offsets(body_shapes.size(), 0);
-    std::vector<bool> outer_forse_finalization_offsets(body_shapes.size(), false);
+    std::vector<bool> outer_force_finalization_offsets(body_shapes.size(), false);
     for (size_t i = 0; i < body_results.size(); ++i) {
         if (ov::is_type<ngraph::snippets::op::Buffer>(body_results[i].get_node()) && body_results[i].get_shape()[outer_dim] > 1) {
             outer_finalization_offsets[count_io - i - 1] = -1 * (body_results[i].get_shape()[outer_dim] * body_results[i].get_shape()[inner_dim]);
-            outer_forse_finalization_offsets[count_io - i - 1] = true;
+            outer_force_finalization_offsets[count_io - i - 1] = true;
         }
     }
     bool there_is_buffer = false;
@@ -136,7 +136,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
         if (there_is_buffer) {
             apply_increments[i] = false;
             outer_finalization_offsets[i] = 0;
-            outer_forse_finalization_offsets[i] = false;
+            outer_force_finalization_offsets[i] = false;
         } else {
             there_is_buffer = true;
         }
@@ -145,7 +145,7 @@ void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape,
     const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(body_parameters);
     ngraph::snippets::op::insertLoopEnd(body_results, outer_loop_begin, outer_dim, outer_work_amount, 1,
-                                        apply_increments, outer_finalization_offsets, outer_forse_finalization_offsets);
+                                        apply_increments, outer_finalization_offsets, outer_force_finalization_offsets);
 }
 };
@@ -266,10 +266,10 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov::Model>& model)
         if (outer_work_amount > 1) {
             inner_finalization_offsets = calculate_finalization_offsets(outer_dim, inner_dim, master_shape, ioShapes);
         }
-        const auto& forse_finalization_offsets = calculate_forse_finalization_offsets(outer_work_amount, ioShapes.size());
+        const auto& force_finalization_offsets = calculate_force_finalization_offsets(outer_work_amount, ioShapes.size());
         const auto& inner_loop_begin = op::insertLoopBegin(commonParams);
         const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount, vector_size, apply_increments,
-                                                   inner_finalization_offsets, forse_finalization_offsets);
+                                                   inner_finalization_offsets, force_finalization_offsets);
         // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
         // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
         // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp
index 5b9ff71a684d68..f42bc06844262d 100644
--- a/src/common/snippets/src/pass/insert_movebroadcast.cpp
+++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp
@@ -31,7 +31,7 @@ std::shared_ptr<ngraph::Node> broadcast_node_last_dim(const ngraph::Output<ngraph::Node>& value,
         broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, broadcasted_shape);
-        // BroadcastMove should be immediately executed after broadcasted node.
+        // BroadcastMove should be executed immediately after its input op (the node whose output is being broadcasted).
         // For example, to execute Broadcast outside of a Loop
         // We transfer control dependents and copy rt info
         broadcasted_node->add_node_control_dependents(value.get_node_shared_ptr());
         ov::copy_runtime_info(value.get_node_shared_ptr(), broadcasted_node);
diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp
index 4a3fd0fb61f4dd..b05b8bb926fe4b 100644
--- a/src/common/snippets/src/pass/softmax_decomposition.cpp
+++ b/src/common/snippets/src/pass/softmax_decomposition.cpp
@@ -91,9 +91,9 @@ ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t
     // we should always reset data ptr after this loop because in the next Loop this ptr is used
     const auto finalization_offsets_max = std::vector<int64_t>{
         calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]), 0, 0 };
-    const auto forse_finalization_offsets_max = std::vector<bool>{true, false, false};
+    const auto force_finalization_offsets_max = std::vector<bool>{true, false, false};
     const auto loop_max_end = std::make_shared<ngraph::snippets::op::LoopEnd>(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
-        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, forse_finalization_offsets_max);
+        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, force_finalization_offsets_max);

     const auto horizon_max = std::make_shared<ngraph::snippets::op::HorizonMax>(max);
@@ -115,17 +115,17 @@ ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t
     auto finalization_offsets_sum = std::vector<int64_t>{
         has_outer_loop ? calculate_finalization_offsets(inner_master_work_amount, load_sub->get_shape()) : 0,
         calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]) };
-    auto forse_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
+    auto force_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
     // Softmax has Buffer and if input of Softmax is Buffer as well we have Loop with the same Buffer on Input and Output for ReduceSum
     // So we should increment and reset buffer ptr only once!
     if (input_is_buffer) {
         apply_increments_sum[0] = false;
         finalization_offsets_sum[0] = 0;
-        forse_finalization_offsets_sum[0] = false;
+        force_finalization_offsets_sum[0] = false;
     }
     const auto loop_sum_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_sum, finalization_offsets_sum, forse_finalization_offsets_sum);
+        apply_increments_sum, finalization_offsets_sum, force_finalization_offsets_sum);

     const auto horizon_sum = std::make_shared<ngraph::snippets::op::HorizonSum>(sum);
     const auto buffer_exp = std::make_shared<ngraph::snippets::op::Buffer>(loop_sum_end->output(0));
@@ -148,17 +148,17 @@ ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t
     auto finalization_offsets_div = std::vector<int64_t>{
         has_outer_loop ? calculate_finalization_offsets(inner_master_work_amount, load_div->get_shape()) : 0,
         has_outer_loop ? calculate_finalization_offsets(inner_master_work_amount, store_div->get_shape()) : 0 };
-    auto forse_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
+    auto force_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
     // Before Loop with Div there is Buffer so if Softmax child is Buffer as well, we should increment buffer pointer only once
     // because Buffers have one common register
     if (output_is_buffer) {
         apply_increments_div[0] = false;
-        finalization_offsets_sum[0] = 0;
-        forse_finalization_offsets_div[0] = false;
+        finalization_offsets_div[0] = 0;
+        force_finalization_offsets_div[0] = false;
     }
     const auto loop_div_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_div, loop_div_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_div, finalization_offsets_div, forse_finalization_offsets_div);
+        apply_increments_div, finalization_offsets_div, force_finalization_offsets_div);

     /* =========================================== */
@@ -179,8 +179,8 @@ ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t
     // For tail loop we should fill input of Max by float min and
     // input of Sum by zero to avoid math incorrect calculations
-    max->input(0).get_rt_info()["set_fill"] = std::string("float_min");
-    sum->input(0).get_rt_info()["set_fill"] = std::string("zero");
+    max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff);
+    sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000);

     // These nodes should be executed outside loops
     ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp };
diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
index 15d8ac10499c05..2deb38dd4668bf 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp
@@ -820,6 +820,10 @@ FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
         IE_THROW() << "Fill emitter expects Fill op from Snippets opset";
     }

+    if (fill->get_element_type().size() != 4) {
+        IE_THROW() << "Fill emitter supports only 4-byte element types but got: " << fill->get_element_type();
+    }
+
     offset = fill->get_offset();
     fill_value = fill->get_fill_value();
     prepare_table();
@@ -859,7 +863,7 @@ void FillEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out)
         h->mov(Reg64(aux_gpr_idxs[0]), tail_mask);
         h->kmovq(k_mask, Reg64(aux_gpr_idxs[0]));
-        h->vblendmps(dst_vmm | k_mask, src_vmm, table_val(fill_value));
+        h->vblendmps(dst_vmm | k_mask, src_vmm, table_val("value"));
     } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) {
         uint8 imm = 1;
         imm = ~((imm << offset) - imm);  // shift load_num bit
@@ -867,20 +871,14 @@ void FillEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out)
             h->uni_vmovups(dst_vmm, src_vmm);
             src_vmm = Vmm(dst_vmm.getIdx());
         }
-        h->uni_vblendps(dst_vmm, src_vmm, table_val(fill_value), imm);
+        h->uni_vblendps(dst_vmm, src_vmm, table_val("value"), imm);
     } else {
         IE_THROW() << "Fill emitter doesn't support " << host_isa_;
     }
 }

 void FillEmitter::register_table_entries() {
-    push_arg_entry_of("zero", 0x00000000, true);
-    push_arg_entry_of("int_one", 0x00000001, true);
-    push_arg_entry_of("float_one", 0x3f800000, true);
-    push_arg_entry_of("int32_min", 0xcf000000, true);
-    push_arg_entry_of("float_min", 0xff7fffff, true);
-    push_arg_entry_of("int32_max", 0x4effffff, true);
-    push_arg_entry_of("float_max", 0x7f7fffff, true);
+    push_arg_entry_of("value", fill_value, true);
 }

 }  // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
index ffbc14931a44fd..a6c4b058f6455b 100644
--- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp
@@ -433,7 +433,7 @@ class FillEmitter : public jit_emitter {
     void register_table_entries() override;

     size_t offset = 0;
-    std::string fill_value;
+    uint32_t fill_value = 0x0;
 };

 }  // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index ad8761e8fd5863..bbb680f37da272 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -649,12 +649,11 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function> nGraphFunc,
     snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
         [_tokenizeSoftmaxSnippets](const std::shared_ptr<const ov::Node>& n) -> bool {
-            // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant
-            if (ov::is_type<ov::op::v4::Swish>(n)) {
-                if (n->inputs().size() > 1 && !ov::is_type<ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
-                    return true;
-            } else if (ov::is_type<ov::op::v1::Softmax>(n) || ov::is_type<ov::op::v8::Softmax>(n)) {
-                return !_tokenizeSoftmaxSnippets;
-            }
+            // CPU Plugin supports Swish in Subgraph via conversion to SwishCPU which assumes the second input to be constant
+            const bool is_unsupported_swish = ov::is_type<ov::op::v4::Swish>(n) && n->inputs().size() > 1 &&
+                                              !ov::is_type<ov::op::v0::Constant>(n->get_input_node_shared_ptr(1));
+            const bool is_disabled_softmax_tokenization =
+                (ov::is_type<ov::op::v1::Softmax>(n) || ov::is_type<ov::op::v8::Softmax>(n)) && !_tokenizeSoftmaxSnippets;
+
             const auto& inputs = n->inputs();
             // todo: clarify whether we can evaluate snippets on const paths
             const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
@@ -671,7 +670,7 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function> nGraphFunc,
             const auto& outputs = n->outputs();
             const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(),
                                                      [&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
-            return has_only_const_inputs || bad_input_rank || bad_output_rank;
+            return has_only_const_inputs || bad_input_rank || bad_output_rank || is_unsupported_swish || is_disabled_softmax_tokenization;
         });
     snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
     snippetsManager.run_passes(nGraphFunc);
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
index 6384ea7930dd04..36916a763b4704 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
@@ -134,9 +134,9 @@ std::shared_ptr<ov::Model> SoftmaxLoweredFunction::initLowered() const {
     std::vector<int64_t> finalization_offsets_max(3, 0);
     apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_max = std::vector<bool>{true, false, false};
+    const auto force_finalization_offsets_max = std::vector<bool>{true, false, false};
     const auto loop_max_end = std::make_shared<ngraph::snippets::op::LoopEnd>(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
-        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, forse_finalization_offsets_max);
+        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, force_finalization_offsets_max);

     std::shared_ptr<ov::Node> horizon_max = std::make_shared<ngraph::snippets::op::HorizonMax>(max);
     horizon_max->add_control_dependency(loop_max_end);
@@ -168,10 +168,10 @@ std::shared_ptr<ov::Model> SoftmaxLoweredFunction::initLowered() const {
     apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
+    const auto force_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
     const auto loop_sum_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_sum, finalization_offsets_sum, forse_finalization_offsets_sum);
+        apply_increments_sum, finalization_offsets_sum, force_finalization_offsets_sum);
     loop_sum_end->add_control_dependency(sum);

     const auto horizon_sum = std::make_shared<ngraph::snippets::op::HorizonSum>(sum);
@@ -205,10 +205,10 @@ std::shared_ptr<ov::Model> SoftmaxLoweredFunction::initLowered() const {
     apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
+    const auto force_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
     const auto loop_div_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_div, loop_div_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_div, finalization_offsets_div, forse_finalization_offsets_div);
+        apply_increments_div, finalization_offsets_div, force_finalization_offsets_div);

     loop_div_begin->add_control_dependency(pow);
     loop_div_begin->add_control_dependency(prev_pow);
@@ -268,9 +268,9 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
     finalization_offsets_add[0] = input_shapes[0].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_add[1] = input_shapes[1].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_add[2] = master_shape[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_add = std::vector<bool>{has_outer_loop, has_outer_loop, true};
+    const auto force_finalization_offsets_add = std::vector<bool>{has_outer_loop, has_outer_loop, true};
     auto loop_add_end = std::make_shared<ngraph::snippets::op::LoopEnd>(ngraph::OutputVector{store, loop_add_begin->output(2)},
-        dimension, work_amount, increment, apply_increments_add, finalization_offsets_add, forse_finalization_offsets_add);
+        dimension, work_amount, increment, apply_increments_add, finalization_offsets_add, force_finalization_offsets_add);

     /* =========================================== */
@@ -289,9 +289,9 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
     std::vector<int64_t> finalization_offsets_max(3, 0);
     apply_increments_max[0] = master_shape[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_max[0] = master_shape[outer_dim] == 1 && master_shape[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_max = std::vector<bool>{true, false, false};
+    const auto force_finalization_offsets_max = std::vector<bool>{true, false, false};
     const auto loop_max_end = std::make_shared<ngraph::snippets::op::LoopEnd>(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
-        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, forse_finalization_offsets_max);
+        dimension, work_amount, increment, apply_increments_max, finalization_offsets_max, force_finalization_offsets_max);

     std::shared_ptr<ov::Node> horizon_max = std::make_shared<ngraph::snippets::op::HorizonMax>(max);
     horizon_max->add_control_dependency(loop_max_end);
@@ -323,10 +323,10 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
     apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
+    const auto force_finalization_offsets_sum = std::vector<bool>{has_outer_loop, true};
     const auto loop_sum_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_sum, finalization_offsets_sum, forse_finalization_offsets_sum);
+        apply_increments_sum, finalization_offsets_sum, force_finalization_offsets_sum);
     loop_sum_end->add_control_dependency(sum);

     const auto horizon_sum = std::make_shared<ngraph::snippets::op::HorizonSum>(sum);
@@ -360,10 +360,10 @@ std::shared_ptr<ov::Model> AddSoftmaxLoweredFunction::initLowered() const {
     apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
     finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
     finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
-    const auto forse_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
+    const auto force_finalization_offsets_div = std::vector<bool>(2, has_outer_loop);
     const auto loop_div_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
         ngraph::OutputVector{store_div, loop_div_begin->output(1)}, dimension, work_amount, increment,
-        apply_increments_div, finalization_offsets_div, forse_finalization_offsets_div);
+        apply_increments_div, finalization_offsets_div, force_finalization_offsets_div);

     loop_div_begin->add_control_dependency(pow);
     loop_div_begin->add_control_dependency(prev_pow);