From 36ff257628a41fe59dddc0a939a9bd23bdd52256 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 29 Sep 2022 16:25:58 +0200 Subject: [PATCH] [Snippets] Added Softmax support --- .../snippets/include/snippets/op/buffer.hpp | 42 +++ .../snippets/include/snippets/op/fill.hpp | 44 +++ .../include/snippets/op/horizon_max.hpp | 33 ++ .../include/snippets/op/horizon_sum.hpp | 32 ++ .../snippets/include/snippets/op/load.hpp | 6 +- .../snippets/include/snippets/op/store.hpp | 5 +- .../snippets/include/snippets/op/subgraph.hpp | 22 +- .../include/snippets/op/vector_buffer.hpp | 31 ++ .../snippets/pass/set_buffer_offset.hpp | 31 ++ .../snippets/pass/softmax_decomposition.hpp | 26 ++ .../include/snippets/snippets_isa.hpp | 5 + src/common/snippets/src/generator.cpp | 80 +++-- src/common/snippets/src/op/buffer.cpp | 72 +++++ src/common/snippets/src/op/fill.cpp | 34 +++ src/common/snippets/src/op/horizon_max.cpp | 26 ++ src/common/snippets/src/op/horizon_sum.cpp | 26 ++ src/common/snippets/src/op/load.cpp | 6 +- src/common/snippets/src/op/store.cpp | 6 +- src/common/snippets/src/op/subgraph.cpp | 37 ++- src/common/snippets/src/op/vector_buffer.cpp | 26 ++ .../snippets/src/pass/assign_registers.cpp | 66 +++- .../snippets/src/pass/collapse_subgraph.cpp | 42 ++- .../snippets/src/pass/insert_load_store.cpp | 12 +- src/common/snippets/src/pass/insert_loops.cpp | 4 +- .../src/pass/insert_movebroadcast.cpp | 24 +- .../load_movebroadcast_to_broadcastload.cpp | 7 +- .../snippets/src/pass/set_buffer_offset.cpp | 27 ++ .../src/pass/softmax_decomposition.cpp | 170 +++++++++++ .../include/pass/softmax_decomposition.hpp | 29 ++ .../snippets/tests/src/lowering_utils.cpp | 9 +- .../tests/src/pass/softmax_decomposition.cpp | 64 ++++ .../intel_cpu/src/emitters/cpu_generator.cpp | 6 + .../src/emitters/jit_snippets_emitters.cpp | 288 ++++++++++++++++-- .../src/emitters/jit_snippets_emitters.hpp | 101 +++++- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 22 +- src/plugins/intel_cpu/src/nodes/subgraph.h | 4 + .../fuse_load_store_and_convert.cpp | 8 +- .../op/load_convert.cpp | 14 +- .../op/load_convert.hpp | 4 +- .../op/store_convert.cpp | 14 +- .../op/store_convert.hpp | 4 +- .../snippets/softmax.cpp | 55 ++++ .../shared/include/snippets/softmax.hpp | 40 +++ .../plugin/shared/src/snippets/softmax.cpp | 48 +++ .../include/subgraph_lowered.hpp | 9 + .../include/subgraph_softmax.hpp | 36 +++ .../src/subgraph_lowered.cpp | 117 +++++++ .../src/subgraph_softmax.cpp | 28 ++ 48 files changed, 1713 insertions(+), 129 deletions(-) create mode 100644 src/common/snippets/include/snippets/op/buffer.hpp create mode 100644 src/common/snippets/include/snippets/op/fill.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_max.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_sum.hpp create mode 100644 src/common/snippets/include/snippets/op/vector_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/set_buffer_offset.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/src/op/buffer.cpp create mode 100644 src/common/snippets/src/op/fill.cpp create mode 100644 src/common/snippets/src/op/horizon_max.cpp create mode 100644 src/common/snippets/src/op/horizon_sum.cpp create mode 100644 src/common/snippets/src/op/vector_buffer.cpp create mode 100644 src/common/snippets/src/pass/set_buffer_offset.cpp create mode 100644 src/common/snippets/src/pass/softmax_decomposition.cpp create mode 100644 
src/common/snippets/tests/include/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/tests/src/pass/softmax_decomposition.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/softmax.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000000..160bbc3702983a --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief TODO + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + + Buffer(const Output& x, const size_t offset = 0); + Buffer() = default; + + size_t get_offset() const; + void set_offset(const size_t offset); + + // If Buffer has offset this method set this offset to near Load and Store ops + // to correctly read and write data + void propogateOffset(); + + bool visit_attributes(AttributeVisitor& visitor) override { return true; }; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + size_t offset; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000000..3b403e6780506b --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief TODO + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const int64_t offset, const std::string fill_value = "zero"); + Fill() = default; + + int64_t get_offset() const { return m_offset; } + std::string get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const std::string fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + void validate_and_infer_types() override; + +protected: + int64_t m_offset = 0lu; + std::string m_fill_value = "zero"; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000000..6c3b9eb9aecae3 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace 
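For context, a minimal usage sketch of the Buffer op declared above (illustrative only, not part of the patch; producer_output stands for any output feeding the buffer, e.g. a LoopEnd output):

    auto buffer = std::make_shared<ngraph::snippets::op::Buffer>(producer_output);
    buffer->set_offset(64);      // byte offset into the shared scratchpad, assigned later by the SetBufferOffset pass
    buffer->propogateOffset();   // copies that offset to the neighbouring Store/Load ops

The op emits no code of its own (the CPU plugin registers it with a Nop emitter); it only marks an intermediate tensor that lives in a scratchpad instead of a kernel input/output.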
ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief TODO + * @ingroup snippets + */ +class HorizonMax : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000000..ff0ad01760e7a0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief TODO + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 7f53240ae21946..95a1570f36b634 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -15,18 +15,21 @@ namespace op { * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading * where number of elements to load is determined by "count" * Default value is "1" - to load one element + * TODO: ADD DESCRIPTION OF PARAMS * @ingroup snippets */ class Load : public ngraph::op::Op { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; size_t get_count() const { return m_count; } + size_t get_offset() const { return m_offset; } void set_count(const size_t count) { m_count = count; } + void set_offset(const size_t offset) { m_offset = offset; } bool visit_attributes(AttributeVisitor& visitor) override; @@ -40,6 +43,7 @@ class Load : public ngraph::op::Op { protected: size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index 0ff5cc3ec8e063..84ff0248aa6208 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -21,12 +21,14 @@ class Store : public ngraph::op::Op { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; size_t get_count() const { return m_count; } + size_t get_offset() const { return m_offset; } void set_count(const size_t count) { m_count = count; } + void set_offset(const size_t offset) { m_offset = offset; } bool visit_attributes(AttributeVisitor& visitor) 
override; @@ -40,6 +42,7 @@ class Store : public ngraph::op::Op { protected: size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index dfcde2bd4fd2c6..b5de7072f83881 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -89,17 +89,10 @@ class Subgraph : public ngraph::op::Op { return m_generator; } - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return config.m_has_type_relaxed_ops; - } + size_t get_buffer_scratchpad_size() const; + size_t get_additional_data_count() const { return m_additional_data_count; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); @@ -114,7 +107,7 @@ class Subgraph : public ngraph::op::Op { // plugin sets generator for a snippet to some specific generator. // it's going to be replaced with Jitters table later void set_generator(std::shared_ptr generator); - void set_non_scalar_constants_count(const size_t count); + void set_additional_data_count(const size_t count); void print() const; void print_statistics(bool verbose); @@ -128,11 +121,12 @@ class Subgraph : public ngraph::op::Op { private: void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); + void init_config(); // Count of potentional non-scalar Consants that will be created after some tranformations // At the moment it's relevant only for FakeQuantize decomposition // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), // we should MANUALLY calculate it where it needed. 
- size_t m_non_scalar_constants_count = 0; + size_t m_additional_data_count = 0; Shape exec_domain = {}; std::shared_ptr m_body = nullptr; std::shared_ptr m_generator = nullptr; @@ -150,6 +144,8 @@ class Subgraph : public ngraph::op::Op { // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; + // True if we should check runtime info for nodes to call specific needed transformations + bool m_check_rt_info = false; } config; ov::PartialShape master_shape; diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000000..e105fbe8e90ba7 --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief TODO + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp b/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp new file mode 100644 index 00000000000000..23f4c22fe699a2 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SetBufferOffset + * @brief TODO + * NOTE: Should be called after Load/Store insertion and before LoadMoveBroadcastToBroadcastLoad because + * we cannot fuse Load with non-zero offset and MoveBroadcast + * @ingroup snippets + */ +class SetBufferOffset: public ngraph::pass::MatcherPass { +public: + SetBufferOffset(); + +private: + size_t current_offset = 0lu; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..80d7f36f617779 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief TODO + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index 1137de1db0c76c..3fa860ff047e76 100644 --- 
a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,8 +9,12 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" @@ -18,6 +22,7 @@ #include "op/powerstatic.hpp" #include "op/store.hpp" #include "op/loop.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 7095ab4ec19371..3b31d718431dc3 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -22,33 +22,88 @@ auto getRegisters(const std::shared_ptr &n) -> RegInfo { // ToDo: change to reg_t std::vector rin, rout; - + std::cout << n->get_friendly_name() << std::endl; + std::cout << "out: "; for (const auto& output : n->outputs()) { const auto& rt = output.get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) + if (it_rt != rt.end()) { rout.push_back(it_rt->second.as()); + std::cout << rout.back() << " "; + } } - + std::cout << "in: "; for (const auto& input : n->inputs()) { auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) + if (it_rt != rt.end()) { rin.push_back(it_rt->second.as()); + std::cout << rin.back() << " "; + } } + std::cout << std::endl; return std::make_pair(rin, rout); } +auto copyRegInfo(const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo = rt.find("reginfo"); + if (reginfo != rt.end()) { + to.get_rt_info()["reginfo"] = reginfo->second; + } +} + +auto scalarLoopTransformations(NodeVector& scalar_tile) -> void { + NodeVector updated_tile; + auto insertFill = [](const ov::Input& input) -> std::shared_ptr { + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const std::string fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), 1lu, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : scalar_tile) { + // We should fill vector regs by float_min and zero to have + // correct math calculations for ReduceMax and ReduceSum in scalar case. 
+ // We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop + if (ov::is_type(op) || + ov::is_type(op)) { + for (auto i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto load = ov::as_type_ptr(op)) { + load->set_count(1); + } else if (const auto store = ov::as_type_ptr(op)) { + store->set_count(1); + } + updated_tile.push_back(op); + } + + scalar_tile = updated_tile; +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, const void* compile_params) const { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") if (!target->is_supported()) throw ngraph_error("unsupported architecture for code generation"); + const auto& ops = m->get_ordered_ops(); auto params = m->get_parameters(); auto results = m->get_results(); auto in = params.size(); auto out = results.size(); + auto buffer = static_cast(std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } )); std::vector io_last_dims(in + out); std::vector io_data_sizes(in + out); @@ -104,7 +159,6 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrget_ordered_ops(); for (auto op = ops.begin(); op < ops.end(); op++) { const auto& loop_begin = ov::as_type_ptr(*op); // ignore outer loops and possible manual scalar loops @@ -140,19 +194,11 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& n){ - if (const auto load = ov::as_type_ptr(n)) - load->set_count(1); - else if (const auto store = ov::as_type_ptr(n)) - store->set_count(1); - return n; - }); + scalarLoopTransformations(scalar_loop); scalar_loop_end = ov::as_type_ptr(*scalar_loop.rbegin()); scalar_loop_end->set_finalization_offsets(scalar_finalization_offsets); - const auto scalar_work_amount = work_amount % increment; scalar_loop_end->set_increment(1); - scalar_loop_end->set_work_amount(scalar_work_amount); + scalar_loop_end->set_work_amount(work_amount % increment); scalar_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; // ptr increment is applied automatically if there is non-empty outer loop scalar_evaluate_once = optimize_single_evaluation(scalar_loop_end); @@ -180,7 +226,7 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrcompile_params = compile_params; std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel); - kernel->emit_code({in, out}, {}); + kernel->emit_code({in, out, buffer}, {}); OV_ITT_TASK_NEXT(GENERATE, "::EmitData") for (auto& op : lowered) { @@ -195,4 +241,4 @@ std::shared_ptr Generator::get_target_machine() const { } }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000000..e4b44a1bf735e9 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Buffer::Buffer(const Output& x, const size_t offset) : + Op({x}), offset(offset) { + constructor_validate_and_infer_types(); +} + +size_t snippets::op::Buffer::get_offset() const { + return offset; +} + +void 
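A small model of what the Fill op inserted by scalarLoopTransformations is meant to achieve, sketched under the assumption of 8 fp32 lanes per vector register (the patch describes the semantics only in comments):

    #include <cfloat>

    // In the scalar (tail) loop only one lane holds valid data, so the remaining lanes are
    // overwritten with the reduction identity before the accumulating Maximum/Add executes.
    void fill_tail(float lanes[8], int valid_lanes, bool for_max) {
        for (int i = valid_lanes; i < 8; ++i)
            lanes[i] = for_max ? -FLT_MAX    // "float_min" for the ReduceMax accumulator
                               : 0.0f;       // "zero" for the ReduceSum accumulator
    }

This keeps the subsequent horizontal reduction over the whole register mathematically correct.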
snippets::op::Buffer::set_offset(const size_t offset) { + this->offset = offset; +} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), offset); +} + +void snippets::op::Buffer::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +void snippets::op::Buffer::propogateOffset() { + // Propogate to up: to Store. Buffer can have only one Store + const auto parent = get_input_node_shared_ptr(0); + auto store = ov::as_type_ptr(parent); + if (!store) { + if (ov::is_type(parent)) { + const auto index = input(0).get_source_output().get_index(); + store = ov::as_type_ptr(parent->get_input_node_shared_ptr(index)); + } + } + if (store) { + store->set_offset(offset); + } + + // Propogate to down: to Load. Buffer can have several Load + for (const auto target_output : output(0).get_target_inputs()) { + const auto child = target_output.get_node()->shared_from_this(); + auto load = ov::as_type_ptr(child); + if (!load) { + if (ov::is_type(child)) { + const auto index = target_output.get_index(); + for (const auto loop_target_output : child->output(index).get_target_inputs()) { + const auto loop_child = loop_target_output.get_node()->shared_from_this(); + load = ov::as_type_ptr(loop_child); + if (load) { + load->set_offset(offset); + } + } + } + } + if (load) { + load->set_offset(offset); + } + } +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 00000000000000..1846315dc5ceba --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const int64_t offset, const std::string fill_value) + : Op({x}), m_offset(offset), m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000000..47f37559eb07eb --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax); + check_new_args_count(this, new_args); + auto other = std::make_shared(new_args.at(0)); + return other; +} + +void snippets::op::HorizonMax::validate_and_infer_types() { + 
auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000000..9df2600dee745f --- /dev/null +++ b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum); + check_new_args_count(this, new_args); + auto other = std::make_shared(new_args.at(0)); + return other; +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index e33a0a4e50d4ac..4d9d7943b7c2da 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -11,19 +11,21 @@ using namespace std; using namespace ngraph; -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Load::Load(const Output& x, const size_t count, const size_t offset) + : Op({x}), m_count(count), m_offset(offset) { constructor_validate_and_infer_types(); } bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); return true; } std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } void snippets::op::Load::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index db3204df69ab0b..9d5569b03b0ec4 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -11,18 +11,20 @@ using namespace std; using namespace ngraph; -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Store::Store(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) { constructor_validate_and_infer_types(); } bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); return true; } std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Store); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } void snippets::op::Store::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 178a1e97d555ed..733c8e390e33e1 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -17,6 +17,8 @@ #include "snippets/pass/insert_loops.hpp" #include 
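Shape inference above sets the last dimension to 1, i.e. the assumed semantics of HorizonMax/HorizonSum is a horizontal reduction of one vector register to a single value. A scalar reference, illustrative only and again assuming 8 fp32 lanes:

    #include <algorithm>
    #include <numeric>

    float horizon_max(const float lanes[8]) { return *std::max_element(lanes, lanes + 8); }
    float horizon_sum(const float lanes[8]) { return std::accumulate(lanes, lanes + 8, 0.0f); }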
"snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/set_buffer_offset.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -38,20 +40,24 @@ void snippets::op::Subgraph::set_generator(std::shared_ptr body) - : Op(args), m_body(body), m_generator(nullptr) { +void snippets::op::Subgraph::init_config() { const auto ops = m_body->get_ops(); for (const auto& op : ops) { config.m_is_quantized = config.m_is_quantized || ov::is_type(op); + config.m_check_rt_info = config.m_check_rt_info || ov::is_type(op); config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() || snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); } +} +snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) + : Op(args), m_body(body), m_generator(nullptr) { + init_config(); constructor_validate_and_infer_types(); } @@ -164,9 +170,15 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrget_friendly_name(), body_results, body_parameters); auto subgraph = build_subgraph(node, subgraph_inputs, body); + auto hidden_data_count = 0lu; if (auto fq_node = ov::as_type_ptr(node)) { - subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node)); + hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + hidden_data_count++; } + subgraph->set_additional_data_count(hidden_data_count); for (size_t i = 0; i < body->get_parameters().size(); i++) { body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); @@ -306,6 +318,17 @@ PartialShape snippets::op::Subgraph::get_master_shape() { return master_shape; } +size_t snippets::op::Subgraph::get_buffer_scratchpad_size() const { + size_t buffer_size = 0; + const auto ops = m_body->get_ops(); + for (const auto& op : ops) { + if (auto buffer = ov::as_type_ptr(op)) { + buffer_size += ngraph::shape_size(buffer->get_shape()) * buffer->get_element_type().size(); + } + } + return buffer_size; +} + void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { const auto& body_results = m_body->get_results(); @@ -330,7 +353,7 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu // We should insert Convert before Results to return original output element type const auto convert = std::make_shared( - body_results[i]->get_input_node_shared_ptr(0), needed_out_type); + body_results[i]->get_input_node_shared_ptr(0), needed_out_type); body_results[i]->set_argument(0, convert); } // We should align element type inside body using the corresponding pass: @@ -367,10 +390,12 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { return p->get_partial_shape().rbegin()->is_dynamic(); }); ngraph::pass::Manager manager; + manager.register_pass(count); manager.register_pass(); manager.register_pass(); manager.register_pass(count); manager.register_pass(count); + manager.register_pass(); // todo: presently dynamic pipeline is activated even if the last two dimension are static // In general, we can use static kernels in this case, but several parameters (src 
and dst memory pointers for example) // should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional runtime args are required diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000000..1e104682aea6f0 --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer() : Op() { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(VectorBuffer); + check_new_args_count(this, new_args); + return std::make_shared(); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + set_output_type(0, ov::element::f32, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 2358d18c1d311a..e24324e7a8419b 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -11,6 +11,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") using Reg = size_t; using tensor = std::shared_ptr; + const size_t reg_count = 16lu; auto ops = f->get_ordered_ops(); // Note that currently there are 3 types of ops: // * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer? @@ -22,7 +23,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) return gpr2gpr; else if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)) @@ -39,22 +41,57 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr size_t counter_gpr = 0; std::map regs_vec, regs_gpr; // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually - // todo: presently it hold only gpr's. 
If you need to manually assign vec's, implement reg_type or create a second map - std::map manually_assigned_regs; + std::map manually_assigned_gprs, manually_assigned_vecs; const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; const auto num_parameters = f->get_parameters().size(); + const auto num_results = f->get_results().size(); + auto accumulator_reg = 0lu; for (const auto& op : ops) { if (const auto& param = ov::as_type_ptr(op)) { - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_parameter_index(param)); } else if (const auto& result = ov::as_type_ptr(op)) { // here we use the fact that Result input & output tensors are identical by construction - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_result_index(result) + num_parameters); + } else if (const auto& buffer = ov::as_type_ptr(op)) { + // All buffers have one common data pointer + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + } else if (ov::is_type(op) || ov::is_type(op)) { + // Only ReduceMax and ReduceSum in SoftmaxDecomposition use HorizonMax/HorizonSum and VectorBuffer. + // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator + // TODO: We should rewrite accumulator pattern using another way. + const auto input = op->get_input_node_shared_ptr(0); + for (auto i = 0; i < input->get_input_size(); ++i) { + if (ov::is_type(input->get_input_node_shared_ptr(i))) { + manually_assigned_vecs[input->input(i).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + + manually_assigned_vecs[input->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + manually_assigned_vecs[op->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + + // If there is Broadcast, it should have the same register as Horizon op + // because it's result of accumulator as well + for (auto& out : op->output(0).get_target_inputs()) { + const auto child = out.get_node()->shared_from_this(); + if (ov::is_type(child)) { + manually_assigned_vecs[child->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + accumulator_reg++; } } - auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG, &manually_assigned_regs] (const std::shared_ptr& op, + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, decltype(regs_vec)& reg_map, + const std::map& manually_assigned_regs, size_t& counter) { for (const auto& output : op->outputs()) { const auto& t = output.get_tensor_ptr(); @@ -69,11 +106,11 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr switch (t_op.first) { case vec2vec: case gpr2vec: - enumerate_out_tensors(t_op.second, regs_vec, counter_vec); + enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec); break; case gpr2gpr: case vec2gpr: - enumerate_out_tensors(t_op.second, regs_gpr, counter_gpr); + enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr); break; } } @@ -237,15 +274,18 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr }; // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. 
overloaded generator std::set vec_pool; - for (Reg i = 0; i < 16; i++) + for (Reg i = 0; i < reg_count; i++) vec_pool.insert(i); - auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); - std::set gpr_pool(std::move(vec_pool)); - for (const auto& t_reg : manually_assigned_regs) + std::set gpr_pool(vec_pool); + for (const auto& t_reg : manually_assigned_vecs) + vec_pool.erase(t_reg.second); + for (const auto& t_reg : manually_assigned_gprs) gpr_pool.erase(t_reg.second); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); - std::map assigned_regs(std::move(manually_assigned_regs)); + std::map assigned_regs(std::move(manually_assigned_gprs)); + assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, const std::map& unique2reused) { for (const auto& reg : unique_regs) { diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index be4744c80f653d..ea16f733f2a1c4 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -101,7 +101,25 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n); + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic()) + return false; + size_t axis = 0; + const auto rank = n->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(n)) { + axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(n)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + return axis == (rank.get_length() - 1); + }; + + return is_supported_fq_op(n) + || is_supported_unary_eltwise_op(n) + || is_supported_binary_eltwise_op(n) + || is_supported_softmax(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { @@ -467,18 +485,22 @@ TokenizeSnippets::TokenizeSnippets() { // than the actual number of Constants during tokenization. // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) // we should calculate potentional number of non-scalar Constants that will be moved up from body. 
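The manual vector-register assignment above pins VectorBuffer, the Maximum/Add output and the Horizon* input to one register so the reduction accumulates in place across loop iterations. A scalar analogy of that data flow (illustrative; chunk_max is a hypothetical helper, not code from the patch):

    #include <algorithm>
    #include <cfloat>
    #include <cstddef>

    float reduce_max(const float* x, std::size_t n, float (*chunk_max)(const float*)) {
        float acc = -FLT_MAX;                       // VectorBuffer: accumulator initialised outside the loop
        for (std::size_t i = 0; i < n; i += 8)
            acc = std::max(acc, chunk_max(x + i));  // Maximum writes back into the same accumulator
        return acc;                                 // HorizonMax then reads that same accumulator
    }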
- size_t hidden_non_scalar_constant_count = 0; + size_t hidden_data_count = 0; if (const auto fq_node = ov::as_type_ptr(node)) { - hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + hidden_data_count++; } ResultVector body_results; std::vector>> subgraph_result_inputs; for (auto subgraph : input_subgraphs) { - // we should summurize non-scalar Constants count from all input subgraphs - // because we will collapse them with our node and we should get total count of non-scalar Constants - hidden_non_scalar_constant_count += ov::as_type_ptr(subgraph)->get_non_scalar_constants_count(); + // we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs + // because we will collapse them with our node and we should get total count + hidden_data_count += ov::as_type_ptr(subgraph)->get_additional_data_count(); for (auto output : subgraph->outputs()) { bool first_side_consumer = true; @@ -519,13 +541,13 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) { + if (body_parameters.size() + body_results.size() + hidden_data_count > 12) { const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and buffers."; const std::string message_abort = "failed to continue subgraph. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and buffers."; return abort_with_strategy(message_reset, message_abort); } @@ -560,7 +582,7 @@ TokenizeSnippets::TokenizeSnippets() { act_body1->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } subgraph->get_rt_info()["originalLayersNames"] = fusedNames; - subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count); + subgraph->set_additional_data_count(hidden_data_count); remark(1) << "Replacement (merge) done for: " << subgraph->get_friendly_name() diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index 827b1f914a793d..6d4ceae16ac33b 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -20,10 +20,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); - // check if already has Load as an output + // check if already has Load or LoopBegin as an output for (auto output : root->outputs()) { for (auto consumer : output.get_target_inputs()) { - if (ov::is_type(consumer.get_node())) { + if (ov::is_type(consumer.get_node()) || + ov::is_type(consumer.get_node())) { return false; } } @@ -54,14 +55,15 @@ ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); - // check if already has Store as an input + // check if already has Store or LoopEnd as an input for (auto input : root->inputs()) { - if (ov::is_type(input.get_source_output().get_node())) { + if (ov::is_type(input.get_source_output().get_node()) || + ov::is_type(input.get_source_output().get_node())) { return false; } } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index f4192087c2dc41..d52b26056000c1 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -35,7 +35,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr& n) { return n->get_input_partial_shape(0); }); - if (inner_WA > 0) { + /* if (inner_WA > 0) { std::vector apply_increments; apply_increments.reserve(ioShapes.size()); // Inner Loop applies increments if a dimension is not broadcasted @@ -67,7 +67,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr 1) { std::vector apply_increments; diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 499be69e67f062..bb6bdf0a6cb0f4 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -7,6 +7,7 @@ #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include #include @@ -30,6 +31,9 @@ 
std::shared_ptr broadcast_node_last_dim(const ngraph::Output(broadcasted_node, broadcasted_shape); + // BroadcastMove should be immediately executed after broadcasted node. + // For example, to execute Broadcast out of Loop We transfer control dependents + broadcasted_node->add_node_control_dependents(value.get_node_shared_ptr()); } return broadcasted_node; @@ -64,23 +68,25 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ - if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { - if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { - return true; - } + auto is_ignored_node = [](const ov::Output& v){ + if (utils::is_scalar_constant(v.get_node_shared_ptr())) { + return true; + } else if (ov::is_type(v.get_node_shared_ptr())) { + // VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion. + // So we shouldn't insert BroadcastMove + return true; } return false; }; std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector is_ignored; for (const auto& val : values) { input_shapes.emplace_back(val.get_partial_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + is_ignored.push_back(is_ignored_node(val)); // Do not insert MoveBroadcast if any of the last dims is dynamic, // since we don't know if we really need it. In these cases, broadcasting will be performed // by outer Loop based on runtime shapes. - if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) return false; } @@ -89,7 +95,7 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index f3765e471971a2..8380bf7b50f1d4 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -28,9 +28,12 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro const auto param = pm.at(param_pattern).get_node_shared_ptr(); // Cannot rewrite Broadcast + Load if load has more than 1 user - // or more than one input, or if Broadcast has several inputs + // or more than one input, + // or if Broadcast has several inputs, + // or if Load has offset if (input->output(0).get_target_inputs().size() != 1 || - root->inputs().size() != 1 || input->inputs().size() != 1) { + root->inputs().size() != 1 || input->inputs().size() != 1 || + ov::as_type_ptr(input)->get_offset() > 0) { return false; } diff --git a/src/common/snippets/src/pass/set_buffer_offset.cpp b/src/common/snippets/src/pass/set_buffer_offset.cpp new file mode 100644 index 00000000000000..29c82f174e338f --- /dev/null +++ b/src/common/snippets/src/pass/set_buffer_offset.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/set_buffer_offset.hpp" +#include "snippets/op/subgraph.hpp" + + 
+ngraph::snippets::pass::SetBufferOffset::SetBufferOffset() { + MATCHER_SCOPE(SetBufferOffset); + register_matcher(std::make_shared( + ngraph::pattern::wrap_type(), matcher_name), + [&](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetBufferOffset") + auto root = m.get_match_root(); + const auto buffer = ov::as_type_ptr(root); + buffer->set_offset(current_offset); + buffer->propogateOffset(); + current_offset += ngraph::shape_size(buffer->get_shape()) * buffer->get_element_type().size(); + return true; + }); +} diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..dd8fca47a410c5 --- /dev/null +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/op/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include +#include + +namespace { + +inline bool calculate_apply_increment(const size_t inner_master_wa, const size_t inner_target_wa) { + return inner_target_wa != 1 && inner_master_wa != 1; +} + +inline std::vector calculate_apply_increments(const size_t inner_master_wa, const std::vector& shapes) { + std::vector apply_increments(shapes.size(), false); + for (auto i = 0; i < shapes.size(); ++i) { + apply_increments[i] = calculate_apply_increment(inner_master_wa, shapes[i].back()); + } + return apply_increments; +} + +inline int64_t calculate_finalization_offsets(const size_t inner_master_wa, const size_t inner_target_wa) { + return inner_target_wa != 1 ? -inner_master_wa : 0; +} + +} // namespace + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size) { + MATCHER_SCOPE(SoftmaxDecomposition); + register_matcher(std::make_shared( + ngraph::pattern::wrap_type(), matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto root = m.get_match_root(); + const auto master_pshape = root->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (rank.is_dynamic() || master_pshape.is_dynamic()) + return false; + + size_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + const auto shape_rank = rank.get_length(); + if (axis != shape_rank - 1) + return false; + + const auto data = root->get_input_node_shared_ptr(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + // For tail loop we should fill input of Max by float min to avoid math incorrect calculations + auto& max_rt_info = max->input(0).get_rt_info(); + max_rt_info["set_fill"] = std::string("float_min"); + + const auto apply_increments_max = std::vector{ calculate_apply_increment(inner_master_wa, data->get_shape()[inner_dim]), false, false }; + const auto finalization_offsets_max = std::vector{ calculate_finalization_offsets(inner_master_wa, data->get_shape()[inner_dim]), 0, 0 }; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + dimension, work_amount, increment, apply_increments_max, finalization_offsets_max); + // We forse to increment ptrs for optimize_single_evaluation() to reset data ptr because + // the next loop with ReduceSum uses the same ptr + loop_max_end->has_outer_loop = true; + + const auto horizon_max = std::make_shared(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + // For tail loop we should fill input of Sum by zeros to avoid math incorrect calculations + auto& sum_rt_info = sum->input(0).get_rt_info(); + sum_rt_info["set_fill"] = std::string("zero"); + + const auto apply_increments_sum = calculate_apply_increments(inner_master_wa, {load_sub->get_shape(), store_exp->get_shape()}); + const auto finalization_offsets_sum = + std::vector{ has_outer_loop ? 
calculate_finalization_offsets(inner_master_wa, load_sub->get_shape()[inner_dim]) : 0, + calculate_finalization_offsets(inner_master_wa, store_exp->get_shape()[inner_dim]) }; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, dimension, work_amount, increment, apply_increments_sum, finalization_offsets_sum); + // We forse to increment ptrs for optimize_single_evaluation() to reset buffer ptr because + // the next loop with Div uses the same ptr + loop_sum_end->has_outer_loop = true; + + const auto horizon_sum = std::make_shared(sum); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + /* =========================================== */ + + /* ================== Div ==================== */ + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto div = std::make_shared(load_div, horizon_sum); + const auto store_div = std::make_shared(div, increment); + + const auto apply_increments_div = calculate_apply_increments(inner_master_wa, {load_div->get_shape(), store_div->get_shape()}); + auto finalization_offsets_div = std::vector{ 0, 0 }; + if (has_outer_loop) { + finalization_offsets_div[0] = calculate_finalization_offsets(inner_master_wa, load_div->get_shape()[inner_dim]); + finalization_offsets_div[1] = calculate_finalization_offsets(inner_master_wa, store_div->get_shape()[inner_dim]); + } + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, dimension, work_amount, increment, apply_increments_div, finalization_offsets_div); + loop_div_end->has_outer_loop = has_outer_loop; + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(loop_max_end); + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(loop_sum_end); + loop_div_begin->add_control_dependency(horizon_sum); + + /* =========================================== */ + + ngraph::copy_runtime_info(root, + {vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end, + vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, + loop_div_begin, load_div, div, store_div, loop_div_end}); + ngraph::replace_node(root, loop_div_end); + + return true; + }); +} diff --git a/src/common/snippets/tests/include/pass/softmax_decomposition.hpp b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..0057a3eed6c370 --- /dev/null +++ b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + int // Axis +> SoftmaxParams; + +class SoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr 
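The pass above builds three consecutive loops per innermost row. A plain scalar reference of what they compute (illustrative only; the pass emits Loop/Load/Store/Horizon* ops rather than this code):

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <cstddef>

    void softmax_row(const float* x, float* y, std::size_t n) {
        float m = -FLT_MAX;
        for (std::size_t i = 0; i < n; ++i)       // loop 1: ReduceMax -> HorizonMax
            m = std::max(m, x[i]);
        float s = 0.0f;
        for (std::size_t i = 0; i < n; ++i) {     // loop 2: Sub + Exp + ReduceSum,
            y[i] = std::exp(x[i] - m);            //         exp values kept in the Buffer scratchpad
            s += y[i];
        }
        for (std::size_t i = 0; i < n; ++i)       // loop 3: Div
            y[i] /= s;
    }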
snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index ec644e62e514e1..7ab7adfc8d3da8 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -21,7 +21,11 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Add::get_type_info_static()] = dummy_functor; jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; - jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; + jitters[op::v1::Divide::get_type_info_static()] = dummy_functor; + jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor; + jitters[op::v0::Exp::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; @@ -32,6 +36,9 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } void LoweringTests::SetUp() { diff --git a/src/common/snippets/tests/src/pass/softmax_decomposition.cpp b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..b3511fc81eb7ca --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass/softmax_decomposition.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_softmax.hpp" +#include "subgraph_lowered.hpp" + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/insert_load_store.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape; + int axis; + std::tie(inputShape, axis) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void SoftmaxTests::SetUp() { + const size_t count = 10; + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape; + int axis; + std::tie(inputShape, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape}, axis); + master_shape = inputShape; +} + +TEST_P(SoftmaxTests, SoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->get_body(); + 
function_ref = snippets_function->getLowered(); +} + +namespace SoftmaxTestsInstantiation { +std::vector inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1)), + SoftmaxTests::getTestCaseName); + +} // namespace SoftmaxTestsInstantiation +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index 46fb0e88344355..eacf13e614087e 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -45,6 +45,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(ZeroEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); @@ -121,6 +123,10 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported jitters[ngraph::op::v0::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 298811d16d47fa..e903515c80c12a 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -154,19 +154,23 @@ void KernelEmitter::validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (in.size() != 3) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 3, got " << in.size(); if (!out.empty()) IE_THROW() << "KernelEmitter got invalid number of outputs. 
Expected 0, got " << out.size(); - const auto num_params = in[0] + in[1]; + const auto num_params = in[0] + in[1] + in[2]; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount if (data_ptr_regs_idx.size() != num_params) - IE_THROW() << "KernelEmitter arguments are inconsistent with the gpr_regs_used size: in[0] + in[1] = " + IE_THROW() << "KernelEmitter arguments are inconsistent with the gpr_regs_used size: in[0] + in[1] + in[2] = " << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_io, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + // Firstly we should move Buffer ptr + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_io], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad)]); + } // master_shape size must be valid in both static and dynamic cases const int64_t offsetRank = jcp.master_shape.size() - 1; std::function init_ptr_with_offset; @@ -185,16 +189,16 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, return reg != reg_indexes_idx && reg != reg_const_params_idx; }); const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); - Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(static_cast(*spare_corruptable_gpr)); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs[num_io - 1] : Reg64(static_cast(*spare_corruptable_gpr)); size_t i = 0; - for (; i < num_params - last_iter_explicitly; i++) { + for (; i < num_io - last_iter_explicitly; i++) { if (i < num_inputs) h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); else h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); } - // a rare case when num_params is maximal, so we have no spare gprs + // a rare case when num_io is maximal, so we have no spare gprs // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since // it won't be used anymore // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to @@ -215,13 +219,14 @@ void KernelEmitter::emit_impl(const std::vector& in, const size_t num_inputs = in[0]; const size_t num_outputs = in[1]; + const bool is_buffer_needed = static_cast(in[2]); Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; @@ -455,7 +460,12 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + 
const auto store = ov::as_type_ptr(n); + if (!store) + IE_THROW() << "StoreEmitter expect Store op"; + + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -482,7 +492,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -494,7 +504,12 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -521,7 +536,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -571,7 +586,11 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + if (!load) + IE_THROW() << "LoadConvertEmitter expects Load op"; + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -596,7 +615,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -605,7 +624,11 @@ void LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + if (!store) + IE_THROW() << "StoreConvertEmitter expects Store op"; + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -635,12 +658,241 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, 
aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { store_emitter->emit_data(); } +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + prepare_table(); + + const auto shape = n->get_input_partial_shape(0); + is_scalar = shape.is_static() && shape.get_shape().back() == 1; +} + +size_t HorizonMaxEmitter::aux_gprs_count() const { + // We can reuse one GPR reg for table value and accumulator + return is_scalar ? 0 : 1; +} +size_t HorizonMaxEmitter::aux_vecs_count() const { + return is_scalar ? 0 : 1; +} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + + if (is_scalar) { + if (src_vmm.getIdx() != dst_xmm.getIdx()) { + h->uni_vmovups(dst_xmm, src_vmm); + } + } else { + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vmovups(dst_xmm, table_val("float_min")); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); + } +} + +void HorizonMaxEmitter::register_table_entries() { + push_arg_entry_of("float_min", 0xff7fffff, true); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto shape = n->get_input_partial_shape(0); + is_scalar = shape.is_static() && shape.get_shape().back() == 1; +} + +size_t HorizonSumEmitter::aux_gprs_count() const { + return is_scalar ? 0 : 1; +} +size_t HorizonSumEmitter::aux_vecs_count() const { + return is_scalar ? 
0 : 1; +} + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + + if (is_scalar) { + if (src_vmm.getIdx() != dst_xmm.getIdx()) { + h->uni_vmovups(dst_xmm, src_vmm); + } + } else { + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); + } +} + +ZeroEmitter::ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void ZeroEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void ZeroEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (!fill) { + IE_THROW() << "Fill emitter expects Fill op from Snippets opset"; + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) + return 2; + else + return 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = 
typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val(fill_value)); + } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val(fill_value), imm); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("int_one", 0x00000001, true); + push_arg_entry_of("float_one", 0x3f800000, true); + push_arg_entry_of("int32_min", 0xcf000000, true); + push_arg_entry_of("float_min", 0xff7fffff, true); + push_arg_entry_of("int32_max", 0x4effffff, true); + push_arg_entry_of("float_max", 0x7f7fffff, true); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index bb626c9ef79ff6..beccd81e77e409 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -28,6 +28,7 @@ namespace intel_cpu { struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void* buffer_scratchpad = nullptr; }; struct jit_snippets_compile_args { @@ -89,7 +90,7 @@ class KernelEmitter : public jit_container_emitter { const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; @@ -241,6 +242,9 @@ class MemoryEmitter : public jit_emitter { protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -261,7 +265,6 @@ class StoreEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -283,7 +286,6 @@ class LoadEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -322,7 +324,6 @@ class LoadConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -344,8 +345,98 @@ class StoreConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + size_t aux_vecs_count() const 
override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + bool is_scalar = false; +}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + size_t aux_vecs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + bool is_scalar = false; +}; + +class ZeroEmitter : public jit_emitter { +public: + ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + std::string fill_value; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 5026b89e9bbce2..93b518188dddf5 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -245,9 +245,13 @@ void Snippet::optimizeExecDomain(std::vector& inputShapes, std::vect const size_t minimalConcurrency = parallel_get_max_threads(); const size_t minimalJitWorkAmount = 256; const size_t ds = domain.size(); + const auto body = snippet->get_body()->get_ops(); if ( ds <= 2 || // not enough dimensions to collapse domain[ds-1] >= minimalJitWorkAmount || // There is enough work for 1D Tiles, no need to collapse - domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency) // There won't be enough work for every thread (even one iter) if we collapse + domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency || // There won't be enough work for every thread (even one iter) if we collapse + std::any_of(body.begin(), body.end(), [](const std::shared_ptr& op) { // There are operations which don't support dim collapsion + return ov::is_type(op) || ov::is_type(op); + })) return; auto findDimsToCollapse = [&]() { auto collapseLastDims = [](VectorDims& dims, size_t dimsToCollapse) { @@ -373,6 +377,8 @@ void 
Snippet::createPrimitive() { jcp.master_shape = masterShape; std::copy(data_offsets.begin(), data_offsets.end(), jcp.data_offsets); generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_num_threads(), 0); } std::vector Snippet::shapeInfer() const { @@ -498,6 +504,12 @@ void Snippet::prepareParams() { std::copy(s.begin() + offset, s.end(), ns.begin()); new_shapes.emplace_back(std::move(ns)); } + auto ops = snippet->get_body()->get_ops(); + for (auto& op : ops) { + if (auto softmax = ov::as_type_ptr(op)) { + softmax->set_axis(tileRank - 1); + } + } snippet->set_master_shape(PartialShape(scheduler_work_amounts)); snippet->reshape_body(new_shapes); } @@ -520,6 +532,7 @@ void Snippet::execute(dnnl::stream strm) { } jit_snippets_call_args call_args; updateSrcDstPtrs(call_args); + call_args.buffer_scratchpad = buffer_scratchpad.data(); if (tensorRank == rank6D) { schedule_6d(call_args); @@ -583,11 +596,16 @@ void Snippet::generate(const jit_snippets_compile_args* jcp) { void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { const auto& dom = exec_domain; + std::vector per_thread_call_args(parallel_get_num_threads(), call_args); + if (buffer_scratchpad_size > 0) { + for (size_t i = 0; i < per_thread_call_args.size(); ++i) + per_thread_call_args[i].buffer_scratchpad = reinterpret_cast(per_thread_call_args[i].buffer_scratchpad) + i * buffer_scratchpad_size; + } // < N, C, H, W > < 1, 1, N, C*H*W> parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; - schedule.get_callable()(indexes, &call_args); + schedule.get_callable()(indexes, &per_thread_call_args[parallel_get_thread_num()]); }); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 8b657e42805131..4b86d5f5b7b7ee 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -109,6 +109,10 @@ class Snippet : public Node { std::vector start_offset_in = {}; std::vector start_offset_out = {}; + + // TODO + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index 2db9fd9f010de8..021b3f6c1293ec 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index 731c0cb1e1b24a..675c214ed7ae2b 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 572cbf00f521d4..1b1b8988c16784 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& 
destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index e58b5bc678d1f8..6a4180c54299c5 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d0c4a947433b7c..3697af21540915 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: 
OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000000..d978e1b3b5ad5a --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 16}, + ov::Shape{1, 32}, + ov::Shape{1, 1}, + ov::Shape{1, 9}, + ov::Shape{1, 17}, + ov::Shape{1, 19}, + ov::Shape{1, 49}, + ov::Shape{1, 50}, + ov::Shape{5, 16}, + ov::Shape{5, 32}, + ov::Shape{5, 1}, + ov::Shape{5, 9}, + ov::Shape{5, 17}, + ov::Shape{5, 19}, + ov::Shape{5, 49}, + ov::Shape{5, 50}, + ov::Shape{2, 12, 16}, + ov::Shape{2, 12, 32}, + ov::Shape{2, 12, 1}, + ov::Shape{2, 12, 9}, + ov::Shape{2, 12, 17}, + ov::Shape{2, 12, 19}, + ov::Shape{2, 12, 49}, + ov::Shape{2, 12, 50}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp new file mode 100644 index 00000000000000..aa2d73fa84a653 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> SoftmaxParams; + +typedef std::tuple< + std::pair, // Input Shapes + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddSoftmaxParams; + +class Softmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: 
+ static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp new file mode 100644 index 00000000000000..282f902b2e7974 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Softmax::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Softmax::SetUp() { + ov::Shape inputShape; + int axis; + std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::SinhSoftmaxFunction({inputShape}, axis); + function = f.getOriginal(); +} + +TEST_P(Softmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index 69027e96452751..642b372f3306a5 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -8,6 +8,7 @@ #include "snippets_helpers.hpp" #include "subgraph_simple.hpp" #include "subgraph_converts.hpp" +#include "subgraph_softmax.hpp" /* This file provides lowered representations (after the generate() was calles) for some simple functions. * This is required to test snippets lowering and optimization passes. 
All the functions are expected to be direct @@ -51,6 +52,14 @@ class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { std::vector broadcast_shapes; }; +class SoftmaxLoweredFunction : public SoftmaxFunction { +public: + explicit SoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : SoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp new file mode 100644 index 00000000000000..0c1e8836db8479 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +class SoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class SinhSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SinhSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index d04db522a54881..3d5c96c752e0b5 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -105,6 +105,123 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons } return model; } +std::shared_ptr SoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape()}); + + const auto data = input_params.front(); + + const auto master_shape = input_shapes[0].get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + std::shared_ptr load_max = std::make_shared(loop_max_begin->output(0), increment); + if (is_scalar) { + load_max = std::make_shared(load_max, 1, "float_min"); + } + const auto max = std::make_shared(load_max, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + dimension, work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + std::shared_ptr exp = std::make_shared(sub); + if (is_scalar) { + exp = std::make_shared(exp, 1, "zero"); + } + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, dimension, work_amount, increment, apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + std::shared_ptr horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + if (!is_scalar) { + horizon_sum = std::make_shared(horizon_sum, horizon_sum->get_input_partial_shape(0)); + horizon_sum->add_control_dependency(loop_sum_end); + } + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto div = std::make_shared(load_div, horizon_sum); + const auto store_div = std::make_shared(div, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, dimension, work_amount, increment, apply_increments_div, finalization_offsets_div); + loop_div_end->has_outer_loop = has_outer_loop; + loop_div_begin->add_control_dependency(horizon_sum); + + /* =========================================== */ + + auto loop_end = loop_div_end; + if (has_outer_loop) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + loop_end = insertLoopEnd(NodeVector{loop_div_end}, outer_loop_begin, 0, 1, 1, std::vector{true, true}); + } + + auto model = std::make_shared(NodeVector{loop_end}, input_params); + + auto ops = model->get_ordered_ops(); + for (auto op : ops) + std::cout << op->get_friendly_name() << std::endl; + return model; +} } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp new file mode 100644 index 00000000000000..5ca472a8055939 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_softmax.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr SoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto softmax = std::make_shared(data, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr SinhSoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto sinh = std::make_shared(data); + auto softmax = std::make_shared(sinh, axis); + return 
std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file
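
For reference: the SoftmaxDecomposition pass and the lowered test subgraphs above split Softmax into three inner loops over the last dimension — ReduceMax, Sub + Exp + ReduceSum with the exponents stored to a Buffer scratchpad, and a final Div by the horizontal sum. The scalar C++ sketch below shows the equivalent per-row computation; it is illustrative only and not part of the patch (the function and variable names are assumptions).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Reference-only sketch of the computation produced by the decomposition:
// three passes over each row, with an intermediate scratch buffer for exp().
void softmax_rows(const float* src, float* dst, std::size_t rows, std::size_t cols) {
    std::vector<float> scratch(cols);                  // stands in for the Buffer scratchpad
    for (std::size_t r = 0; r < rows; ++r) {           // outer loop (has_outer_loop)
        const float* in = src + r * cols;
        float* out = dst + r * cols;
        // Loop 1: ReduceMax (VectorBuffer initialized with float_min, HorizonMax at the end)
        float max_val = -std::numeric_limits<float>::max();
        for (std::size_t c = 0; c < cols; ++c)
            max_val = std::max(max_val, in[c]);
        // Loop 2: Sub + Exp + ReduceSum; exponents are stored to the scratchpad
        float sum = 0.0f;
        for (std::size_t c = 0; c < cols; ++c) {
            scratch[c] = std::exp(in[c] - max_val);
            sum += scratch[c];
        }
        // Loop 3: Div of every stored exponent by the horizontal sum
        for (std::size_t c = 0; c < cols; ++c)
            out[c] = scratch[c] / sum;
    }
}

The three loops mirror loop_max_*, loop_sum_* and loop_div_* in the pass, and the scratch vector plays the role of the per-thread buffer_scratchpad passed through jit_snippets_call_args.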