From b819ce06eacd83ad38875bfe8bd54ca7c7caa67f Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 29 Sep 2022 16:25:58 +0200 Subject: [PATCH] [Snippets] Added Softmax support --- .../snippets/include/snippets/op/buffer.hpp | 38 +++ .../snippets/include/snippets/op/fill.hpp | 44 ++++ .../include/snippets/op/horizon_max.hpp | 33 +++ .../include/snippets/op/horizon_sum.hpp | 32 +++ .../snippets/include/snippets/op/load.hpp | 6 +- .../snippets/include/snippets/op/store.hpp | 5 +- .../snippets/include/snippets/op/subgraph.hpp | 18 +- .../include/snippets/op/vector_buffer.hpp | 31 +++ .../snippets/pass/softmax_decomposition.hpp | 26 ++ .../include/snippets/snippets_isa.hpp | 5 + src/common/snippets/src/generator.cpp | 67 ++++- src/common/snippets/src/op/buffer.cpp | 35 +++ src/common/snippets/src/op/fill.cpp | 34 +++ src/common/snippets/src/op/horizon_max.cpp | 26 ++ src/common/snippets/src/op/horizon_sum.cpp | 26 ++ src/common/snippets/src/op/load.cpp | 6 +- src/common/snippets/src/op/store.cpp | 6 +- src/common/snippets/src/op/subgraph.cpp | 124 ++++----- src/common/snippets/src/op/vector_buffer.cpp | 26 ++ .../snippets/src/pass/assign_registers.cpp | 45 +++- .../snippets/src/pass/collapse_subgraph.cpp | 11 +- .../snippets/src/pass/insert_load_store.cpp | 32 +-- .../src/pass/insert_movebroadcast.cpp | 15 +- .../src/pass/softmax_decomposition.cpp | 172 +++++++++++++ .../tests/src/softmax_decomposition.cpp | 39 +++ .../intel_cpu/src/emitters/cpu_generator.cpp | 6 + .../src/emitters/jit_load_store_emitters.cpp | 3 +- .../src/emitters/jit_snippets_emitters.cpp | 241 ++++++++++++++++-- .../src/emitters/jit_snippets_emitters.hpp | 95 ++++++- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 31 ++- src/plugins/intel_cpu/src/nodes/subgraph.h | 4 + .../fuse_load_store_and_convert.cpp | 8 +- .../op/load_convert.cpp | 14 +- .../op/load_convert.hpp | 4 +- .../op/store_convert.cpp | 14 +- .../op/store_convert.hpp | 4 +- .../snippets/softmax.cpp | 47 ++++ 
.../shared/include/snippets/softmax.hpp | 49 ++++ .../plugin/shared/src/snippets/softmax.cpp | 80 ++++++ .../include/subgraph_softmax.hpp | 43 ++++ .../src/subgraph_softmax.cpp | 32 +++ 41 files changed, 1412 insertions(+), 165 deletions(-) create mode 100644 src/common/snippets/include/snippets/op/buffer.hpp create mode 100644 src/common/snippets/include/snippets/op/fill.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_max.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_sum.hpp create mode 100644 src/common/snippets/include/snippets/op/vector_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/src/op/buffer.cpp create mode 100644 src/common/snippets/src/op/fill.cpp create mode 100644 src/common/snippets/src/op/horizon_max.cpp create mode 100644 src/common/snippets/src/op/horizon_sum.cpp create mode 100644 src/common/snippets/src/op/vector_buffer.cpp create mode 100644 src/common/snippets/src/pass/softmax_decomposition.cpp create mode 100644 src/common/snippets/tests/src/softmax_decomposition.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/softmax.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000000..9c0e0207a9005f --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace 
ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief TODO + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + + Buffer(const Output& x, const size_t offset = 0); + Buffer() = default; + + size_t get_offset() const; + void set_offset(const size_t offset); + + bool visit_attributes(AttributeVisitor& visitor) override { return true; }; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + size_t offset; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000000..3b403e6780506b --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief TODO + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const int64_t offset, const std::string fill_value = "zero"); + Fill() = default; + + int64_t get_offset() const { return m_offset; } + std::string get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const std::string fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + void validate_and_infer_types() override; + +protected: + int64_t m_offset = 0lu; + std::string m_fill_value = "zero"; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git 
a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000000..6c3b9eb9aecae3 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief TODO + * @ingroup snippets + */ +class HorizonMax : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000000..ff0ad01760e7a0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief TODO + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git 
a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 7f53240ae21946..95a1570f36b634 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -15,18 +15,21 @@ namespace op { * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading * where number of elements to load is determined by "count" * Default value is "1" - to load one element + * TODO: ADD DESCRIPTION OF PARAMS * @ingroup snippets */ class Load : public ngraph::op::Op { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; size_t get_count() const { return m_count; } + size_t get_offset() const { return m_offset; } void set_count(const size_t count) { m_count = count; } + void set_offset(const size_t offset) { m_offset = offset; } bool visit_attributes(AttributeVisitor& visitor) override; @@ -40,6 +43,7 @@ class Load : public ngraph::op::Op { protected: size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index 0ff5cc3ec8e063..84ff0248aa6208 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -21,12 +21,14 @@ class Store : public ngraph::op::Op { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; size_t get_count() const { return m_count; } + size_t get_offset() const { return m_offset; } void set_count(const size_t count) { m_count = count; } + void set_offset(const size_t offset) { m_offset = offset; } bool visit_attributes(AttributeVisitor& visitor) override; @@ 
-40,6 +42,7 @@ class Store : public ngraph::op::Op { protected: size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index dfcde2bd4fd2c6..81158ea3bb9e0a 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -89,17 +89,10 @@ class Subgraph : public ngraph::op::Op { return m_generator; } - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return config.m_has_type_relaxed_ops; - } + size_t get_buffer_scratchpad_size() const; + size_t get_non_scalar_constants_count() const { return m_non_scalar_constants_count; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); @@ -128,6 +121,7 @@ class Subgraph : public ngraph::op::Op { private: void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); + void init_config(); // Count of potentional non-scalar Consants that will be created after some tranformations // At the moment it's relevant only for FakeQuantize decomposition // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), @@ -150,6 +144,8 @@ class Subgraph : public ngraph::op::Op { // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; + // True if we should 
check runtime info for nodes to call specific needed transformations + bool m_check_rt_info = false; } config; ov::PartialShape master_shape; diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000000..e105fbe8e90ba7 --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief TODO + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..80d7f36f617779 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief TODO + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp 
b/src/common/snippets/include/snippets/snippets_isa.hpp index f1c0e9056d66eb..c1b13124eb8646 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,8 +9,12 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" @@ -19,6 +23,7 @@ #include "op/store.hpp" #include "op/tile.hpp" #include "op/tile_scheduler.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 24a92d0c579f41..10465301721894 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -20,41 +20,93 @@ namespace snippets { auto getRegisters(const std::shared_ptr &n) -> RegInfo { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters") + std::cout << n->get_friendly_name() << std::endl; // ToDo: change to reg_t std::vector rin, rout; + std::cout << "out: "; for (const auto& output : n->outputs()) { const auto& rt = output.get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { for (auto reg : it_rt->second.as>()) { rout.push_back(reg); + std::cout << reg << " "; } } } + std::cout << "in: "; for (const auto& input : n->inputs()) { auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { for (auto& reg : it_rt->second.as>()) { rin.push_back(reg); + std::cout << reg << " "; } } } + std::cout << std::endl; return std::make_pair(rin, rout); } +auto copyRegInfo(const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo 
= rt.find("reginfo"); + if (reginfo != rt.end()) { + to.get_rt_info()["reginfo"] = reginfo->second; + } +} + +auto scalarLoopTransformations(NodeVector& scalar_tile) -> void { + NodeVector updated_tile; + auto insertFill = [](const ov::Input& input) -> std::shared_ptr { + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const std::string fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), 1lu, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : scalar_tile) { + if (ov::is_type(op) || + ov::is_type(op)) { + for (auto i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto load = ov::as_type_ptr(op)) { + load->set_count(1); + } else if (const auto store = ov::as_type_ptr(op)) { + store->set_count(1); + } + updated_tile.push_back(op); + } + + scalar_tile = updated_tile; +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, const void* compile_params) const { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") if (!target->is_supported()) throw ngraph_error("unsupported architecture for code generation"); + const auto& ops = m->get_ordered_ops(); auto params = m->get_parameters(); auto results = m->get_results(); auto in = params.size(); auto out = results.size(); + auto buffer = static_cast(std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } )); std::vector io_last_dims(in + out); std::vector io_data_sizes(in + out); @@ -110,7 +162,6 @@ ngraph::snippets::code 
ngraph::snippets::Generator::generate(std::shared_ptrget_ordered_ops(); for (auto op = ops.begin(); op < ops.end(); op++) { const auto& tile_begin = ov::as_type_ptr(*op); // ignore outer tiles and possible manual scalar tiles @@ -145,14 +196,8 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& n){ - if (const auto load = ov::as_type_ptr(n)) - load->set_count(1); - else if (const auto store = ov::as_type_ptr(n)) - store->set_count(1); - return n; - }); + scalarLoopTransformations(scalar_tile); + const auto& scalar_tile_end = ov::as_type_ptr(*scalar_tile.rbegin()); scalar_tile_end->set_finalization_offsets(scalar_finalization_offsets); const auto scalar_work_amount = work_amount % increment; @@ -176,7 +221,7 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrcompile_params = compile_params; std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(tiles2DKernel); - kernel->emit_code({in, out}, {}); + kernel->emit_code({in, out, buffer}, {}); OV_ITT_TASK_NEXT(GENERATE, "::EmitData") for (auto& op : lowered) { @@ -191,4 +236,4 @@ std::shared_ptr Generator::get_target_machine() const { } }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000000..2cf3e194915427 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Buffer::Buffer(const Output& x, const size_t offset) : + Op({x}), offset(offset) { + constructor_validate_and_infer_types(); +} + +size_t snippets::op::Buffer::get_offset() const { + return offset; +} + +void snippets::op::Buffer::set_offset(const size_t offset) { + this->offset = offset; 
+} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), offset); +} + +void snippets::op::Buffer::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 00000000000000..1846315dc5ceba --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const int64_t offset, const std::string fill_value) + : Op({x}), m_offset(offset), m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000000..47f37559eb07eb --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + 
+snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax); + check_new_args_count(this, new_args); + auto other = std::make_shared(new_args.at(0)); + return other; +} + +void snippets::op::HorizonMax::validate_and_infer_types() { + auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000000..9df2600dee745f --- /dev/null +++ b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum); + check_new_args_count(this, new_args); + auto other = std::make_shared(new_args.at(0)); + return other; +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index e33a0a4e50d4ac..4d9d7943b7c2da 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -11,19 +11,21 @@ using namespace std; using namespace ngraph; -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Load::Load(const Output& x, const 
size_t count, const size_t offset) + : Op({x}), m_count(count), m_offset(offset) { constructor_validate_and_infer_types(); } bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); return true; } std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } void snippets::op::Load::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index db3204df69ab0b..9d5569b03b0ec4 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -11,18 +11,20 @@ using namespace std; using namespace ngraph; -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Store::Store(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) { constructor_validate_and_infer_types(); } bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); return true; } std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Store); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } void snippets::op::Store::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 3f4a0377e7d4a0..675c6a35f2463a 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -16,6 +16,7 @@ #include "snippets/pass/vector_to_scalar.hpp" #include 
"snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" +#include "snippets/pass/softmax_decomposition.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -42,16 +43,20 @@ void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) m_non_scalar_constants_count = count; } -snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) - : Op(args), m_body(body), m_generator(nullptr) { +void snippets::op::Subgraph::init_config() { const auto ops = m_body->get_ops(); for (const auto& op : ops) { config.m_is_quantized = config.m_is_quantized || ov::is_type(op); + config.m_check_rt_info = config.m_check_rt_info || ov::is_type(op); config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() || snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); } +} +snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) + : Op(args), m_body(body), m_generator(nullptr) { + init_config(); constructor_validate_and_infer_types(); } @@ -306,6 +311,17 @@ PartialShape snippets::op::Subgraph::get_master_shape() { return master_shape; } +size_t snippets::op::Subgraph::get_buffer_scratchpad_size() const { + size_t buffer_size = 0; + const auto ops = m_body->get_ops(); + for (const auto& op : ops) { + if (auto buffer = ov::as_type_ptr(op)) { + buffer_size += ngraph::shape_size(buffer->get_shape()) * buffer->get_element_type().size(); + } + } + return buffer_size; +} + void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { const auto& body_results = m_body->get_results(); @@ -329,9 +345,11 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu } // We should 
insert Convert before Results to return original output element type - const auto convert = std::make_shared( + if (body_results[i]->get_element_type() != needed_out_type) { + const auto convert = std::make_shared( body_results[i]->get_input_node_shared_ptr(0), needed_out_type); - body_results[i]->set_argument(0, convert); + body_results[i]->set_argument(0, convert); + } } // We should align element type inside body using the corresponding pass: // - Insert Convert before operations that doesn't support original element type for execution @@ -367,6 +385,7 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { return p->get_partial_shape().rbegin()->is_dynamic(); }); ngraph::pass::Manager manager; + manager.register_pass(count); manager.register_pass(); manager.register_pass(); manager.register_pass(count); @@ -406,39 +425,9 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { } } manager.run_passes(m_body); -} - -snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, - const BlockedShapeVector& input_shapes, - const void* compile_params) { - canonicalize(output_shapes, input_shapes); - return generate(compile_params); -} - -snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, - const BlockedShapeVector& input_shapes, - ngraph::pass::Manager& opt, - const void* compile_params) { - canonicalize(output_shapes, input_shapes); - return generate(opt, compile_params); -} - -snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) { - auto mngr = ngraph::pass::Manager(); - return generate(mngr, compile_params); -} - -snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) { - INTERNAL_OP_SCOPE(Subgraph); - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") - NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); - - 
convert_to_snippet_dialect(); - opt.run_passes(m_body); - - // generation flow - snippets::pass::AssignRegisters().run_on_model(m_body); + ov::pass::Serialize("/home/sidorova/work/branch/openvino/graphs/softmax/passes.xml", + "/home/sidorova/work/branch/openvino/graphs/softmax/passes.bin").run_on_model(m_body); if (master_shape.is_static()) { const auto inner_dim = master_shape.size() - 1; @@ -446,7 +435,6 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const auto outer_dim = master_shape.size() - 2; const auto inner_WA = master_shape[inner_dim].get_length(); const auto outer_WA = master_shape.size() >= 2 ? master_shape[outer_dim].get_length() : 1; - // todo: get_lanes() assumes fp32. Could there be any int8 issues? const auto vector_size = m_generator->get_target_machine()->get_lanes(); auto& commonParams = m_body->get_parameters(); @@ -459,9 +447,6 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, [](const std::shared_ptr& n) { return n->get_input_partial_shape(0); }); if (inner_WA > 0) { -// const bool skip_counters = vector_work_amount == vector_size; -// const bool skip_ptr_increments = outer_dim == 1 && skip_counters; - // todo: pass skip_counters and skip_ptr_increments std::vector apply_increments; apply_increments.reserve(ioShapes.size()); // Inner Tile applies increments if a dimension is not broadcasted @@ -477,11 +462,12 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? 
-inner_WA : 0; }); } - const auto& innerTileBegin = insertTileBegin(commonParams); - const auto& innerTileEnd = insertTileEnd(commonResults, innerTileBegin, inner_dim, inner_WA, vector_size, apply_increments, - inner_finalization_offsets); - // set internal flag to enable scalar vs vector tile optimizations - innerTileEnd->has_outer_tile = outer_WA > 1; + + const auto& innerTileBegin = insertTileBegin(commonParams); + const auto& innerTileEnd = insertTileEnd(commonResults, innerTileBegin, inner_dim, inner_WA, vector_size, apply_increments, + inner_finalization_offsets); + // set internal flag to enable scalar vs vector tile optimizations + innerTileEnd->has_outer_tile = outer_WA > 1; } if (outer_WA > 1) { @@ -498,23 +484,41 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, } else { throw ngraph_error("Dynamic case is not supported yet"); } -// for (int i = 0; i < tileEnd->get_output_size(); i++) { -// std::cerr << i << " : "; -// const auto& rt = tileBegin->get_output_tensor(i).get_rt_info(); -// auto it_rt = rt.find("reginfo"); -// if (it_rt != rt.end()) { -// for (auto reg : it_rt->second.as>()) { -// std::cerr << reg << " "; -// } -// } else { -// std::cerr << "reginfo is empty!"; -// } -// std::cerr << "\n"; -// } + m_body->validate_nodes_and_infer_types(); +} + +snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, + const BlockedShapeVector& input_shapes, + const void* compile_params) { + canonicalize(output_shapes, input_shapes); + return generate(compile_params); +} -// std::cerr << "Tile after is dumped"; -// ov::pass::Serialize("tile_after.xml", "tile_after.bin").run_on_model(m_body); +snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, + const BlockedShapeVector& input_shapes, + ngraph::pass::Manager& opt, + const void* compile_params) { + canonicalize(output_shapes, input_shapes); + return generate(opt, compile_params); +} + 
+snippets::Schedule snippets::op::Subgraph::generate(const void* compile_params) { + auto mngr = ngraph::pass::Manager(); + return generate(mngr, compile_params); +} + +snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, const void* compile_params) { + INTERNAL_OP_SCOPE(Subgraph); + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") + NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); + + + convert_to_snippet_dialect(); + opt.run_passes(m_body); + + // generation flow + snippets::pass::AssignRegisters().run_on_model(m_body); // schedule generation should go here and be target agnostic // actual code emission diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000000..1e104682aea6f0 --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer() : Op() { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(VectorBuffer); + check_new_args_count(this, new_args); + return std::make_shared(); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + set_output_type(0, ov::element::f32, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 31e16e4137e3b9..31a891915acd61 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -20,14 +20,20 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr auto ops = 
f->get_ordered_ops(); decltype(ops) stmts; std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) { - return !(std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)); + return !(std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)); }); size_t rdx = 0; std::map, Reg> regs; for (const auto& op : stmts) { for (const auto& output : op->outputs()) { - regs[output.get_tensor_ptr()] = rdx++; + if (regs.find(output.get_tensor_ptr()) == regs.end()) + regs[output.get_tensor_ptr()] = rdx++; } } @@ -140,21 +146,30 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr physical_regs[reg.first] = register_map[reg.second]; } const auto num_parameters = f->get_parameters().size(); - for (const auto& n : f->get_ordered_ops()) { + const auto num_results = f->get_results().size(); + const auto ordered_ops = f->get_ordered_ops(); + for (const auto& n : ordered_ops) { /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. * Note also that Parameter and Result store general-purpose register index, because they work with memory * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are * performed on registers. 
*/ - if (is_type(n)) { + if (is_type(n) || + is_type(n) || + is_type(n)) { continue; } else if (const auto& param = ov::as_type_ptr(n)) { auto& rt = n->get_output_tensor(0).get_rt_info(); rt["reginfo"] = std::vector{static_cast(f->get_parameter_index(param))}; - } else if (const auto& store = ov::as_type_ptr(n)) { - auto& rt = n->get_output_tensor(0).get_rt_info(); - rt["reginfo"] = std::vector{static_cast(f->get_result_index(store) + num_parameters)}; + } else if (const auto& result = ov::as_type_ptr(n)) { + auto& rt = result->get_input_tensor(0).get_rt_info(); + rt["reginfo"] = std::vector{static_cast(f->get_result_index(result) + num_parameters)}; + } else if (const auto& buffer = ov::as_type_ptr(n)) { + auto& in_rt = n->get_output_tensor(0).get_rt_info(); + in_rt["reginfo"] = std::vector{static_cast(num_parameters + num_results)}; + auto& out_rt = n->get_input_tensor(0).get_rt_info(); + out_rt["reginfo"] = std::vector{static_cast(num_parameters + num_results)}; } else { for (const auto& output : n->outputs()) { auto out_tensor = output.get_tensor_ptr(); @@ -164,5 +179,21 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } } + for (const auto& n : ordered_ops) { + if (is_type(n)) { + for (int i = 0; i < n->get_input_size(); i++) { + const auto& rt_info_old = n->input(i).get_source_output().get_tensor().get_rt_info(); + auto& rt_info_new = n->get_output_tensor(i).get_rt_info(); + rt_info_new = rt_info_old; + } + } else if (is_type(n)) { + for (int i = 0; i < n->get_output_size(); i++) { + const auto& rt_info_old = n->get_output_tensor(i).get_rt_info(); + auto& rt_info_new = n->input(i).get_source_output().get_tensor().get_rt_info(); + rt_info_new = rt_info_old; + } + } + } + return false; } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index be4744c80f653d..f42772b573ba69 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ 
b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -101,7 +101,16 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n); + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + const auto softmax = ngraph::as_type_ptr(n); + return softmax && n->get_input_partial_shape(0).rank().is_static() && + softmax->get_axis() == (n->get_input_partial_shape(0).rank().get_length() - 1); + }; + + return is_supported_fq_op(n) + || is_supported_unary_eltwise_op(n) + || is_supported_binary_eltwise_op(n) + || is_supported_softmax(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index 827b1f914a793d..d5ba15bc280ef9 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -20,12 +20,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); - // check if already has Load as an output - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { - if (ov::is_type(consumer.get_node())) { - return false; - } + // check if already has Load or TileBegin as an output + for (auto consumer : root->output(0).get_target_inputs()) { + if (ov::is_type(consumer.get_node()) || + ov::is_type(consumer.get_node())) { + return false; } } @@ -33,12 +32,10 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::copy_runtime_info(root, load); bool rewritten = false; - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { - if (consumer.get_node()->shared_from_this() != load) { - 
consumer.replace_source_output(load); - rewritten |= true; - } + for (auto consumer : root->output(0).get_target_inputs()) { + if (consumer.get_node()->shared_from_this() != load) { + consumer.replace_source_output(load); + rewritten |= true; } } @@ -54,14 +51,13 @@ ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); - // check if already has Store as an input - for (auto input : root->inputs()) { - if (ov::is_type(input.get_source_output().get_node())) { - return false; - } + // check if already has Store or TileEnd as an input + if (ov::is_type(root->get_input_node_shared_ptr(0)) || + ov::is_type(root->get_input_node_shared_ptr(0))) { + return false; } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 34589a604e3ff8..a080538729e635 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -30,6 +30,7 @@ std::shared_ptr broadcast_node_last_dim(const ngraph::Output(broadcasted_node, broadcasted_shape); + // TODO: Add copy of control dependency } return broadcasted_node; @@ -64,23 +65,27 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ + auto is_ignored_node = [](const ov::Output& v){ if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { return true; } + } else if (ov::is_type(v.get_node_shared_ptr())) { + // VectorBuffer has scalar output shape to avoid broadcast conflicts and
manually shape insertion. + // So we shouldn't insert BroadcastMove + return true; } return false; }; std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector is_ignored; for (const auto& val : values) { input_shapes.emplace_back(val.get_partial_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + is_ignored.push_back(is_ignored_node(val)); // Do not insert MoveBroadcast if any of the last dims is dynamic, // since we don't know if we really need it. In these cases, broadcasting will be performed // by TileSchdeuler based on runtime shapes. - if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) return false; } @@ -89,7 +94,7 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..f26efad7a99cc8 --- /dev/null +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,172 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/op/tile_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include + +namespace { + +inline bool calculate_apply_increment(const size_t inner_master_wa, const size_t inner_target_wa) { + return inner_target_wa != 1 && inner_master_wa != 1; +} + +inline std::vector calculate_apply_increments(const size_t inner_master_wa, const std::vector& shapes) { + std::vector 
apply_increments(shapes.size(), false); + for (auto i = 0; i < shapes.size(); ++i) { + apply_increments[i] = calculate_apply_increment(inner_master_wa, shapes[i].back()); + } + return apply_increments; +} + +inline int64_t calculate_finalization_offsets(const size_t inner_master_wa, const size_t inner_target_wa) { + return inner_target_wa != 1 ? -inner_master_wa : 0; +} + +} // namespace + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size) { + const auto data0_pattern = ngraph::pattern::any_input(); + const auto data1_pattern = ngraph::pattern::any_input(); + const auto add_pattern = ngraph::pattern::wrap_type({data0_pattern, data1_pattern}, pattern::consumers_count(1)); + const auto softmax_pattern = ngraph::pattern::wrap_type({add_pattern}); // TODO: add axis? + + MATCHER_SCOPE(SoftmaxDecomposition); + register_matcher(std::make_shared(softmax_pattern, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto pattern_map = m.get_pattern_value_map(); + const auto data0 = pattern_map.at(data0_pattern).get_node_shared_ptr(); + const auto data1 = pattern_map.at(data1_pattern).get_node_shared_ptr(); + const auto bias = pattern_map.at(add_pattern).get_node_shared_ptr(); + const auto softmax = ngraph::as_type_ptr(pattern_map.at(softmax_pattern).get_node_shared_ptr()); + const auto master_pshape = softmax->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (!softmax || rank.is_dynamic() || master_pshape.is_dynamic() || softmax->get_axis() != (rank.get_length() - 1)) + return false; + + const auto data = softmax->input_value(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = master_shape.size() - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = master_shape.size() - 1; + const auto inner_master_wa 
= static_cast(master_shape[inner_dim]); + + /* ====== Add + ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto tile_max_begin = ngraph::snippets::op::insertTileBegin(ngraph::OutputVector{data0, data1}); + + const auto load_add0 = std::make_shared(tile_max_begin->output(0), increment); + const auto load_add1 = std::make_shared(tile_max_begin->output(1), increment); + const auto add = std::make_shared(load_add0, load_add1); + const auto max = std::make_shared(add, vector_buffer_max); + const auto store_add = std::make_shared(add, increment); + + // For tail loop we should fill input of Max by float min to avoid math incorrect calculations + auto& max_rt_info = max->input(0).get_rt_info(); + max_rt_info["set_fill"] = std::string("float_min"); + + const auto horizon_max = std::make_shared(max); + + const auto apply_increments_max = calculate_apply_increments(inner_master_wa, {data0->get_shape(), data1->get_shape(), store_add->get_shape()}); + const auto finalization_offsets_max = std::vector{ 0, 0, calculate_finalization_offsets(inner_master_wa, store_add->get_shape()[inner_dim]) }; + const auto tile_max_end = std::make_shared( + ngraph::OutputVector{store_add, tile_max_begin->output(2)}, dimension, work_amount, increment, apply_increments_max, finalization_offsets_max); + + const auto buffer_add = std::make_shared(tile_max_end->output(0)); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto tile_sum_begin = ngraph::snippets::op::insertTileBegin(ngraph::OutputVector{buffer_add}); + // We should zero buffer and find horizon max before Loop + tile_sum_begin->add_control_dependency(vector_buffer_sum); + tile_sum_begin->add_control_dependency(horizon_max); + + const auto load_sub = std::make_shared(tile_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + 
const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + // For tail loop we should fill input of Sum by zeros to avoid math incorrect calculations + auto& sum_rt_info = sum->input(0).get_rt_info(); + sum_rt_info["set_fill"] = std::string("zero"); + + const auto horizon_sum = std::make_shared(sum); + + // we should increment only one of Load & Store because they work with Buffers that are mapped to one register + const auto apply_increments_sum = std::vector{ calculate_apply_increment(inner_master_wa, load_sub->get_shape()[inner_dim]), false}; + const auto finalization_offsets_sum = std::vector{ calculate_finalization_offsets(inner_master_wa, load_sub->get_shape()[inner_dim]), 0 }; + const auto tile_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, tile_sum_begin->output(1)}, dimension, work_amount, increment, apply_increments_sum, finalization_offsets_sum); + + const auto buffer_exp = std::make_shared(tile_sum_end->output(0)); + + /* =========================================== */ + + /* ================== Div ==================== */ + + const auto tile_div_begin = ngraph::snippets::op::insertTileBegin(ngraph::OutputVector{buffer_exp}); + // We should find horizon sum before Loop + tile_div_begin->add_control_dependency(horizon_sum); + + const auto load_div = std::make_shared(tile_div_begin->output(0), increment); + const auto div = std::make_shared(load_div, horizon_sum); + const auto store_div = std::make_shared(div, increment); + + const auto apply_increments_div = calculate_apply_increments(inner_master_wa, {load_div->get_shape(), store_div->get_shape()}); + const auto finalization_offsets_div = std::vector{ calculate_finalization_offsets(inner_master_wa, load_div->get_shape()[inner_dim]), 0 }; + const auto tile_div_end = std::make_shared( + ngraph::OutputVector{store_div, tile_div_begin->output(1)}, dimension, work_amount, increment, 
apply_increments_div, finalization_offsets_div); + + /* =========================================== */ + + /* ============== Set offsets ================ */ + // For these setters should be created new Pass + // which set offsets regarding Buffers + + const auto buffer_size = ngraph::shape_size(buffer_add->get_shape()) * buffer_add->get_element_type().size(); + store_exp->set_offset(buffer_size); + load_div->set_offset(buffer_size); + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + tile_max_begin->add_control_dependency(vector_buffer_max); + tile_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(tile_max_end); + tile_sum_begin->add_control_dependency(vector_buffer_sum); + tile_sum_begin->add_control_dependency(horizon_max); + tile_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(tile_sum_end); + tile_div_begin->add_control_dependency(horizon_sum); + + /* =========================================== */ + + ngraph::copy_runtime_info( + {bias, softmax}, + {vector_buffer_max, tile_max_begin, load_add0, load_add1, add, max, store_add, horizon_max, tile_max_end, buffer_add, + vector_buffer_sum, tile_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, tile_sum_end, buffer_exp, + tile_div_begin, load_div, div, store_div, tile_div_end}); + ngraph::replace_node(softmax, tile_div_end); + + return true; + }); +} diff --git a/src/common/snippets/tests/src/softmax_decomposition.cpp b/src/common/snippets/tests/src/softmax_decomposition.cpp new file mode 100644 index 00000000000000..c8ae357f3acd23 --- /dev/null +++ b/src/common/snippets/tests/src/softmax_decomposition.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +// todo: 
Rewrite this test using Snippets test infrastructure. See ./include/canonicalization.hpp for example + +TEST(TransformationTests, SoftmaxDecomposition) { + std::shared_ptr f(nullptr); + { + auto data0 = std::make_shared(element::f32, Shape{16, 32, 128}); + auto data1 = std::make_shared(element::f32, Shape{16, 32, 128}); + auto add = std::make_shared(data0, data1); + auto softmax = std::make_shared(add, add->get_shape().size() - 1); + f = std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); + + pass::Manager m; + m.register_pass(); + m.register_pass(16); + m.run_passes(f); + } + + ov::pass::Serialize("/home/sidorova/work/branch/openvino/graphs/softmax/test.xml", + "/home/sidorova/work/branch/openvino/graphs/softmax/test.bin").run_on_model(f); +} diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index 2edbe58a272282..e3b51fb801bcf0 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -45,6 +45,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(ZeroEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); @@ -121,6 +123,10 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported jitters[ngraph::op::v0::Gelu::get_type_info_static()] = 
CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::TileBegin::get_type_info_static()] = CREATE_EMITTER(TileBeginEmitter); diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp index 104ff43553b8ca..1dc6e92f001b56 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp @@ -59,9 +59,10 @@ jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dn bool is_fill, std::string fill_value, emitter_in_out_map in_out_type) : jit_emitter(host, host_isa, exec_prc, in_out_type), load_num_(load_num), src_prc_(src_prc), dst_prc_(dst_prc), is_fill_(is_fill), fill_value_(fill_value), name_("unknown") { - prepare_table(); load_size_ = load_num * src_prc.size(); v_len_elt_ = get_vec_length() / exec_prc.size(); + is_fill_ = is_fill && load_size_ < get_vec_length(); + prepare_table(); } size_t jit_load_emitter::get_inputs_num() const { return 1; } diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 88c8d367ca7a0e..49f97bfd32f3d2 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -131,18 +131,22 @@ void KernelEmitter::validate_arguments(const std::vector &in, const std::vector &out, 
const std::vector &pool, const std::vector &gpr) const { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (in.size() != 3) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 3, got " << in.size(); if (!out.empty()) IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); - const auto num_params = in[0] + in[1]; + const auto num_params = in[0] + in[1] + in[2]; if (gp_regs_used.size() != num_params) IE_THROW() << "KernelEmitter arguments are inconsistent with the gpr_regs_used size: in[0] + in[1] = " << num_params << " gp_regs_used.size() = " << gp_regs_used.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + // Firstly we should move Buffer ptr + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_params], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad)]); + } // master_shape size must be valid in both static and dynamic cases const int64_t offsetRank = jcp.master_shape.size() - 1; std::function init_ptr_with_offset; @@ -157,7 +161,7 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, } }; const bool last_iter_explicitly = gp_regs_pool.empty(); - Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(gp_regs_pool.back()); + Reg64 reg_tmp = last_iter_explicitly ? 
data_ptr_regs[num_params - 1] : Reg64(gp_regs_pool.back()); size_t i = 0; for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) @@ -187,13 +191,14 @@ void KernelEmitter::emit_impl(const std::vector& in, const size_t num_inputs = in[0]; const size_t num_outputs = in[1]; + const bool is_buffer_needed = in[2]; Reg64 reg_indexes = Reg64(abi_param1.getIdx()); Reg64 reg_const_params = Reg64(abi_param2.getIdx()); std::vector data_ptr_regs; transform_idxs_to_regs(gp_regs_used, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. // we need a more elegant approach to avoid a full copy here auto local_gpr_pool = gp_regs_pool; @@ -429,7 +434,12 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + if (!store) + IE_THROW() << "StoreEmitter expect Store op"; + + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -456,7 +466,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -468,7 +478,12 @@ 
LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -495,7 +510,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -545,7 +560,11 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + if (!load) + IE_THROW() << "LoadConvertEmitter expects Load op"; + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -570,7 +589,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -579,7 +598,11 @@ void 
LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + if (!store) + IE_THROW() << "StoreConvertEmitter expects Store op"; + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -609,12 +632,200 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { store_emitter->emit_data(); } +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); 
+ Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + 
+ZeroEmitter::ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void ZeroEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void ZeroEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (!fill) { + IE_THROW() << "Fill emitter expects Fill op from Snippets opset"; + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) + return 2; + else + return 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << 
host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val(fill_value)); + } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val(fill_value), imm); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("zero", 0x00000000, true); + push_arg_entry_of("int_one", 0x00000001, true); + push_arg_entry_of("float_one", 0x3f800000, true); + push_arg_entry_of("int32_min", 0xcf000000, true); + push_arg_entry_of("float_min", 0xff7fffff, true); + push_arg_entry_of("int32_max", 0x4effffff, true); + push_arg_entry_of("float_max", 0x7f7fffff, true); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index 37225fbb1c76bc..daafc7568a5be4 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -28,6 +28,7 @@ namespace intel_cpu { struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void* buffer_scratchpad = nullptr; }; struct 
jit_snippets_compile_args { @@ -87,7 +88,7 @@ class KernelEmitter : public jit_container_emitter { const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; @@ -236,6 +237,9 @@ class MemoryEmitter : public jit_emitter { protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -256,7 +260,6 @@ class StoreEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -278,7 +281,6 @@ class LoadEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -317,7 +319,6 @@ class LoadConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -339,8 +340,92 @@ class StoreConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; 
+}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class ZeroEmitter : public jit_emitter { +public: + ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + std::string fill_value; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 
a63fd0e8065d22..3eabfec3e5e8f0 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -375,6 +375,8 @@ void Snippet::createPrimitive() { jcp.master_shape = masterShape; std::copy(data_offsets.begin(), data_offsets.end(), jcp.data_offsets); generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_num_threads(), 0); } std::vector Snippet::shapeInfer() const { @@ -500,6 +502,12 @@ void Snippet::prepareParams() { std::copy(s.begin() + offset, s.end(), ns.begin()); new_shapes.emplace_back(std::move(ns)); } + auto ops = snippet->get_body()->get_ops(); + for (auto& op : ops) { + if (auto softmax = ov::as_type_ptr(op)) { + softmax->set_axis(tileRank - 1); + } + } snippet->set_master_shape(PartialShape(scheduler_work_amounts)); snippet->reshape_body(new_shapes); } @@ -522,6 +530,7 @@ void Snippet::execute(dnnl::stream strm) { } jit_snippets_call_args call_args; updateSrcDstPtrs(call_args); + call_args.buffer_scratchpad = buffer_scratchpad.data(); if (tensorRank == rank6D) { schedule_6d(call_args); @@ -585,12 +594,24 @@ void Snippet::generate(const jit_snippets_compile_args* jcp) { void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { const auto& dom = exec_domain; + std::vector per_thread_call_args(parallel_get_num_threads(), call_args); + if (buffer_scratchpad_size > 0) { + for (size_t i = 0; i < per_thread_call_args.size(); ++i) + per_thread_call_args[i].buffer_scratchpad = reinterpret_cast(per_thread_call_args[i].buffer_scratchpad) + i * buffer_scratchpad_size; + } // < N, C, H, W > < 1, 1, N, C*H*W> - parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], - [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { - int64_t indexes[] = {d0, d1, d2, d3, d4}; - schedule.get_callable()(indexes, &call_args); - }); + for (auto d0 = 0; d0 < dom[0]; d0++) { + for (auto d1 = 0; d1 < dom[1]; d1++) { + for 
(auto d2 = 0; d2 < dom[2]; d2++) { + for (auto d3 = 0; d3 < dom[3]; d3++) { + for (auto d4 = 0; d4 < dom[4]; d4++) { + int64_t indexes[] = {d0, d1, d2, d3, d4}; + schedule.get_callable()(indexes, &per_thread_call_args[parallel_get_thread_num()]); + } + } + } + } + } } void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 8b657e42805131..4b86d5f5b7b7ee 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -109,6 +109,10 @@ class Snippet : public Node { std::vector start_offset_in = {}; std::vector start_offset_out = {}; + + // TODO + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index 2db9fd9f010de8..021b3f6c1293ec 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index 731c0cb1e1b24a..675c214ed7ae2b 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return 
std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 572cbf00f521d4..1b1b8988c16784 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: 
OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index e58b5bc678d1f8..6a4180c54299c5 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { 
+intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d0c4a947433b7c..3697af21540915 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, 
const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000000..da96fb60c8ebf2 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(-1), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + +const std::vector> inputShape = { + std::pair{ {1, 16}, {1, 16} }, + std::pair{ {1, 32}, {1, 32} }, + std::pair{ {1, 1}, {1, 1} }, + std::pair{ {1, 17}, {1, 17} }, + std::pair{ {1, 53}, {1, 53} }, + std::pair{ {1, 1}, {1, 50} }, + std::pair{ {1, 50}, {1, 1} }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + ::testing::Values(3), // Subgraph + SinH x2 + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSoftmax::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp new file mode 100644 index 00000000000000..3c77fc2ebe373e --- /dev/null +++ 
b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> SoftmaxParams; + +typedef std::tuple< + std::pair, // Input Shapes + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddSoftmaxParams; + +class Softmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class AddSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp new file mode 100644 index 00000000000000..0ad1172168987a --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Softmax::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + int axis; + std::string targetDevice; 
+ size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Softmax::SetUp() { + ov::Shape inputShape; + int axis; + std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::SoftmaxFunction({inputShape}, axis); + function = f.getOriginal(); +} + +std::string AddSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::pair inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS0=" << CommonTestUtils::vec2str(inputShapes.first) << "_"; + result << "IS1=" << CommonTestUtils::vec2str(inputShapes.second) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSoftmax::SetUp() { + std::pair inputShapes; + int axis; + std::tie(inputShapes, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShapes.first, }}, {{}, {inputShapes.second, }}}); + + auto f = ov::test::snippets::AddSoftmaxFunction({inputShapes.first, inputShapes.second}, axis); + function = f.getOriginal(); +} + +TEST_P(Softmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(AddSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp new file mode 100644 index 00000000000000..f39a75485a84c8 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) that will be used + * to test snippets-specific behavior. All the functions are expected to be direct descendants of + * SnippetsFunctionBase, so their constructors take only one (inputShapes) argument. + */ + +namespace ov { +namespace test { +namespace snippets { + +class SoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + int axis; +}; + +class AddSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit AddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + int axis; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp new file mode 100644 index 00000000000000..c2b0bf97a68510 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_softmax.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr SoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto sinh = std::make_shared(data); + auto softmax = std::make_shared(sinh, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr AddSoftmaxFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto sinh0 = std::make_shared(data0); + auto sinh1 = std::make_shared(data1); + auto add = std::make_shared(sinh0, sinh1); + auto softmax = std::make_shared(add, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file