From 79d7facb00b41a1d33f2fe08c2e3a2200be19ed7 Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Sun, 31 Jul 2022 16:18:12 +0100 Subject: [PATCH] Snippets increase subgraph size (#3) - Implement static TileScheduler to handle compile params processing. Now compile params are accessed only here - TileScheduler should emit code only for necessary scalar/vector Tiles - Perform abstract-to-physical register mapping in one place (currently KernelEmitter constructor) - Implement more precise register mapping, so larger subgraphs could be created (now up to 12 i/o regs instead of 7) Increments are invalid in some tests because of TileScheduler optimizations Optimizations fixed, the tests pass Ok Pass increment and dims to op::Tile constructor Added support of Convert FP32, BF16, I8, U8 Fixed original input and output types fixed minor comments Applied first part Applied second part --- .../snippets/include/snippets/emitter.hpp | 2 + .../snippets/include/snippets/generator.hpp | 19 +- .../include/snippets/op/blockedload.hpp | 34 - .../include/snippets/op/blockedparameter.hpp | 36 - .../snippets/op/convert_saturation.hpp | 37 + .../snippets/op/convert_truncation.hpp | 36 + .../snippets/include/snippets/op/load.hpp | 17 +- .../include/snippets/op/scalarload.hpp | 34 - .../include/snippets/op/scalarstore.hpp | 34 - .../snippets/include/snippets/op/store.hpp | 15 +- .../snippets/include/snippets/op/subgraph.hpp | 9 +- .../snippets/include/snippets/op/tile.hpp | 21 +- .../include/snippets/op/tile_scheduler.hpp | 39 + .../include/snippets/op/vectorload.hpp | 34 - .../include/snippets/op/vectorstore.hpp | 34 - .../snippets/pass/assign_registers.hpp | 2 +- .../pass/insert_convert_on_inputs.hpp | 31 + .../snippets/pass/insert_load_store.hpp | 4 +- .../reset_type_relaxed_node_precision.hpp | 31 + .../pass/transform_convert_to_truncation.hpp | 28 + .../snippets/pass/vector_to_scalar.hpp | 20 +- .../include/snippets/snippets_isa.hpp | 9 +- .../include/snippets/snippets_isa_tbl.hpp | 
8 +- src/common/snippets/src/generator.cpp | 61 +- src/common/snippets/src/op/blockedload.cpp | 10 - .../snippets/src/op/convert_saturation.cpp | 19 + .../snippets/src/op/convert_truncation.cpp | 19 + src/common/snippets/src/op/load.cpp | 4 +- src/common/snippets/src/op/scalarload.cpp | 10 - src/common/snippets/src/op/scalarstore.cpp | 10 - src/common/snippets/src/op/store.cpp | 6 +- src/common/snippets/src/op/subgraph.cpp | 106 ++- src/common/snippets/src/op/tile.cpp | 5 +- src/common/snippets/src/op/tile_scheduler.cpp | 10 + src/common/snippets/src/op/vectorload.cpp | 10 - src/common/snippets/src/op/vectorstore.cpp | 10 - .../snippets/src/pass/assign_registers.cpp | 69 +- .../snippets/src/pass/collapse_subgraph.cpp | 60 +- .../src/pass/insert_convert_on_inputs.cpp | 72 ++ .../snippets/src/pass/insert_load_store.cpp | 12 +- .../load_movebroadcast_to_broadcastload.cpp | 2 +- .../reset_type_relaxed_node_precision.cpp | 31 + .../pass/transform_convert_to_truncation.cpp | 34 + .../snippets/src/pass/vector_to_scalar.cpp | 33 +- .../snippets/tests/include/lowering_utils.hpp | 3 +- .../set_scalar_count_for_load_and_store.hpp | 40 ++ .../snippets/tests/src/lowering_utils.cpp | 5 +- .../tests/src/pass/canonicalization.cpp | 4 +- .../tests/src/pass/collapse_subgraph.cpp | 38 + .../set_scalar_count_for_load_and_store.cpp} | 49 +- src/common/snippets/tests/src/registers.cpp | 43 +- .../intel_cpu/src/emitters/cpu_generator.cpp | 20 +- .../intel_cpu/src/emitters/cpu_generator.hpp | 2 + .../src/emitters/jit_conversion_emitters.cpp | 313 ++++++++ .../src/emitters/jit_conversion_emitters.hpp | 87 +++ .../intel_cpu/src/emitters/jit_emitter.cpp | 4 + .../intel_cpu/src/emitters/jit_emitter.hpp | 1 + .../src/emitters/jit_load_store_emitters.cpp | 205 ++++-- .../src/emitters/jit_load_store_emitters.hpp | 15 +- .../src/emitters/jit_snippets_emitters.cpp | 669 ++++++++++++++++++ .../src/emitters/jit_snippets_emitters.hpp | 598 +++++----------- src/plugins/intel_cpu/src/extension.cpp | 3 
+ .../snippets_mark_skipped.cpp | 71 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 112 ++- src/plugins/intel_cpu/src/nodes/subgraph.h | 1 + src/plugins/intel_cpu/src/plugin.cpp | 1 - .../fuse_load_store_and_convert.cpp | 121 ++++ .../fuse_load_store_and_convert.hpp | 38 + .../op/load_store_convert.cpp | 56 ++ .../op/load_store_convert.hpp | 76 ++ .../shared_tests_instances/snippets/add.cpp | 42 +- .../snippets/convert.cpp | 162 +++++ .../snippets/eltwise_two_results.cpp | 25 + .../snippets/max_num_params_eltwise.cpp | 26 + .../snippets/three_inputs_eltwise.cpp | 36 +- .../snippets/two_inputs_and_outputs.cpp | 45 ++ .../functional/single_layer_tests/eltwise.cpp | 2 +- src/plugins/intel_cpu/thirdparty/onednn | 2 +- .../plugin/shared/include/snippets/add.hpp | 15 + .../shared/include/snippets/convert.hpp | 76 ++ .../include/snippets/eltwise_two_results.hpp | 33 + .../snippets/max_num_params_eltwise.hpp | 31 + .../snippets/two_inputs_and_outputs.hpp | 31 + .../plugin/shared/src/snippets/add.cpp | 92 ++- .../plugin/shared/src/snippets/convert.cpp | 231 ++++++ .../src/snippets/eltwise_two_results.cpp | 44 ++ .../src/snippets/max_num_params_eltwise.cpp | 49 ++ .../src/snippets/three_inputs_eltwise.cpp | 72 +- .../src/snippets/two_inputs_and_outputs.cpp | 43 ++ .../src/base/snippets_test_utils.cpp | 5 + .../include/subgraph_converts.hpp | 214 ++++++ .../include/subgraph_lowered.hpp | 3 +- .../include/subgraph_simple.hpp | 75 +- .../src/subgraph_convert.cpp | 241 +++++++ .../src/subgraph_simple.cpp | 93 +++ 95 files changed, 4287 insertions(+), 1103 deletions(-) delete mode 100644 src/common/snippets/include/snippets/op/blockedload.hpp delete mode 100644 src/common/snippets/include/snippets/op/blockedparameter.hpp create mode 100644 src/common/snippets/include/snippets/op/convert_saturation.hpp create mode 100644 src/common/snippets/include/snippets/op/convert_truncation.hpp delete mode 100644 src/common/snippets/include/snippets/op/scalarload.hpp delete mode 100644 
src/common/snippets/include/snippets/op/scalarstore.hpp create mode 100644 src/common/snippets/include/snippets/op/tile_scheduler.hpp delete mode 100644 src/common/snippets/include/snippets/op/vectorload.hpp delete mode 100644 src/common/snippets/include/snippets/op/vectorstore.hpp create mode 100644 src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp create mode 100644 src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp create mode 100644 src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp delete mode 100644 src/common/snippets/src/op/blockedload.cpp create mode 100644 src/common/snippets/src/op/convert_saturation.cpp create mode 100644 src/common/snippets/src/op/convert_truncation.cpp delete mode 100644 src/common/snippets/src/op/scalarload.cpp delete mode 100644 src/common/snippets/src/op/scalarstore.cpp create mode 100644 src/common/snippets/src/op/tile_scheduler.cpp delete mode 100644 src/common/snippets/src/op/vectorload.cpp delete mode 100644 src/common/snippets/src/op/vectorstore.cpp create mode 100644 src/common/snippets/src/pass/insert_convert_on_inputs.cpp create mode 100644 src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp create mode 100644 src/common/snippets/src/pass/transform_convert_to_truncation.cpp create mode 100644 src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp rename src/common/snippets/tests/src/{vector_scalar.cpp => pass/set_scalar_count_for_load_and_store.cpp} (53%) create mode 100644 src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp create mode 100644 src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp create mode 
100644 src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp create mode 100644 src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/convert.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/convert.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp diff --git a/src/common/snippets/include/snippets/emitter.hpp b/src/common/snippets/include/snippets/emitter.hpp index 2ba0f85c5deda8..99c09d9d61d1bf 100644 --- a/src/common/snippets/include/snippets/emitter.hpp +++ b/src/common/snippets/include/snippets/emitter.hpp @@ -51,5 +51,7 @@ class Emitter { virtual ~Emitter() = default; }; +using AllocatedEmitter = std::pair, ngraph::snippets::RegInfo>; + } // namespace snippets } // namespace 
ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index e1a1fdf720a413..b0510c8b13934c 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -18,7 +18,7 @@ auto getRegisters(std::shared_ptr& n) -> ngraph::snippets::RegInfo /** * @interface TargetMachine - * @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emittors + * @brief Base class Target machine representation. Target derives from this class to provide generator information about supported emitters * @ingroup snippets */ class TargetMachine { @@ -41,9 +41,10 @@ class TargetMachine { */ virtual size_t get_lanes() const = 0; + /** - * @brief called by generator to all the emittor for a target machine - * @return a map by node's type info with callbacks to create an instance of emmitter for corresponding operation type + * @brief called by generator to all the emitter for a target machine + * @return a map by node's type info with callbacks to create an instance of emitter for corresponding operation type */ std::function(std::shared_ptr)> get(const ngraph::DiscreteTypeInfo type) const { auto jitter = jitters.find(type); @@ -118,6 +119,18 @@ class Generator { */ code generate(std::shared_ptr& m, const void* compile_params = nullptr) const; + /** + * @brief gets target machine + * @return pointer to constant target machine + */ + std::shared_ptr get_target_machine() const { return target; } + + /** + * @brief gets supported element type for execution + * @return element type + */ + virtual element::Type get_supported_exec_precision() const = 0; + protected: std::shared_ptr target; }; diff --git a/src/common/snippets/include/snippets/op/blockedload.hpp b/src/common/snippets/include/snippets/op/blockedload.hpp deleted file mode 100644 index 
d1ec4c5bdd43dd..00000000000000 --- a/src/common/snippets/include/snippets/op/blockedload.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include "load.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface BlockedLoad - * @brief Generated by Canonicalization step for blocked data (NCHWc) to be loaded - * @ingroup snippets - */ -class BlockedLoad : public Load { -public: - OPENVINO_OP("BlockedLoad", "SnippetsOpset", ngraph::snippets::op::Load); - - BlockedLoad(const Output& x); - BlockedLoad() = default; - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/blockedparameter.hpp b/src/common/snippets/include/snippets/op/blockedparameter.hpp deleted file mode 100644 index 34a080d837fcf8..00000000000000 --- a/src/common/snippets/include/snippets/op/blockedparameter.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface BlockedParameter - * @brief Represents blocked input (NCHWc) for a subgraph - * @ingroup snippets - */ -class BlockedParameter : public ngraph::op::Parameter { -public: - OPENVINO_OP("BlockedParameter", "SnippetsOpset", ngraph::op::Parameter); - - BlockedParameter() = default; - BlockedParameter(const ngraph::element::Type& element_type, const PartialShape& pshape) - : Parameter(element_type, pshape) { - } - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return 
std::make_shared(m_element_type, m_partial_shape); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/convert_saturation.hpp b/src/common/snippets/include/snippets/op/convert_saturation.hpp new file mode 100644 index 00000000000000..8a26c8fb44818a --- /dev/null +++ b/src/common/snippets/include/snippets/op/convert_saturation.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface ConvertSaturation + * @brief The implementation uses "saturation" conversion. + * It means that if the values are outside the limits + * of the maximum and minimum values of the destination data type, they are clamped. + * For example, int32_t ---> int8_t + * 129 ---> 127 + * @ingroup snippets + */ +class ConvertSaturation : public ov::op::v0::Convert { +public: + OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert); + + ConvertSaturation(const Output& x, const ov::element::Type& destination_type); + ConvertSaturation() = default; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool has_evaluate() const override { return false; } +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/convert_truncation.hpp b/src/common/snippets/include/snippets/op/convert_truncation.hpp new file mode 100644 index 00000000000000..aa802dffa673bc --- /dev/null +++ b/src/common/snippets/include/snippets/op/convert_truncation.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface 
ConvertTruncation + * @brief The implementation uses "truncation" conversion. + * It means that if there is an overflow, the values will wrap around. + * For example, int32_t ---> int8_t + * 129 ---> -127 + * @ingroup snippets + */ +class ConvertTruncation : public ov::op::v0::Convert { +public: + OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert); + + ConvertTruncation(const Output& x, const ov::element::Type& destination_type); + ConvertTruncation() = default; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool has_evaluate() const override { return false; } +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index ab9182be9060a9..7f53240ae21946 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -12,20 +12,22 @@ namespace op { /** * @interface Load - * @brief Generated by Canonicalization step where explicit load instruction should be emmiteed - * ScalarLoad == scalar instruction + post increment - * Load (VectorLoad) == vector instruction + post increment - * BroadcastLoad == scalar instruction - post increment - * BlockedLoad == vector instruction - post increment + * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading + * where number of elements to load is determined by "count" + * Default value is "1" - to load one element * @ingroup snippets */ class Load : public ngraph::op::Op { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x); + Load(const Output& x, const size_t count = 1lu); Load() = default; + size_t get_count() const { return m_count; } + + void set_count(const size_t count) { m_count = count; } + bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) 
const override; @@ -35,6 +37,9 @@ class Load : public ngraph::op::Op { OPENVINO_SUPPRESS_DEPRECATED_START bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; OPENVINO_SUPPRESS_DEPRECATED_END + +protected: + size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/scalarload.hpp b/src/common/snippets/include/snippets/op/scalarload.hpp deleted file mode 100644 index 83088064e8bbb7..00000000000000 --- a/src/common/snippets/include/snippets/op/scalarload.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include "load.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface ScalarLoad - * @brief Generated by Canonicalization for a scalar value load to vector register - * @ingroup snippets - */ -class ScalarLoad : public Load { -public: - OPENVINO_OP("ScalarLoad", "SnippetsOpset", ngraph::snippets::op::Load); - - ScalarLoad(const Output& x); - ScalarLoad() = default; - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/scalarstore.hpp b/src/common/snippets/include/snippets/op/scalarstore.hpp deleted file mode 100644 index dc103edf72a7a8..00000000000000 --- a/src/common/snippets/include/snippets/op/scalarstore.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include "store.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface ScalarStore - * @brief Generated by Canonicalization for a scalar value store from vector register - * @ingroup 
snippets - */ -class ScalarStore : public Store { -public: - OPENVINO_OP("ScalarStore", "SnippetsOpset", ngraph::snippets::op::Store); - - ScalarStore(const Output& x); - ScalarStore() = default; - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index fdf0801e06d92e..0ff5cc3ec8e063 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -11,17 +11,23 @@ namespace snippets { namespace op { /** - * @interface Load - * @brief Generated by Canonicalization step where explicit store instruction should be emmiteed + * @interface Store + * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing + * where number of elements to store is determined by "count" + * Default value is "1" - to store one element * @ingroup snippets */ class Store : public ngraph::op::Op { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x); + Store(const Output& x, const size_t count = 1lu); Store() = default; + size_t get_count() const { return m_count; } + + void set_count(const size_t count) { m_count = count; } + bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; @@ -31,6 +37,9 @@ class Store : public ngraph::op::Op { OPENVINO_SUPPRESS_DEPRECATED_START bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; OPENVINO_SUPPRESS_DEPRECATED_END + +protected: + size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp 
b/src/common/snippets/include/snippets/op/subgraph.hpp index 12fc506be926ae..34b8183c69f827 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -90,12 +90,12 @@ class Subgraph : public ngraph::op::Op { snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, - ngraph::pass::Manager& opt, const void* compile_params = nullptr); + ngraph::pass::Manager& opt, const ov::element::Type exec_type = ngraph::element::f32, const void* compile_params = nullptr); snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, - const void* compile_params = nullptr); + const ov::element::Type exec_type = ngraph::element::f32, const void* compile_params = nullptr); snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); - Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const ov::element::Type exec_type); // plugin sets generator for a snippet to some specific generator. 
// it's going to be replaced with Jitters table later @@ -107,8 +107,11 @@ class Subgraph : public ngraph::op::Op { void serialize() const; static auto wrap_node_as_subgraph(const std::shared_ptr& node) -> std::shared_ptr; + static void fill_empty_output_names(const Output& target_output_node, const Output& replacement_output_node); private: + void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes, + const ov::element::Type exec_type); void convert_to_snippet_dialect(); Shape exec_domain; std::shared_ptr m_body; diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp index 9620b81421fdff..ac1d6ef4d1a2b9 100644 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ b/src/common/snippets/include/snippets/op/tile.hpp @@ -20,14 +20,27 @@ class Tile : public ngraph::op::Op { public: OPENVINO_OP("Tile", "SnippetsOpset"); - Tile(const std::vector, ngraph::snippets::RegInfo>>& region); + /// \brief Construct a Tile + /// \param region The vector of pairs: emitters and the corresponding registers + /// \param increment Tile size - count of elements to load and store. 
+ /// Vector Tile should have size of vector register and Scalar Tile should have 1 + /// \param num_inputs Count of inputs + /// \param num_outputs Count of outputs + /// \param io_dims Vector of last dimensions of inputs and outputs + /// \param io_data_sizes Vector of data type sizes of inputs and outputs + Tile(const std::vector& region, size_t increment, size_t num_inputs, size_t num_outputs, + const std::vector& io_dims, const std::vector& io_data_sizes); Tile() = default; - std::vector, ngraph::snippets::RegInfo>> region; + std::vector region; + size_t increment = 0; + size_t num_inputs = 0; + size_t num_outputs = 0; + std::vector io_dims {}; + std::vector io_data_size {}; std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region); + return std::make_shared(region, increment, num_inputs, num_outputs, io_dims, io_data_size); } - const void *compile_params; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp new file mode 100644 index 00000000000000..9d6010f77978b0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/tile_scheduler.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "tile.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface TileScheduler + * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations + * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data + * have to be read several times (broadcasting). 
+ * @ingroup snippets + */ +class TileScheduler : public ngraph::op::Op { +public: + OPENVINO_OP("TileScheduler", "SnippetsOpset"); + + TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); + TileScheduler() = default; + AllocatedEmitter vector_region; + AllocatedEmitter scalar_region; + // todo: this clone_with_new_inputs is irrelevant + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { + return std::make_shared(vector_region, scalar_region); + } + const void *compile_params; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/vectorload.hpp b/src/common/snippets/include/snippets/op/vectorload.hpp deleted file mode 100644 index a4a45ae9eb9803..00000000000000 --- a/src/common/snippets/include/snippets/op/vectorload.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include "load.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface VectorLoad - * @brief Generated by Canonicalization for a vector value load to vector register - * @ingroup snippets - */ -class VectorLoad : public Load { -public: - OPENVINO_OP("VectorLoad", "SnippetsOpset", ngraph::snippets::op::Load); - - VectorLoad(const Output& x); - VectorLoad() = default; - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/vectorstore.hpp b/src/common/snippets/include/snippets/op/vectorstore.hpp deleted file mode 100644 index 7d55d28f2b8611..00000000000000 --- a/src/common/snippets/include/snippets/op/vectorstore.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 
(C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include "store.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface VectorStore - * @brief Generated by Canonicalization for a vector value store from vector register - * @ingroup snippets - */ -class VectorStore : public Store { -public: - OPENVINO_OP("VectorStore", "SnippetsOpset", ngraph::snippets::op::Store); - - VectorStore(const Output& x); - VectorStore() = default; - - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { - check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/assign_registers.hpp b/src/common/snippets/include/snippets/pass/assign_registers.hpp index fb3672fe389536..0eff4bcc7d7033 100644 --- a/src/common/snippets/include/snippets/pass/assign_registers.hpp +++ b/src/common/snippets/include/snippets/pass/assign_registers.hpp @@ -18,7 +18,7 @@ namespace pass { */ class AssignRegisters : public ngraph::pass::FunctionPass { public: - AssignRegisters() { + explicit AssignRegisters() { set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true); } bool run_on_model(const std::shared_ptr& m) override; diff --git a/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp b/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp new file mode 100644 index 00000000000000..d92ff619ede46f --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_convert_on_inputs.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertConvertOnInputs + * @brief Inserts ConvertSaturation op 
after Parameters and Scalars to convert data type of inputs + * to supported execution data type. + * Note: ConvertSaturation op isn't covered by specification of "Convert" op + * This op is used for conversion into and from FP32 after the corresponding Load + * and before Store to calculate in FP32 inside subgraph body in CPU Plugin + * @ingroup snippets + */ +class InsertConvertOnInputs: public ngraph::pass::MatcherPass { +public: + InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp index f26416e6ecd27a..dc1bf6b3e68717 100644 --- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp +++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp @@ -19,7 +19,7 @@ namespace pass { */ class InsertLoad: public ngraph::pass::MatcherPass { public: - InsertLoad(); + InsertLoad(const size_t count = 1lu); }; /** @@ -30,7 +30,7 @@ class InsertLoad: public ngraph::pass::MatcherPass { */ class InsertStore: public ngraph::pass::MatcherPass { public: - InsertStore(); + InsertStore(const size_t count = 1lu); }; diff --git a/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp b/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp new file mode 100644 index 00000000000000..03e9db7aa78a87 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/reset_type_relaxed_node_precision.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ResetTypeRelaxedNodePrecision + * @brief Reset precision for type relaxed nodes inside body to align precision between nodes. 
+ * Should be called after all Convert insertions + * @ingroup snippets + */ +class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0"); + ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32); + bool run_on_model(const std::shared_ptr& m) override; +private: + ov::element::Type exec_type; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp b/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp new file mode 100644 index 00000000000000..219d0bf0d73244 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/transform_convert_to_truncation.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface TransformConvertToConvertTruncation + * @brief Transform Convert to ConvertTruncation with specification conversion rules + * Note: ConvertTruncation op is covered by specification of "Convert" op + * This op is used for real Convert ops inside subgraph body in CPU Plugin + * @ingroup snippets + */ +class TransformConvertToConvertTruncation: public ngraph::pass::MatcherPass { +public: + TransformConvertToConvertTruncation(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp b/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp index e8534cd22a2bfd..bbf428a4393dcb 100644 --- a/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp +++ b/src/common/snippets/include/snippets/pass/vector_to_scalar.hpp @@ -12,27 +12,27 @@ namespace snippets { namespace pass { /** - * @interface ReplaceLoadsWithScalarLoads - * @brief Replaces vector loads with
scalar versions. - * The pass is used to cange element type of function in a canonical form vector to scalar. + * @interface SetScalarCountForLoad + * @brief Set count `1` for Load to represent as ScalarLoad + * The pass is used to change the element count of a Load to "1" so that it loads a scalar value * Used for tail generation * @ingroup snippets */ -class ReplaceLoadsWithScalarLoads: public ngraph::pass::MatcherPass { +class SetScalarCountForLoad: public ngraph::pass::MatcherPass { public: - ReplaceLoadsWithScalarLoads(); + SetScalarCountForLoad(); }; /** - * @interface ReplaceStoresWithScalarStores - * @brief Replaces vector stores with scalar versions. - * The pass is used to cange element type of model in a canonical form vector to scalar. + * @interface SetScalarCountForStore + * @brief Set count `1` for Store to represent as ScalarStore + * The pass is used to change the element count of a Store to "1" so that it stores a scalar value * Used for tail generation * @ingroup snippets */ -class ReplaceStoresWithScalarStores: public ngraph::pass::MatcherPass { +class SetScalarCountForStore: public ngraph::pass::MatcherPass { public: - ReplaceStoresWithScalarStores(); + SetScalarCountForStore(); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index da94fec0980d3a..f1c0e9056d66eb 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -7,21 +7,18 @@ #include "ngraph/ops.hpp" #include -#include "op/blockedload.hpp" -#include "op/blockedparameter.hpp" #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/convert_saturation.hpp" +#include "op/convert_truncation.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" -#include "op/scalarload.hpp" -#include "op/scalarstore.hpp" #include "op/powerstatic.hpp" #include "op/store.hpp" #include "op/tile.hpp"
-#include "op/vectorload.hpp" -#include "op/vectorstore.hpp" +#include "op/tile_scheduler.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 53504a469e9a48..255a4f3a5e23d1 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -11,14 +11,9 @@ // SnippetS dialect NGRAPH_OP(Load, ngraph::snippets::op) -NGRAPH_OP(ScalarLoad, ngraph::snippets::op) -NGRAPH_OP(VectorLoad, ngraph::snippets::op) -NGRAPH_OP(BlockedLoad, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(Store, ngraph::snippets::op) -NGRAPH_OP(ScalarStore, ngraph::snippets::op) -NGRAPH_OP(VectorStore, ngraph::snippets::op) NGRAPH_OP(BroadcastMove, ngraph::snippets::op) NGRAPH_OP(Scalar, ngraph::snippets::op) @@ -29,9 +24,10 @@ NGRAPH_OP(Nop, ngraph::snippets::op) // opset completeness NGRAPH_OP(Constant, ngraph::op) NGRAPH_OP(Parameter, ngraph::op::v0) -NGRAPH_OP(BlockedParameter, ngraph::snippets::op) NGRAPH_OP(Result, ngraph::op::v0) NGRAPH_OP(Broadcast, ngraph::op::v1) +NGRAPH_OP(ConvertTruncation, ngraph::snippets::op) +NGRAPH_OP(ConvertSaturation, ngraph::snippets::op) // unary NGRAPH_OP(Abs, ngraph::op::v0) diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 44a69470134279..0e85fe72861a21 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -17,7 +17,8 @@ auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph: auto rt = n->get_rt_info(); // ToDo: change to reg_t - std::vector rout; + std::vector rin, rout; + auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { for (auto reg : it_rt->second.as>()) { @@ -25,12 +26,11 @@ auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph: } } - std::vector rin; - for (auto input : n->inputs()) { + for (const auto& 
input : n->inputs()) { auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { + for (auto& reg : it_rt->second.as>()) { rin.push_back(reg); } } @@ -48,51 +48,56 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrget_results(); auto in = params.size(); auto out = results.size(); - auto nptrs = in + out; + std::vector io_last_dims(in + out); + std::vector io_data_sizes(in + out); + std::transform(params.begin(), params.end(), io_last_dims.begin(), + [](const std::shared_ptr& n){return n->get_output_shape(0).back();}); + std::transform(results.begin(), results.end(), io_last_dims.begin() + in, + [](const std::shared_ptr& n){return n->get_input_shape(0).back();}); + std::transform(params.begin(), params.end(), io_data_sizes.begin(), + [](const std::shared_ptr& n){return n->get_element_type().size();}); + std::transform(results.begin(), results.end(), io_data_sizes.begin() + in, + [](const std::shared_ptr& n){return n->get_element_type().size();}); OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") // vector tile - std::vector, ngraph::snippets::RegInfo>> lowered; + std::vector lowered; for (auto n : m->get_ordered_ops()) { - lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); + lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); } OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") // scalar tile auto m_scalar = ov::clone_model(*m.get()); ngraph::pass::Manager mng; - mng.register_pass(); - mng.register_pass(); + mng.register_pass(); + mng.register_pass(); mng.run_passes(m_scalar); OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector, RegInfo>> scalar_lowered; + std::vector scalar_lowered; for (auto n : m_scalar->get_ordered_ops()) { - 
scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); + scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); } - OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D") - + OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D"); // wrapping into tiles1D - std::vector, RegInfo>> tiles1D; - auto tile = std::make_shared(lowered); - tile->compile_params = compile_params; - tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector({target->get_lanes(), 0, nptrs, 1}), std::vector{}))); - tile = std::make_shared(scalar_lowered); - tile->compile_params = compile_params; - tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector{{1, target->get_lanes(), nptrs, 1}}, std::vector{}))); + //todo: in, out, and io_last_dims should derive naturally from the graph representation + const auto& vector_tile = std::make_shared(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes); + const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), + std::make_pair(std::vector{}, std::vector{})); + const auto& scalar_tile = std::make_shared(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes); + const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), + std::make_pair(std::vector{}, std::vector{})); OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") // wrapping into tiles2D - std::vector, RegInfo>> tiles2D; - tile = std::make_shared(tiles1D); - tile->compile_params = compile_params; - tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector({1, 0, nptrs, 0}), std::vector{}))); + auto tile_scheduler = std::make_shared(vector_region, scalar_region); + 
tile_scheduler->compile_params = compile_params; + const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), + std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") // emission - auto tiles2DKernel = std::make_shared(tiles2D); + auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); tiles2DKernel->compile_params = compile_params; std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); kernel->emit_code({in, out}, {}); diff --git a/src/common/snippets/src/op/blockedload.cpp b/src/common/snippets/src/op/blockedload.cpp deleted file mode 100644 index 013977b591a6dc..00000000000000 --- a/src/common/snippets/src/op/blockedload.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/blockedload.hpp" - -using namespace ngraph; - -snippets::op::BlockedLoad::BlockedLoad(const Output& x) : Load(x) { -} diff --git a/src/common/snippets/src/op/convert_saturation.cpp b/src/common/snippets/src/op/convert_saturation.cpp new file mode 100644 index 00000000000000..115f127dae5626 --- /dev/null +++ b/src/common/snippets/src/op/convert_saturation.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/convert_saturation.hpp" + +#include "ngraph/runtime/host_tensor.hpp" + +ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output& x, const ov::element::Type& destination_type) + : ov::op::v0::Convert({x}, destination_type) { +} + +std::shared_ptr ngraph::snippets::op::ConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(ConvertSaturation_clone_with_new_inputs); + check_new_args_count(this, new_args); + return 
std::make_shared(new_args.at(0), m_destination_type); +} diff --git a/src/common/snippets/src/op/convert_truncation.cpp b/src/common/snippets/src/op/convert_truncation.cpp new file mode 100644 index 00000000000000..a009dc7d5618ad --- /dev/null +++ b/src/common/snippets/src/op/convert_truncation.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/convert_truncation.hpp" + +#include "ngraph/runtime/host_tensor.hpp" + +ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output& x, const ov::element::Type& destination_type) + : ov::op::v0::Convert({x}, destination_type) { +} + +std::shared_ptr ngraph::snippets::op::ConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(ConvertTruncation_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_destination_type); +} diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index b1eb539340f5e4..1ac4df725fe75d 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -11,7 +11,7 @@ using namespace std; using namespace ngraph; -snippets::op::Load::Load(const Output& x) : Op({x}) { +snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { constructor_validate_and_infer_types(); } @@ -22,7 +22,7 @@ bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); + return std::make_shared(new_args.at(0), m_count); } void snippets::op::Load::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/scalarload.cpp b/src/common/snippets/src/op/scalarload.cpp deleted file mode 100644 index 
83277647223616..00000000000000 --- a/src/common/snippets/src/op/scalarload.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/scalarload.hpp" - -using namespace ngraph; - -snippets::op::ScalarLoad::ScalarLoad(const Output& x) : Load(x) { -} diff --git a/src/common/snippets/src/op/scalarstore.cpp b/src/common/snippets/src/op/scalarstore.cpp deleted file mode 100644 index ee333bfffa2e92..00000000000000 --- a/src/common/snippets/src/op/scalarstore.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/scalarstore.hpp" - -using namespace ngraph; - -snippets::op::ScalarStore::ScalarStore(const Output& x) : Store(x) { -} diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index 1c2d62948bc5b6..db3204df69ab0b 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -4,14 +4,14 @@ #include -#include "snippets/op/scalarstore.hpp" +#include "snippets/op/store.hpp" #include using namespace std; using namespace ngraph; -snippets::op::Store::Store(const Output& x) : Op({x}) { +snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { constructor_validate_and_infer_types(); } @@ -22,7 +22,7 @@ bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Store); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0)); + return std::make_shared(new_args.at(0), m_count); } void snippets::op::Store::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index c2359af2b958d5..980e3dcdba1705 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ 
b/src/common/snippets/src/op/subgraph.cpp @@ -6,6 +6,7 @@ #include "snippets/remarks.hpp" #include "snippets/op/subgraph.hpp" +#include "snippets/op/convert_saturation.hpp" #include "snippets/pass/insert_load_store.hpp" #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/pass/load_movebroadcast_to_broadcastload.hpp" @@ -13,8 +14,15 @@ #include "snippets/pass/convert_constants_to_scalars.hpp" #include "snippets/pass/convert_power_to_powerstatic.hpp" #include "snippets/pass/vector_to_scalar.hpp" +#include "snippets/pass/transform_convert_to_truncation.hpp" +#include "snippets/pass/insert_convert_on_inputs.hpp" +#include "snippets/pass/reset_type_relaxed_node_precision.hpp" + +#include "transformations/common_optimizations/nop_elimination.hpp" +#include "transformations/utils/utils.hpp" #include +#include "ngraph/pass/constant_folding.hpp" #include #include @@ -92,6 +100,9 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrclone_with_new_inputs(body_inputs); body_node->set_friendly_name(node->get_friendly_name()); + for (size_t i = 0; i < node->get_output_size(); i++) { + fill_empty_output_names(body_node->output(i), node->output(i)); + } if (node->get_output_size() != body_node->get_output_size()) { throw ngraph::ngraph_error("original node outputs size and extracted subgraph node outputs size doesn't much"); @@ -118,6 +129,20 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr& target_output_node, const Output& replacement_output_node) { + NGRAPH_SUPPRESS_DEPRECATED_START + auto out_tensor = target_output_node.get_tensor_ptr(); + const std::string new_name = ngraph::op::util::get_ie_output_name(replacement_output_node); + if (out_tensor->get_name().empty()) { + out_tensor->set_name(new_name); + } + if (!replacement_output_node.get_names().empty()) { + out_tensor->set_names(replacement_output_node.get_names()); + } + NGRAPH_SUPPRESS_DEPRECATED_END +} + /// /// \brief Canonization transforms original 
subgraph and to canonical form suitable for code generation. In particular, /// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization @@ -125,7 +150,8 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr + -Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { +/// Also there is precision aligning inside body of subgraph during canonicalization +Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes, const ov::element::Type exec_type) { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize") NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(), @@ -176,7 +202,8 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), "Failed to create broadcastable shapes in snippets canonicalization"); const auto paramShape = m_body->get_parameters()[i]->get_shape(); - if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin())) + const auto paramType = m_body->get_parameters()[i]->get_element_type(); + if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()) || paramType != inType) m_body->replace_parameter(i, std::make_shared(inType, inShape)); } @@ -213,21 +240,78 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape ::ngraph::op::AutoBroadcastType::NUMPY); NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable"); } + + // We should insert Converts after Parameters and Constant and before Results + // to align precision inside Subgraph body that is supported by Plugin + 
align_element_types(outputShapes, inputShapes, exec_type); + exec_domain = outPShape.get_shape(); return exec_domain; } +void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, + const BlockedShapeVector& inputShapes, + const ov::element::Type exec_type) { + ngraph::pass::Manager p_manager; + p_manager.register_pass(); + p_manager.run_passes(m_body); + + const auto& body_results = m_body->get_results(); + for (size_t i = 0; i < outputShapes.size(); i++) { + const auto needed_out_type = std::get<2>(outputShapes[i]); + + // If there is real Convert from graph (ConvertTruncation) before Result + // we should check destination type and insert ConvertSaturation before that if needed. + // For example, to return original element type after Convert insertion on inputs + std::shared_ptr first_convert = body_results[i]; + while (ov::is_type(first_convert->get_input_node_ptr(0))) { + first_convert = first_convert->get_input_node_shared_ptr(0); + } + if (auto existing_convert_t = ngraph::as_type_ptr(first_convert)) { + const auto original_input_element_type = existing_convert_t->get_input_element_type(0); + if (original_input_element_type != exec_type) { + const auto convert = std::make_shared( + existing_convert_t->get_input_node_shared_ptr(0), original_input_element_type); + existing_convert_t->set_argument(0, convert); + } + } + + // We should insert Convert before Results to return original output element type + const auto convert = std::make_shared( + body_results[i]->get_input_node_shared_ptr(0), needed_out_type); + body_results[i]->set_argument(0, convert); + } + + // After Convert insertion we should make the following steps: + // - insert ConvertSaturation after inputs and scalar to start aligning of exec data type inside body + // - manually set output element types of type relaxed nodes to align element type inside subgraph body + // - after Convert insertion on inputs and after scalars we should use ConstantFolding pass to convert 
+ // element type of Scalars before inference + // - eliminate redundant Convert that could have been inserted + ngraph::pass::Manager manager; + manager.register_pass(exec_type); + manager.register_pass(exec_type); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(m_body); +} + void snippets::op::Subgraph::convert_to_snippet_dialect() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect") auto skip_matching_domain = [](const std::shared_ptr& n) -> bool { return n->get_input_shape(0).back() != 1; }; + + // At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes. + // Then we are going to support variadic Load/Store with different element count + const size_t count = m_generator->get_target_machine()->get_lanes(); + ngraph::pass::Manager manager; manager.register_pass(); manager.register_pass(); - manager.register_pass(); - manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); manager.register_pass(); manager.register_pass(); // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for @@ -246,28 +330,30 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { // Result // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile. 
if (!exec_domain.empty() && exec_domain.back() != 1) { - manager.register_pass(); - manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.get_pass_config()-> - set_callback(skip_matching_domain); + set_callback(skip_matching_domain); manager.get_pass_config()-> - set_callback(skip_matching_domain); + set_callback(skip_matching_domain); } manager.run_passes(m_body); } snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, + const ov::element::Type exec_type, const void* compile_params) { - canonicalize(output_shapes, input_shapes); + canonicalize(output_shapes, input_shapes, exec_type); return generate(compile_params); } snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, + const ov::element::Type exec_type, const void* compile_params) { - canonicalize(output_shapes, input_shapes); + canonicalize(output_shapes, input_shapes, exec_type); return generate(opt, compile_params); } diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp index c17b0b0c8163c5..b37e212fdcf88d 100644 --- a/src/common/snippets/src/op/tile.cpp +++ b/src/common/snippets/src/op/tile.cpp @@ -8,5 +8,8 @@ using namespace std; using namespace ngraph; -snippets::op::Tile::Tile(const std::vector, snippets::RegInfo>>& nested) : Op(), region(nested) { +snippets::op::Tile::Tile(const std::vector& region, size_t increment, + size_t num_inputs, size_t num_outputs, + const std::vector& io_dims, const std::vector& io_data_sizes) : + Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) { } diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp new file mode 100644 index 00000000000000..fd0ba9e6a23223 --- /dev/null +++ 
b/src/common/snippets/src/op/tile_scheduler.cpp @@ -0,0 +1,10 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/tile_scheduler.hpp" +#include "snippets/generator.hpp" + +ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region) + : Op(), vector_region{vector_region}, scalar_region{scalar_region} { +} diff --git a/src/common/snippets/src/op/vectorload.cpp b/src/common/snippets/src/op/vectorload.cpp deleted file mode 100644 index 333b310d6cb88e..00000000000000 --- a/src/common/snippets/src/op/vectorload.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/vectorload.hpp" - -using namespace ngraph; - -snippets::op::VectorLoad::VectorLoad(const Output& x) : Load(x) { -} diff --git a/src/common/snippets/src/op/vectorstore.cpp b/src/common/snippets/src/op/vectorstore.cpp deleted file mode 100644 index fb4b4e76ef2311..00000000000000 --- a/src/common/snippets/src/op/vectorstore.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/vectorstore.hpp" - -using namespace ngraph; - -snippets::op::VectorStore::VectorStore(const Output& x) : Store(x) { -} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index d5703cc2905739..291b60e7cd809b 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -16,7 +16,6 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") - int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 using Reg = 
size_t; auto ops = f->get_ordered_ops(); decltype(ops) stmts; @@ -26,8 +25,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr size_t rdx = 0; std::map, Reg> regs; - for (auto op : stmts) { - for (auto output : op->outputs()) { + for (const auto& op : stmts) { + for (const auto& output : op->outputs()) { regs[output.get_tensor_ptr()] = rdx++; } } @@ -35,9 +34,9 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::vector> used; std::vector> def; - for (auto op : stmts) { + for (const auto& op : stmts) { std::set u; - for (auto input : op->inputs()) { + for (const auto& input : op->inputs()) { if (regs.count(input.get_tensor_ptr())) { u.insert(regs[input.get_tensor_ptr()]); } @@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::set d; if (!std::dynamic_pointer_cast(op)) { - for (auto output : op->outputs()) { + for (const auto& output : op->outputs()) { d.insert(regs[output.get_tensor_ptr()]); } } @@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr for (size_t n = 0; n < stmts.size(); n++) { auto node = stmts[n]; if (!std::dynamic_pointer_cast(node)) { - for (auto out : node->outputs()) { - for (auto port : out.get_target_inputs()) { + for (const auto& out : node->outputs()) { + for (const auto& port : out.get_target_inputs()) { auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); if (pos != stmts.end()) { auto k = pos-stmts.begin(); @@ -136,46 +135,32 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::map, Reg> physical_regs; - for (auto reg : regs) { + for (const auto& reg : regs) { physical_regs[reg.first] = register_map[reg.second]; } - - size_t constantID = 0; - - for (auto n : f->get_ordered_ops()) { + const auto num_parameters = f->get_parameters().size(); + for (const auto& n : f->get_ordered_ops()) { auto& rt = 
n->get_rt_info(); - // nothing to do for model signature - if (std::dynamic_pointer_cast(n) || std::dynamic_pointer_cast(n)) { - continue; - } - - // store only effective address - if (auto result = std::dynamic_pointer_cast(n)) { - auto ea = reg64_tmp_start+static_cast(f->get_result_index(result) + f->get_parameters().size()); - rt["effectiveAddress"] = ea; + std::vector regs; + regs.reserve(n->outputs().size()); + /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are + * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. + * Note also that Parameter and Result store general-purpose register index, because they work with memory + * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are + * performed on registers. + */ + if (is_type(n)) { continue; - } - // store effective address and procced with vector registers - if (ov::as_type_ptr(n) || ov::as_type_ptr(n)) { - auto source = n->get_input_source_output(0).get_node_shared_ptr(); - - if (auto param = ov::as_type_ptr(source)) { - auto ea = reg64_tmp_start+static_cast(f->get_parameter_index(param)); - rt["effectiveAddress"] = ea; - } else if (auto constant = ov::as_type_ptr(source)) { - auto ea = reg64_tmp_start+static_cast(f->get_parameters().size() + f->get_results().size() + 1 + constantID); - rt["effectiveAddress"] = ea; - constantID++; - } else { - throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant"); + } else if (const auto& param = ov::as_type_ptr(n)) { + regs.push_back(f->get_parameter_index(param)); + } else if (const auto& store = ov::as_type_ptr(n)) { + regs.push_back(f->get_result_index(store) + num_parameters); + } else { + for (const auto& output : n->outputs()) { + auto allocated = physical_regs[output.get_tensor_ptr()]; + regs.push_back(allocated); } } - - std::vector regs; 
regs.reserve(n->outputs().size()); - for (auto output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); - } rt["reginfo"] = regs; } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 49cb66b610ee8f..20acb0b35237b0 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -99,15 +99,17 @@ auto is_layout_oblivious(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n) || ov::is_type(n) - || ov::is_type(n); + || ov::is_type(n) + || ov::is_type(n); }; return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { auto supported = [](descriptor::Tensor& t) -> bool { - return t.get_element_type() == ngraph::element::f32 && - t.get_partial_shape().is_static(); + static const std::set supported_data_types = + { ngraph::element::f32, ngraph::element::i32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 }; + return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0; }; const auto & inputs = n->inputs(); const auto & outputs = n->outputs(); @@ -148,19 +150,9 @@ auto update_out_tensor_name(std::shared_ptr &sub for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) { for (const auto &in : subgraph->get_output_target_inputs(i)) { if (ov::is_type(in.get_node())) { - auto out_tensor = subgraph->output(i).get_tensor_ptr(); - NGRAPH_SUPPRESS_DEPRECATED_START - if (out_tensor->get_name().empty()) { - const auto& body_result = subgraph->get_body()->get_output_op(i); - const auto& body_result_input = body_result->get_input_source_output(0); - // Note that create_ie_output_name() checks only deprecated output.get_tensor().get_name() - // However output.get_tensor().get_names() should also be updated - if 
(!body_result_input.get_names().empty()) - out_tensor->add_names(body_result_input.get_names()); - std::string newTensorName = ngraph::op::util::get_ie_output_name(body_result_input); - out_tensor->set_name(newTensorName); - } - NGRAPH_SUPPRESS_DEPRECATED_END + const auto& body_result = subgraph->get_body()->get_output_op(i); + const auto& body_result_input = body_result->get_input_source_output(0); + op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input); not_set = false; break; } @@ -406,6 +398,40 @@ TokenizeSnippets::TokenizeSnippets() { auto& input_body = clones[input_node]; size_t source_output_index = input_value.get_index(); auto source_result = input_body->get_results()[source_output_index]; + + // We cannot add new node, that is not Convert, after Convert (that is start node) to avoid arithmetic problems with conversion + // We can add any new node in Subgraph after Convert (because after Input) + // Parameter + // | + // Convert + // + // We cannot add new node, that isn't Convert, in Subgraph after existing Convert + // Parameter + // Relu + // Convert + // + // But we can add new Convert in Subgraph after existing Convert + // Parameter + // Relu + // Convert + // Convert + // + // Thus, we can grow the subgraph only if Convert is the first node of the subgraph, and have to abort if it's the last one and we want to add a non-Convert node + // We have this limitation because at the moment we support only one execution precision inside body, so + // if there is Convert with input and output data types that aren't equal to supported exec type, + // we can get conversion math errors + const auto output_of_subgraph = source_result->get_input_node_shared_ptr(0); + if (!ov::is_type(node) && ov::is_type(output_of_subgraph)) { + // Also we can add new node after < Parameter -> Convert -> Convert -> Convert > + auto grandparent = output_of_subgraph->get_input_node_ptr(0); + while (ov::is_type(grandparent)) { + grandparent = grandparent->get_input_node_ptr(0);
+ } + + if (!ov::is_type(grandparent)) { + return abort_with_strategy("Convert supports only as Input and as Result of subgraph. Aborting"); + } + } // Result op has a single input internal_inputs.push_back(source_result->input_value(0)); } else { @@ -477,7 +503,7 @@ TokenizeSnippets::TokenizeSnippets() { throw ngraph_error("body results and node results size mismatch during subgraph collaps"); } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() > 7) { + if (body_parameters.size() + body_results.size() > 12) { const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + diff --git a/src/common/snippets/src/pass/insert_convert_on_inputs.cpp b/src/common/snippets/src/pass/insert_convert_on_inputs.cpp new file mode 100644 index 00000000000000..4647949145e44a --- /dev/null +++ b/src/common/snippets/src/pass/insert_convert_on_inputs.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/insert_convert_on_inputs.hpp" +#include "snippets/snippets_isa.hpp" + +#include "ngraph/type.hpp" +#include "ngraph/node.hpp" + +#include +#include +#include +#include + +// We should recursively (after full sequences of ConvertTruncation) go through inputs and +// insert ConvertSaturation with supported element type before eltwises +// NOTE: JUST EXAMPLE: +// Parameter I8 +// ConvertTruncation U8 +// / | \ +// ConvertTruncation F32 ConvertTruncation I32 ConvertTruncation BF16 +// Eltwise ConvertSaturation FP32 ConvertTruncation I32 +// <> Eltwise ConvertSaturation FP32 +// <> Eltwise +bool insertConvertSaturationAfterNode(const
std::shared_ptr& node, const ov::element::Type element_type) { + bool rewritten = false; + for (const auto& output : node->outputs()) { + for (auto consumer : output.get_target_inputs()) { + const auto output_shared_node = consumer.get_node()->shared_from_this(); + // Go down through ConvertTruncation sequence + if (auto existing_convert_t = ov::as_type_ptr(output_shared_node)) { + rewritten = insertConvertSaturationAfterNode(existing_convert_t, element_type); + continue; + } + + // Check if ConvertSaturation already exists with supported element type or not and insert ConvertSaturation with supported element type + auto existing_convert_s = ov::as_type_ptr(output_shared_node); + if ((!existing_convert_s && !ov::is_type(output_shared_node) && consumer.get_element_type() != element_type) || + (existing_convert_s && existing_convert_s->get_destination_type() != element_type)) { + const auto convert = std::make_shared(node, element_type); + consumer.replace_source_output(convert); + rewritten |= true; + } + } + } + return rewritten; +} + +ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) { + MATCHER_SCOPE(InsertConvertOnInputs); + + auto param_pattern = ngraph::pattern::wrap_type(); + auto scalar_pattern = pattern::wrap_type( + [=](Output output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; }); + auto input = std::make_shared(OutputVector{ param_pattern, scalar_pattern }); + + ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs") + auto root = m.get_match_root(); + + auto rewritten = insertConvertSaturationAfterNode(root, exec_type); + + return rewritten; + }; + + auto m = std::make_shared(input, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/insert_load_store.cpp 
b/src/common/snippets/src/pass/insert_load_store.cpp index 417458571d6168..827b1f914a793d 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -12,11 +12,11 @@ #include #include -ngraph::snippets::pass::InsertLoad::InsertLoad() { +ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { MATCHER_SCOPE(InsertLoad); register_matcher(std::make_shared( ngraph::pattern::wrap_type(), matcher_name), - [this](ngraph::pattern::Matcher &m) { + [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); @@ -29,7 +29,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() { } } - auto load = std::make_shared (root); + auto load = std::make_shared(root, count); ngraph::copy_runtime_info(root, load); bool rewritten = false; @@ -46,11 +46,11 @@ ngraph::snippets::pass::InsertLoad::InsertLoad() { }); } -ngraph::snippets::pass::InsertStore::InsertStore() { +ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { MATCHER_SCOPE(InsertStore); register_matcher(std::make_shared( ngraph::pattern::wrap_type(), matcher_name), - [this](ngraph::pattern::Matcher &m) { + [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); @@ -61,7 +61,7 @@ ngraph::snippets::pass::InsertStore::InsertStore() { } } - auto store = std::make_shared (root->input_value(0)); + auto store = std::make_shared (root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index cf6eb80e484c33..ce632f33608514 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ 
b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -15,7 +15,7 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBroadcastLoad() { MATCHER_SCOPE(LoadMoveBroadcastToBroadcastLoad); auto param_pattern = ngraph::pattern::wrap_type(); - auto load_pattern = std::make_shared(param_pattern); + auto load_pattern = ngraph::pattern::wrap_type({param_pattern}); auto fbn = std::make_shared(load_pattern, Shape{1}); register_matcher(std::make_shared(fbn, matcher_name), diff --git a/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp b/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp new file mode 100644 index 00000000000000..9cb89933ab0f0e --- /dev/null +++ b/src/common/snippets/src/pass/reset_type_relaxed_node_precision.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/convert_saturation.hpp" +#include "snippets/pass/reset_type_relaxed_node_precision.hpp" +#include "ngraph_ops/type_relaxed.hpp" + +#include + + +ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { } + +bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr &m) { + RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision); + bool rewritten = false; + for (auto& op : m->get_ordered_ops()) { + if (auto node = std::dynamic_pointer_cast(op)) { + for (int i = 0; i < op->outputs().size(); i++) { + node->set_overridden_output_type(exec_type, i); + rewritten |= true; + } + } else { + op->validate_and_infer_types(); + } + } + + return rewritten; +} diff --git a/src/common/snippets/src/pass/transform_convert_to_truncation.cpp b/src/common/snippets/src/pass/transform_convert_to_truncation.cpp new file mode 100644 index 00000000000000..3ba93d74b4c023 --- /dev/null +++ 
b/src/common/snippets/src/pass/transform_convert_to_truncation.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/transform_convert_to_truncation.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include + +ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() { + MATCHER_SCOPE(TransformConvertToConvertTruncation); + register_matcher(std::make_shared( + ngraph::pattern::wrap_type()), + [this](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation") + const auto root = m.get_match_root(); + const auto convert = ngraph::as_type_ptr(root); + if (!convert) + return false; + + auto convert_truncation = std::make_shared(convert->get_input_source_output(0), + convert->get_destination_type()); + convert_truncation->set_friendly_name(convert->get_friendly_name()); + ngraph::copy_runtime_info(convert, convert_truncation); + ngraph::replace_node(convert, convert_truncation); + + return true; + }); +} \ No newline at end of file diff --git a/src/common/snippets/src/pass/vector_to_scalar.cpp b/src/common/snippets/src/pass/vector_to_scalar.cpp index 0af4d084f73f36..b8de68eafd8258 100644 --- a/src/common/snippets/src/pass/vector_to_scalar.cpp +++ b/src/common/snippets/src/pass/vector_to_scalar.cpp @@ -7,40 +7,43 @@ #include "snippets/pass/vector_to_scalar.hpp" #include "snippets/snippets_isa.hpp" -#include #include #include -ngraph::snippets::pass::ReplaceLoadsWithScalarLoads::ReplaceLoadsWithScalarLoads() { - MATCHER_SCOPE(ReplaceLoadsWithScalarLoads); +ngraph::snippets::pass::SetScalarCountForLoad::SetScalarCountForLoad() { + MATCHER_SCOPE(SetScalarCountForLoad); register_matcher(std::make_shared( ngraph::pattern::wrap_type(), matcher_name), [this](ngraph::pattern::Matcher &m) { - 
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceLoadsWithScalarLoads_callback") + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForLoad_callback") auto root = m.get_match_root(); if (transformation_callback(root)) return false; - auto load = std::make_shared (root->input_value(0)); - load->set_friendly_name(root->get_friendly_name()); - ngraph::copy_runtime_info(root, load); - ngraph::replace_node(root, load); + + const auto load = ov::as_type_ptr(root); + if (!load) + return false; + + load->set_count(1lu); return true; }); } -ngraph::snippets::pass::ReplaceStoresWithScalarStores::ReplaceStoresWithScalarStores() { - MATCHER_SCOPE(ReplaceStoresWithScalarStores); +ngraph::snippets::pass::SetScalarCountForStore::SetScalarCountForStore() { + MATCHER_SCOPE(SetScalarCountForStore); register_matcher(std::make_shared( ngraph::pattern::wrap_type(), matcher_name), [this](ngraph::pattern::Matcher &m) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ReplaceStoresWithScalarStores_callback") + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetScalarCountForStore_callback") auto root = m.get_match_root(); if (transformation_callback(root)) return false; - auto store = std::make_shared (root->input_value(0)); - store->set_friendly_name(root->get_friendly_name()); - ngraph::copy_runtime_info(root, store); - ngraph::replace_node(root, store); + + const auto store = ov::as_type_ptr(root); + if (!store) + return false; + + store->set_count(1lu); return true; }); } diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 7d822e5853438a..1551f4fe99e311 100644 --- a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -29,12 +29,13 @@ class DummyTargetMachine : public ngraph::snippets::TargetMachine { 
DummyTargetMachine(); bool is_supported() const override { return true; } ngraph::snippets::code get_snippet() const override { return nullptr; } - size_t get_lanes() const override { return 1; } + size_t get_lanes() const override { return 10; } }; class DummyGenerator : public ngraph::snippets::Generator { public: DummyGenerator() : ngraph::snippets::Generator(std::make_shared()) {} + element::Type get_supported_exec_precision() const override { return ov::element::f32; } }; class LoweringTests : public TransformationTestsF { diff --git a/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp b/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp new file mode 100644 index 00000000000000..2bc13f3290b30c --- /dev/null +++ b/src/common/snippets/tests/include/pass/set_scalar_count_for_load_and_store.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +/* The main purpose is to test that: + * - Load/Store ops are inserted + * - Load + BroadcastMove fuses to BroadcastLoad (not the main focus, but still had to cover; overlaps with insert_movebroadcast.cpp) + * - Proper Load/Stores are converted to scalar form to avoid invalid memory access by vector tile + * (temporarily disabled, since corresponding PR is not merged yet) + */ + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + Shape, // Input shape 2 + Shape, // Broadcast shape 0 + Shape, // Broadcast shape 1 + Shape // Broadcast shape 2 +> insertLoadStoreParams; + +class InsertLoadStoreTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +}
// namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index bdbfe41d6dd45c..4aab86d5d7c07c 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -23,18 +23,15 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Store::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; } std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp index 08ced11370cd7b..a9126c0f4216d1 100644 --- a/src/common/snippets/tests/src/pass/canonicalization.cpp +++ b/src/common/snippets/tests/src/pass/canonicalization.cpp @@ -49,7 +49,9 @@ TEST_P(CanonicalizationTests, Add) { function = snippets_function->getOriginal(); function_ref = snippets_function->getReference(); auto 
subgraph = getTokenizedSubgraph(function); - Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); + subgraph->set_generator(std::make_shared()); + const auto exec_type = subgraph->get_generator()->get_supported_exec_precision(); + Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes, exec_type); ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape); } diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp index 2a1e107df3c107..3e578119b25d19 100644 --- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "snippets/pass/collapse_subgraph.hpp" namespace ov { @@ -39,6 +40,43 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) { run(); } +TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) { + const auto &f = ConvertFunction(std::vector{{2, 5}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) { + const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) { + const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) { + const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { + const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, + 
std::vector{ov::element::i8, ov::element::bf16, ov::element::f32}, + std::vector{ov::element::f32, ov::element::i8}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/vector_scalar.cpp b/src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp similarity index 53% rename from src/common/snippets/tests/src/vector_scalar.cpp rename to src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp index 7f46b9f01bc7e1..9305faa50119be 100644 --- a/src/common/snippets/tests/src/vector_scalar.cpp +++ b/src/common/snippets/tests/src/pass/set_scalar_count_for_load_and_store.cpp @@ -19,56 +19,81 @@ using namespace ngraph; // todo: Rewrite this test using Snippets test infrastructure. See ./include/canonicalization.hpp for example -TEST(TransformationTests, ReplaceLoadsWithScalarLoads) { +template +size_t get_count(const std::shared_ptr& f, const std::string& name) { + size_t load_count = std::numeric_limits::max(); + for (auto op : f->get_ops()) { + if (op->get_friendly_name() == name) { + load_count = ov::as_type_ptr(op)->get_count(); + } + } + return load_count; +} + +TEST(TransformationTests, SetScalarCountForLoad) { std::shared_ptr f(nullptr), f_ref(nullptr); + const auto count = 16; { auto data = std::make_shared(element::f32, Shape{2, 2}); - auto load = std::make_shared(data); + auto load = std::make_shared(data, count); + load->set_friendly_name("load"); auto neg = std::make_shared(load); - auto store = std::make_shared(neg); + auto store = std::make_shared(neg, count); f = std::make_shared(NodeVector{store}, ParameterVector{data}); pass::Manager m; m.register_pass(); - m.register_pass(); + m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } { auto data = std::make_shared(element::f32, Shape{2, 2}); - auto load = std::make_shared(data); + auto load = 
std::make_shared(data, 1lu); + load->set_friendly_name("load_ref"); auto neg = std::make_shared(load); - auto store = std::make_shared(neg); + auto store = std::make_shared(neg, count); f_ref = std::make_shared(NodeVector{store}, ParameterVector{data}); } auto res = compare_functions(f, f_ref); ASSERT_TRUE(res.first) << res.second; + + auto load_count = get_count(f, "load"); + auto load_count_ref = get_count(f_ref, "load_ref"); + ASSERT_EQ(load_count, load_count_ref); } -TEST(TransformationTests, ReplaceStoresWithScalarStores) { +TEST(TransformationTests, SetScalarCountForStore) { std::shared_ptr f(nullptr), f_ref(nullptr); + const auto count = 16; { auto data = std::make_shared(element::f32, Shape{2, 2}); - auto load = std::make_shared(data); + auto load = std::make_shared(data, count); auto neg = std::make_shared(load); - auto store = std::make_shared(neg); + auto store = std::make_shared(neg, count); + store->set_friendly_name("store"); f = std::make_shared(NodeVector{store}, ParameterVector{data}); pass::Manager m; m.register_pass(); - m.register_pass(); + m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } { auto data = std::make_shared(element::f32, Shape{2, 2}); - auto load = std::make_shared(data); + auto load = std::make_shared(data, count); auto neg = std::make_shared(load); - auto store = std::make_shared(neg); + auto store = std::make_shared(neg, 1lu); + store->set_friendly_name("store_ref"); f_ref = std::make_shared(NodeVector{store}, ParameterVector{data}); } auto res = compare_functions(f, f_ref); ASSERT_TRUE(res.first) << res.second; + + int64_t store_count = get_count(f, "store"); + int64_t store_count_ref = get_count(f_ref, "store_ref"); + ASSERT_EQ(store_count, store_count_ref); } \ No newline at end of file diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 89e4e4768ff60e..2eb5cddd84fb9f 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ 
b/src/common/snippets/tests/src/registers.cpp @@ -25,12 +25,14 @@ TEST(TransformationTests, AssignRegisters) { { auto p0 = std::make_shared(element::f32, Shape(1)); auto p1 = std::make_shared(element::f32, Shape(1)); + p0->set_friendly_name("p00"); + p1->set_friendly_name("p01"); auto y00 = std::make_shared(p0); y00->set_friendly_name("y00"); auto y01 = std::make_shared(p1); y01->set_friendly_name("y01"); auto y02 = std::make_shared(y00, y01); y02->set_friendly_name("y02"); - auto y03 = std::make_shared(y02); y03->set_friendly_name("y03"); - - f = std::make_shared(NodeVector{y03}, ParameterVector{p0, p1}); + auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); + s00->set_friendly_name("s00"); + f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); pass::Manager m; m.register_pass(); @@ -39,13 +41,17 @@ TEST(TransformationTests, AssignRegisters) { ASSERT_NO_THROW(check_rt_info(f)); } - // instead of comparing to a reference function check that registers are correctly assigned - // and stored to runtime info + /* Instead of comparing to a reference function check that registers are correctly assigned and stored to runtime + * info. 
Note that Parameters and Store rt_info contains gpr indexes, while general op's rt_info contain vector + * indexes */ { std::map ref_registers { + {"p00", 0}, // gpr + {"p01", 1}, // gpr {"y00", 0}, {"y01", 1}, - {"y02", 2} + {"y02", 2}, + {"s00", 2}, // gpr }; auto total_ops = 0; @@ -75,6 +81,14 @@ TEST(TransformationTests, AssignRegisters2) { auto p5 = std::make_shared(ngraph::element::f32, Shape()); auto p6 = std::make_shared(ngraph::element::f32, Shape()); auto p7 = std::make_shared(ngraph::element::f32, Shape()); + p0->set_friendly_name("p00"); + p1->set_friendly_name("p01"); + p2->set_friendly_name("p02"); + p3->set_friendly_name("p03"); + p4->set_friendly_name("p04"); + p5->set_friendly_name("p05"); + p6->set_friendly_name("p06"); + p7->set_friendly_name("p07"); auto c0 = std::make_shared(ngraph::element::f32, Shape(), 3.14f); c0->set_friendly_name("r00"); auto c1 = std::make_shared(ngraph::element::f32, Shape(), 6.6260701e-34f); c1->set_friendly_name("r01"); @@ -102,9 +116,10 @@ TEST(TransformationTests, AssignRegisters2) { auto y20 = std::make_shared(y17, y18); y20->set_friendly_name("r22"); auto y21 = std::make_shared(y15, y19); y21->set_friendly_name("r23"); auto y22 = std::make_shared(y20, y21); y22->set_friendly_name("r24"); - auto y23 = std::make_shared(y22); + auto s00 = std::make_shared(y22); + s00->set_friendly_name("s00"); - f = std::make_shared(NodeVector{y23}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); pass::Manager m; m.register_pass(); @@ -117,10 +132,14 @@ TEST(TransformationTests, AssignRegisters2) { // and stored to runtime info { std::map ref_registers { - {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, {"r06", 6}, {"r07", 6}, - {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, - {"r16", 0}, {"r17", 4}, {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, - 
{"r24", 1} + {"p00", 0}, {"p01", 1}, {"p02", 2}, {"p03", 3}, {"p04", 4}, {"p05", 5}, + {"p06", 6}, {"p07", 7}, + {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, + {"r06", 6}, {"r07", 6}, {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, + {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, {"r16", 0}, {"r17", 4}, + {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, + {"r24", 1}, + {"s00", 8}, }; auto total_ops = 0; diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index b6e5fb3b2ec6cd..c35533a0a28990 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -13,6 +13,9 @@ #include "jit_eltwise_emitters.hpp" #include "jit_dnnl_emitters.hpp" #include "jit_dnnl_ext_emitters.hpp" +#include "jit_conversion_emitters.hpp" + +#include "snippets_transformations/op/load_store_convert.hpp" #include @@ -39,25 +42,23 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ : TargetMachine(), h(new jit_snippet()), isa(host_isa) { // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); - jitters[ngraph::snippets::op::BlockedParameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); - jitters[ngraph::snippets::op::VectorLoad::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); - jitters[ngraph::snippets::op::ScalarLoad::get_type_info_static()] = CREATE_EMITTER(ScalarLoadEmitter); jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter); + jitters[ov::intel_cpu::LoadConvert::get_type_info_static()] = 
CREATE_EMITTER(LoadConvertEmitter); jitters[ngraph::snippets::op::Store::get_type_info_static()] = CREATE_EMITTER(StoreEmitter); - jitters[ngraph::snippets::op::VectorStore::get_type_info_static()] = CREATE_EMITTER(StoreEmitter); - jitters[ngraph::snippets::op::ScalarStore::get_type_info_static()] = CREATE_EMITTER(ScalarStoreEmitter); + jitters[ov::intel_cpu::StoreConvert::get_type_info_static()] = CREATE_EMITTER(StoreConvertEmitter); jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = CREATE_EMITTER(ScalarEmitter); - jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(FakeBroadcastEmitter); + jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = CREATE_EMITTER(BroadcastMoveEmitter); // jitters[ngraph::snippets::op::Nop::get_type_info_static()] = CREATE_EMITTER(NopEmitter); // Not supported // jitters[ngraph::opset1::Broadcast::get_type_info_static()] = CREATE_EMITTER(); // Not supported - // jitters[ngraph::opset1::Convert::get_type_info_static()] = CREATE_EMITTER(); // Not supported + jitters[ngraph::snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_truncation_emitter); + jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter); // jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported // binary @@ -118,6 +119,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); + jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { @@ -140,3 +142,7 @@ code ov::intel_cpu::CPUTargetMachine::get_snippet() const { 
ov::intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_) : Generator(std::make_shared(isa_)) { } + +ov::element::Type ov::intel_cpu::CPUGenerator::CPUGenerator::get_supported_exec_precision() const { + return ov::element::f32; +} diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp index 7301fcb177b93f..9b1fe3bc79935b 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.hpp @@ -28,6 +28,8 @@ class CPUTargetMachine : public ngraph::snippets::TargetMachine { class CPUGenerator : public ngraph::snippets::Generator { public: CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa); + + element::Type get_supported_exec_precision() const override; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp new file mode 100644 index 00000000000000..bbb70ee3eafdb7 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.cpp @@ -0,0 +1,313 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_conversion_emitters.hpp" +#include "utils/bfloat16.hpp" +#include +#include +#include + +using namespace InferenceEngine; +using namespace dnnl::impl::utils; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; +using namespace Xbyak; + +namespace ov { +namespace intel_cpu { + +jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + input_type = node->get_input_element_type(0); + output_type = node->get_output_element_type(0); + + if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa)); +} + +void jit_convert_emitter::validate_types() const { + 
auto is_supported_type = [this](const ov::element::Type& type) { + return any_of(supported_types.begin(), supported_types.end(), + [&type](const ov::element::Type& supported_type) { return supported_type == type; } ); + }; + + if (!is_supported_type(input_type)) + IE_THROW() << "Unsupported input type: " << input_type.get_type_name(); + if (!is_supported_type(output_type)) + IE_THROW() << "Unsupported output type: " << output_type.get_type_name(); +} + +size_t jit_convert_emitter::get_inputs_num() const { return 1; } + +void jit_convert_emitter::emit_data() const { + jit_emitter::emit_data(); + if (emu_vcvtneps2bf16) + emu_vcvtneps2bf16->emit_data(); +} + +void jit_convert_emitter::float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + Zmm zmm_src = Zmm(in_vec_idxs[0]); + Zmm zmm_dst = Zmm(out_vec_idxs[0]); + + if (mayiuse(avx512_core_bf16)) { + h->vcvtneps2bf16(zmm_dst, zmm_src); + } else { + if (!emu_vcvtneps2bf16) + IE_THROW() << "Converter from float to bf16 isn't initialized!"; + + emu_vcvtneps2bf16->emit_code({static_cast(zmm_src.getIdx())}, {static_cast(zmm_dst.getIdx())}); + } +} + +jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa, + const std::shared_ptr& node, Precision exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} + +bool jit_convert_truncation_emitter::is_i8_and_u8_case() const { + return one_of(input_type, ov::element::i8, ov::element::u8) && + one_of(output_type, ov::element::i8, ov::element::u8); +} + +void jit_convert_truncation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + validate_types(); + if (host_isa_ == cpu::x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if 
(host_isa_ == cpu::x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_convert_truncation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src = Vmm(in_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + // For Truncation behavior we can just move data from src to dst if we want convert i8 -> u8 or u8 -> i8 + if ((input_type == output_type) || is_i8_and_u8_case()) { + if (vmm_src != vmm_dst) { + h->uni_vmovups(vmm_dst, vmm_src); + } + return; + } + + switch (input_type) { + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + assert(!"unsupported output data type"); + } + + switch (output_type) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == 
ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + default: + assert(!"unsupported output data type"); + } +} + +void jit_convert_truncation_emitter::register_table_entries() { + if (host_isa_ == dnnl::impl::cpu::x64::avx2 && + one_of(output_type, ov::element::i8, ov::element::u8) && + !is_i8_and_u8_case()) + push_arg_entry_of("mask_byte", 0x000000ff, true); +} + +template +void jit_convert_truncation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src = Vmm(in_vec_idxs[0]); + + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Xmm xmm_dst = Xmm(out_vec_idxs[0]); + Ymm ymm_dst = Ymm(out_vec_idxs[0]); + + if (isa == dnnl::impl::cpu::x64::avx512_core) { + h->vpmovdb(xmm_dst, vmm_src); + } else if (isa == dnnl::impl::cpu::x64::avx2) { + h->vpand(vmm_dst, vmm_src, table_val("mask_byte")); // to avoid saturation + h->uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != dnnl::impl::cpu::x64::sse41) + h->vpermq(ymm_dst, ymm_dst, 0x08); + h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + } +} + +jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa, + const std::shared_ptr& node, Precision exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) { +} + +void jit_convert_saturation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + validate_types(); + if (host_isa_ == cpu::x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported 
isa"); + } +} + +template +void jit_convert_saturation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src = Vmm(in_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (input_type == output_type) { + h->uni_vmovups(vmm_dst, vmm_src); + return; + } + + switch (input_type) { + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvtps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + assert(!"unsupported output data type"); + } + + switch (output_type) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); + } + break; + default: + 
assert(!"unsupported output data type"); + } +} + +template +void jit_convert_saturation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const { + using Vmm = typename conditional3::type; + Vmm vmm_src = Vmm(in_vec_idxs[0]); + + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Xmm xmm_dst = Xmm(out_vec_idxs[0]); + Ymm ymm_dst = Ymm(out_vec_idxs[0]); + + if (isa == dnnl::impl::cpu::x64::avx512_core) { + if (is_signed) { + h->vpmovsdb(xmm_dst, vmm_src); + } else { + Vmm vmm_zero = Vmm(aux_vec_idxs[0]); + h->vpxord(vmm_zero, vmm_zero, vmm_zero); + h->vpmaxsd(vmm_dst, vmm_src, vmm_zero); + h->vpmovusdb(xmm_dst, vmm_dst); + } + } else { + if (is_signed) + h->uni_vpackssdw(vmm_dst, vmm_src, vmm_src); + else + h->uni_vpackusdw(vmm_dst, vmm_src, vmm_src); + + if (isa != dnnl::impl::cpu::x64::sse41) + h->vpermq(ymm_dst, ymm_dst, 0x08); + + if (is_signed) + h->uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + else + h->uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + } +} + +size_t jit_convert_saturation_emitter::aux_vecs_count() const { + // 1 register is for dword2int8 unsigned + return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core? 
1 : 0; +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp new file mode 100644 index 00000000000000..71a45f918ea595 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/jit_conversion_emitters.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "jit_emitter.hpp" +#include "jit_bf16_emitters.hpp" + +namespace ov { +namespace intel_cpu { + +class jit_convert_emitter : public jit_emitter { +public: + jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; + +protected: + void emit_data() const override; + void validate_types() const; + + void float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + ov::element::Type input_type; + ov::element::Type output_type; + + const ov::element::TypeVector supported_types = { + ov::element::f32, + ov::element::i32, + ov::element::bf16, + ov::element::i8, + ov::element::u8 + }; + + std::shared_ptr emu_vcvtneps2bf16 = nullptr; +}; + +// This emitter is covered by specification of "Convert" operation. The implementation uses a "warp-around" conversion. 
+// Example: +// int32_t -> int8_t +// 129 -> -127 +class jit_convert_truncation_emitter : public jit_convert_emitter { +public: + jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + +private: + void emit_impl(const std::vector& in, const std::vector& out, + const std::vector& pool, const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + template + void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + bool is_i8_and_u8_case() const; + void register_table_entries() override; +}; + +// This emitter is covered by the common dnnl behavior. The implementation uses a "saturation" conversion. +// Example: +// int32_t -> int8_t +// 129 -> 127 +class jit_convert_saturation_emitter : public jit_convert_emitter { +public: + jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + +private: + void emit_impl(const std::vector& in, const std::vector& out, + const std::vector& pool, const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + template + void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const; + + size_t aux_vecs_count() const override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp index 50f2674fb111b4..91079b55da46c8 100644 --- 
a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp @@ -46,6 +46,10 @@ size_t jit_emitter::aux_vecs_count() const { return 0; } +emitter_in_out_map jit_emitter::get_in_out_type() const { + return in_out_type_; +} + size_t jit_emitter::aux_gprs_count() const { // We need one gpr to load table address return entry_map_.empty() ? 0 : 1; diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp index f0f460d51713a5..74fe712ddd6f9f 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp @@ -55,6 +55,7 @@ class jit_emitter : public ngraph::snippets::Emitter { const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); virtual size_t get_inputs_num() const = 0; virtual size_t aux_vecs_count() const; + emitter_in_out_map get_in_out_type() const; static std::set get_supported_precisions(); protected: diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp index da1589aa4497d4..490f957c1efa99 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp @@ -547,8 +547,10 @@ void jit_load_emitter::register_table_entries() { /// STORE /// jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - Precision src_prc, Precision dst_prc, int store_num, Precision exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), name_("unknown") { + Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), store_num_(store_num), 
src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode), name_("unknown") { + prepare_table(); v_len_elt_ = get_vec_length() / exec_prc.size(); store_size_ = store_num * dst_prc.size(); if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) { @@ -556,9 +558,25 @@ jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, } } -// 0 for temp reg for mask store for avx512 +inline bool jit_store_emitter::is_saturation() const { + return mode_ == arithmetic_mode::saturation; +} + +// case for SSE and AVX2 when we should use AND to truncate values +inline bool jit_store_emitter::is_truncation_emulation() const { + return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && + src_prc_ != dst_prc_ && one_of(dst_prc_, Precision::U16, Precision::I16, Precision::U8, Precision::I8); +} + size_t jit_store_emitter::aux_gprs_count() const { - return get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size()); + // for temp reg for mask store + int count = get_aux_regs_for_avx512_mask(store_num_ * src_prc_.size()); + + // for table value in truncation arithmetic mode + if (is_truncation_emulation()) + count++; + + return count; } size_t jit_store_emitter::aux_vecs_count() const { @@ -580,6 +598,7 @@ size_t jit_store_emitter::aux_vecs_count() const { size_t jit_store_emitter::get_inputs_num() const { return 1; } void jit_store_emitter::emit_data() const { + jit_emitter::emit_data(); if (emu_vcvtneps2bf16_) emu_vcvtneps2bf16_->emit_data(); } @@ -618,7 +637,11 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d switch (src_prc_) { case Precision::FP32: if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16)) { - h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx)); + if (is_saturation()) { + h->uni_vcvtps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx)); + } else { + h->uni_vcvttps2dq(Vmm(aux_vec_idxs.back()), Vmm(data_idx)); + } data_idx = aux_vec_idxs.back(); } break; @@ -804,7 +827,7 @@ void 
jit_store_emitter::store_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int /** * store_dword_to_byte_extension is the utility function to -* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with singed or unsinged saturation. +* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with and without singed or unsinged saturation. * 2. store the packed byte into the memory referenced by ptr[reg + offset] address. */ template @@ -835,28 +858,37 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya }; auto store_dword_to_byte_base = [&]() { - // db only available on avx512, need dw+wb to emulate - if (is_signed) - h->uni_vpackssdw(vmm, vmm, vmm); - else - h->uni_vpackusdw(vmm, vmm, vmm); - // gather 2(cross lane) 64 bits into lower vmm to store - // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0] - if (is_ymm) { - h->vpermq(ymm, ymm, 0x08); // 00001000 - } + if (is_saturation()) { + // db only available on avx512, need dw+wb to emulate + if (is_signed) + h->uni_vpackssdw(vmm, vmm, vmm); + else + h->uni_vpackusdw(vmm, vmm, vmm); + // gather 2(cross lane) 64 bits into lower vmm to store + // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0] + if (is_ymm) { + h->vpermq(ymm, ymm, 0x08); // 00001000 + } - if (is_signed) - h->uni_vpacksswb(vmm, vmm, vmm); - else + if (is_signed) + h->uni_vpacksswb(vmm, vmm, vmm); + else + h->uni_vpackuswb(vmm, vmm, vmm); + } else { + h->vpand(vmm, vmm, table_val("mask_truncation_byte")); // to avoid saturation + h->uni_vpackssdw(vmm, vmm, vmm); + if (is_ymm) + h->vpermq(ymm, ymm, 0x08); h->uni_vpackuswb(vmm, vmm, vmm); + } store_bytes(vmm, reg, offset, store_num); }; switch (store_num) { - case 16: - // must support avx512F + case 16: + // must support avx512F + if (is_saturation()) { if (is_signed) { h->vpmovsdb(addr(0), vmm); } else { @@ -865,9 +897,13 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya h->uni_vpmaxsd(vmm, 
vmm, zero); h->vpmovusdb(addr(0), vmm); } - break; - case 8: - if (mayiuse(cpu::x64::avx512_core)) { // ymm block on avx512F + VL + } else { + h->vpmovdb(addr(0), vmm); + } + break; + case 8: + if (mayiuse(cpu::x64::avx512_core)) { + if (is_saturation()) { // ymm block on avx512F + VL if (is_signed) { h->vpmovsdb(addr(0), ymm); } else { @@ -877,11 +913,15 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya h->vpmovusdb(addr(0), ymm); } } else { - store_dword_to_byte_base(); + h->vpmovdb(addr(0), ymm); } - break; - case 4: - if (mayiuse(cpu::x64::avx512_core)) { // xmm block on avx512F + VL + } else { + store_dword_to_byte_base(); + } + break; + case 4: + if (mayiuse(cpu::x64::avx512_core)) { + if (is_saturation()) {// xmm block on avx512F + VL if (is_signed) { h->vpmovsdb(addr(0), xmm); } else { @@ -891,15 +931,19 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya h->vpmovusdb(addr(0), xmm); } } else { - store_dword_to_byte_base(); + h->vpmovdb(addr(0), xmm); } - break; - default: - if (is_zmm) { // avx512F - unsigned int mask = 1; - mask = (mask << store_num) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + } else { + store_dword_to_byte_base(); + } + break; + default: + if (is_zmm) { // avx512F + unsigned int mask = 1; + mask = (mask << store_num) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + if (is_saturation()) { if (is_signed) { h->vpmovsdb(addr(0), vmm | k_mask); } else { @@ -909,9 +953,12 @@ void jit_store_emitter::store_dword_to_byte_extension(const Vmm &vmm, const Xbya h->vpmovusdb(addr(0), vmm | k_mask); } } else { - store_dword_to_byte_base(); + h->vpmovdb(addr(0), vmm | k_mask); } - break; + } else { + store_dword_to_byte_base(); + } + break; } } @@ -946,16 +993,21 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya auto zmm = Xbyak::Zmm(vmm.getIdx()); auto 
store_dword_to_word_base = [&]() { - // direct mov_dw available only on avx512, emulate with pack_dw + permute + pure store - if (is_signed) - h->uni_vpackssdw(vmm, vmm, vmm); - else + // direct mov_dw available only on avx512 + if (is_saturation()) { // emulate with pack_dw + permute + pure store for saturation mode + if (is_signed) + h->uni_vpackssdw(vmm, vmm, vmm); + else + h->uni_vpackusdw(vmm, vmm, vmm); + // gather 2/4(cross lane) 64 bits into lower vmm to store + // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0] + // [ 128 | 128 ] |--> [ 128 | 128 ] + if (is_ymm) { + h->vpermq(ymm, ymm, 0x08); // 00001000 + } + } else { // emulate with AND + pure store for truncation mode + h->vpand(vmm, vmm, table_val("mask_truncation_word")); h->uni_vpackusdw(vmm, vmm, vmm); - // gather 2/4(cross lane) 64 bits into lower vmm to store - // [y_3 y_2 y_1 y_0] |--> [y_0 y_0 y_2 y_0] - // [ 128 | 128 ] |--> [ 128 | 128 ] - if (is_ymm) { - h->vpermq(ymm, ymm, 0x08); // 00001000 } store_bytes(vmm, reg, offset, store_num * 2); @@ -978,7 +1030,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya } } else { switch (store_num) { - case 16: + case 16: + if (is_saturation()) { if (is_signed) { h->vpmovsdw(ptr[reg + offset], vmm); // singed int32 saturate to signed int16. } else { @@ -987,9 +1040,13 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya h->uni_vpmaxsd(vmm, zero, vmm); // if singed bit is 1, set value as 0. h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. 
} - break; - case 8: - if (mayiuse(cpu::x64::avx512_core)) { + } else { + h->vpmovdw(ptr[reg + offset], vmm); + } + break; + case 8: + if (mayiuse(cpu::x64::avx512_core)) { + if (is_saturation()) { if (is_signed) { h->vpmovsdw(ptr[reg + offset], ymm); } else { @@ -999,11 +1056,15 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya h->vpmovusdw(ptr[reg + offset], ymm); } } else { - store_dword_to_word_base(); + h->vpmovdw(ptr[reg + offset], ymm); } - break; - case 4: - if (mayiuse(cpu::x64::avx512_core)) { + } else { + store_dword_to_word_base(); + } + break; + case 4: + if (mayiuse(cpu::x64::avx512_core)) { + if (is_saturation()) { if (is_signed) { h->vpmovsdw(ptr[reg + offset], xmm); } else { @@ -1013,15 +1074,19 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya h->vpmovusdw(ptr[reg + offset], xmm); } } else { - store_dword_to_word_base(); + h->vpmovdw(ptr[reg + offset], xmm); } - break; - default: - if (is_zmm) { - unsigned int mask = 1; - mask = (mask << store_num) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + } else { + store_dword_to_word_base(); + } + break; + default: + if (is_zmm) { + unsigned int mask = 1; + mask = (mask << store_num) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + if (is_saturation()) { if (is_signed) { h->vpmovsdw(ptr[reg + offset], vmm | k_mask); } else { @@ -1031,12 +1096,22 @@ void jit_store_emitter::store_dword_to_word_extension(const Vmm &vmm, const Xbya h->vpmovusdw(ptr[reg + offset], vmm | k_mask); } } else { - store_dword_to_word_base(); + h->vpmovdw(ptr[reg + offset], vmm | k_mask); } - break; + } else { + store_dword_to_word_base(); + } + break; } } } +void jit_store_emitter::register_table_entries() { + if (is_truncation_emulation()) { + push_arg_entry_of("mask_truncation_byte", 0x000000ff, true); + push_arg_entry_of("mask_truncation_word", 0x0000ffff, true); + } +} + } 
// namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp index 3784a343d3fbe2..a198eb705c3022 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp @@ -39,6 +39,12 @@ struct store_emitter_params : public emitter_params { int store_num_; }; +// Arithmetic modes for data type conversion in store_emitter +enum arithmetic_mode { + saturation, + truncation +}; + class jit_load_emitter : public jit_emitter { public: jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int load_num, @@ -101,7 +107,8 @@ class jit_load_emitter : public jit_emitter { class jit_store_emitter : public jit_emitter { public: jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int store_num, - Precision exec_prc = Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); + arithmetic_mode mode = arithmetic_mode::saturation, Precision exec_prc = Precision::FP32, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); /** * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1] @@ -143,15 +150,21 @@ class jit_store_emitter : public jit_emitter { template void store_dword_to_word_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_bf16, bool is_signed, int store_size) const; + void register_table_entries() override; + size_t aux_gprs_count() const override; size_t aux_vecs_count() const override; + inline bool is_saturation() const; + inline bool is_truncation_emulation() const; + std::string name_; int v_len_elt_; // 4/8/16 int store_num_; int store_size_; Precision 
src_prc_; Precision dst_prc_; + arithmetic_mode mode_ = arithmetic_mode::saturation; std::shared_ptr emu_vcvtneps2bf16_; }; diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp new file mode 100644 index 00000000000000..a1b3f1b0068a78 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -0,0 +1,669 @@ +// Copyright (C) 2020-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "jit_snippets_emitters.hpp" +#include "snippets_transformations/op/load_store_convert.hpp" + +using namespace Xbyak; + +namespace ov { +namespace intel_cpu { + +inline static void transform_idxs_to_regs(const std::vector& idxs, std::vector& regs) { + regs.resize(idxs.size()); + std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Reg64(static_cast(idx));}); +} + +jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, + std::set& vecs_used, std::set& gprs_used) { + if (body.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; + auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { + std::vector physical_regs(abstract_regs.size()); + for (size_t i = 0; i < abstract_regs.size(); i++) + physical_regs[i] = regs_pool.at(abstract_regs[i]); + return physical_regs; + }; + for (auto& code : body) { + const auto& emitter = code.first; + std::vector in_abstract_regs, out_abstract_regs; + std::tie(in_abstract_regs, out_abstract_regs) = code.second; + std::vector in_physical_regs, out_physical_regs; + switch 
(std::dynamic_pointer_cast(emitter)->get_in_out_type()) { + case gpr_to_gpr: + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile. + // Input registers are not mapped in this case, since they contain utility info + // (num_params, tile increment, etc.), but not reg indexes. + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); + gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case gpr_to_vec: + // Load Emitters + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); + gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case vec_to_gpr: + // Store Emitters + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); + vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case vec_to_vec: + // Regular operations + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); + vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + default: + IE_THROW() << "Unhandled in_out type"; + } + code.second = std::make_pair(in_physical_regs, out_physical_regs); + if (auto container = std::dynamic_pointer_cast(code.first)) + container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + } +} + +KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const 
std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto kernel = ov::as_type_ptr(n); + if (!kernel) + IE_THROW() << "KernelEmitter invoked with invalid op argument"; + if (kernel->region.empty()) + IE_THROW() << "KernelEmitter invoked with empty body"; + body = kernel->region; + if (!kernel->compile_params) + IE_THROW() << "KernelEmitter invoked without compile_params"; + jcp = *reinterpret_cast(kernel->compile_params); + // Initialize pools of gp and vec registers + gp_regs_pool.resize(16); + vec_regs_pool.resize(16); + std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); + std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { + // It's important to keep the order of other elements + pool.erase(std::remove_if(pool.begin(), pool.end(), + [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + }; + // Reserve stack base and pointer for push(...) and pop(...) operations + // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel + remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, + static_cast(abi_param1.getIdx()), + static_cast(abi_param2.getIdx())}); + std::set vecs_used, gprs_used; + map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); + remove_regs_from_pool(gp_regs_pool, gprs_used); + remove_regs_from_pool(vec_regs_pool, vecs_used); + // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs + gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); +} + +void KernelEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} + +void KernelEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const 
std::vector &gpr) const { + if (in.size() != 2) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (!out.empty()) + IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); +} + +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + const int64_t harness_num_dims = jcp.output_dims.size() - 1; + auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { + for (int j = 0; j < harness_num_dims; j++) { + if (jcp.output_dims[j] != 1 && offsets[j] != 0) { + h->mov(reg_tmp, offsets[j]); + h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); + h->add(pointer, reg_tmp); + } + } + }; + for (auto i = 0; i < num_params; i++) { + if (i < num_inputs) + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); + else + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then + Reg64 reg_tmp = i < num_params-1 ? 
data_ptr_regs.back() : reg_const_params; + init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + } +} +void KernelEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& allocated_vec_regs, + const std::vector& allocated_gp_regs, + const ov::intel_cpu::emitter_context *emit_context) const { + h->preamble(); + + const size_t num_inputs = in[0]; + const size_t num_outputs = in[1]; + + Reg64 reg_indexes = Reg64(abi_param1.getIdx()); + Reg64 reg_const_params = Reg64(abi_param2.getIdx()); + std::vector data_ptr_regs; + transform_idxs_to_regs(gp_regs_used, data_ptr_regs); + + init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. + // we need a more elegant approach to avoid a full copy here + auto local_gpr_pool = gp_regs_pool; + local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); + local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); + for (const auto& c : body) { + const auto& emitter = c.first; + std::vector in_regs, out_regs; + std::tie(in_regs, out_regs) = c.second; + if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) + out_regs = gp_regs_used; + emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + } + h->postamble(); +} + +TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto tile_scheduler = ov::as_type_ptr(n); + if (!tile_scheduler) + IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; + if (!tile_scheduler->compile_params) + IE_THROW() << "TileSchedulerEmitter invoked without compile_params"; + body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; + jcp = *reinterpret_cast(tile_scheduler->compile_params); +} +void 
TileSchedulerEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} +void TileSchedulerEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != 3) + IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); + if (out.size() != in[0] + in[1]) + IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size(); + if (body.size() != 2) + IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); + if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) + IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; +} + +void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, + const std::vector& vec_pool, const std::vector& gpr_pool) const { + // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times + using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; + TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; + TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; + const size_t inner_work_amount = jcp.scheduler_dims[1]; + auto process_tile = + [&](const bool evaluate_once, const TileAllocatedEmitter& tile) { + // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks + if (evaluate_once) { + tile.first->emit_body(vec_pool, gpr_pool); + } else { + std::vector in_regs, out_regs; + std::tie(in_regs, out_regs) = tile.second; 
+ // pass work_amount reg to Tile + in_regs.push_back(static_cast(reg_inner_amount.getIdx())); + for (const auto& reg : data_ptr_regs) + out_regs.emplace_back(reg.getIdx()); + tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); + } + }; + // todo: these optimizations should be performed on using Tile graph representation in the future + bool vector_evaluate_once = false; + if (inner_work_amount >= vector_size) { + vector_evaluate_once = inner_work_amount < 2 * vector_size; + // Need to set proper work amount for inner tiles if evaluated multiple times + if (!vector_evaluate_once) + h->mov(reg_inner_amount, inner_work_amount); + process_tile(vector_evaluate_once, vector_tile); + } + if (inner_work_amount % vector_size >= 1) { + bool scalar_evaluate_once = inner_work_amount % vector_size < 2; + if (!scalar_evaluate_once) { + // vector_tile is not executed, work_amount is not set + if (inner_work_amount < vector_size) { + h->mov(reg_inner_amount, inner_work_amount); + // vector_tile is executed, but work_amount is neither set nor decremented appropriately. 
+ } else if (vector_evaluate_once) { + vector_tile.first->emit_ptr_increments(data_ptr_regs); + h->mov(reg_inner_amount, inner_work_amount - vector_size); + } + // else: vector_tile is executed multiple times, so work_amount is already set + } else { + if (vector_evaluate_once) { + vector_tile.first->emit_ptr_increments(data_ptr_regs); + } + } + process_tile(scalar_evaluate_once, scalar_tile); + } +} + +void TileSchedulerEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& vec_pool, + const std::vector& gpr_pool, + const ov::intel_cpu::emitter_context *emit_context) const { + const size_t num_inputs = in[0]; + const size_t num_outputs = in[1]; + const size_t vector_size = in[2]; + const size_t num_params = num_inputs + num_outputs; + const auto& data_ptr_reg_idxs(out); + std::vector data_ptr_regs; + transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); + // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. + // we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter + auto local_gpr_pool = gpr_pool; + Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); + local_gpr_pool.pop_back(); + Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); + local_gpr_pool.pop_back(); + Label for_body; + const size_t outer_work_amount = jcp.scheduler_dims[0]; + if (outer_work_amount == 1) { + // emit code directly without looping over external dim + emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); + } else if (outer_work_amount > 1) { + // We need to create a Loop in this case + h->mov(reg_outer_amount, outer_work_amount); + h->L(for_body); + { + emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); + + // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers + // after reading/writing. 
This might be a problem if we need to read the same data multiple times (broadcasting shapes). + // To overcome this limitation, we add appropriate negative offsets if necessary. + for (auto i = 0; i < num_params; i++) { + if (jcp.scheduler_offsets[i] != 0) { + h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); + } + } + // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) + h->sub(reg_outer_amount, 1); + h->cmp(reg_outer_amount, 1); + h->jge(for_body, CodeGenerator::T_NEAR); + } + } +} + +std::vector& TileEmitter::get_nested_code() { + return body; +} + +TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto tile = ov::as_type_ptr(n); + if (!tile) + IE_THROW() << "TileEmitter invoked with invalid op argument"; + body = tile->region; + if (body.empty()) + IE_THROW() << "TileEmitter is invoked with empty body"; + num_inputs = tile->num_inputs; + num_outputs = tile->num_outputs; + io_dims = tile->io_dims; + io_data_size = tile->io_data_size; + increment = tile->increment; + if (io_dims.size() != num_inputs + num_outputs) + IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; +} + +void TileEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} + +void TileEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != 1) + IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size(); + if (out.size() != io_dims.size()) + IE_THROW() << "TileEmitter got invalid number of outputs. 
Expected " << io_dims.size() << " , got " << out.size(); +} + +void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { + for (auto& code : body) + code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); +} + +void TileEmitter::emit_ptr_increments(const std::vector& data_ptr_regs) const { + for (size_t i = 0; i < num_inputs + num_outputs; i++) { + // those with dims == 1 will be broadcasted, hence don't require increment + if (io_dims[i] != 1) + h->add(data_ptr_regs[i], increment * io_data_size[i]); + } +} + +void TileEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& vec_pool, + const std::vector& gpr_pool, + const ov::intel_cpu::emitter_context *emit_context) const { + Reg64 work_amount = Reg64(static_cast(in[0])); + std::vector data_ptr_regs; + transform_idxs_to_regs(out, data_ptr_regs); + Label for_body; + // Note that: + // * Work amount must be set by TileScheduler that executes Tiles + // * TileScheduler executes Tile only if it has to perform >= 1 iterations + h->L(for_body); + emit_body(vec_pool, gpr_pool); + emit_ptr_increments(data_ptr_regs); + h->sub(work_amount, increment); + h->cmp(work_amount, increment); + h->jge(for_body, CodeGenerator::T_NEAR); +} + +BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + if (n->get_input_shape(0).empty()) + use_broadcast = true; + else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin()) + use_broadcast = true; + else + use_broadcast = false; + + if (n->get_input_element_type(0) != n->get_output_element_type(0)) + IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: " + << n->get_input_element_type(0) << " and " << n->get_output_element_type(0); + byte_size = n->get_input_element_type(0).size(); +} + +void 
BroadcastMoveEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "BroadcastMove emitter doesn't support " << host_isa_; + } +} + +template +void BroadcastMoveEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Vmm vmm_src0 = Vmm(in[0]); + Xmm xmm_src0 = Xmm(in[0]); + Vmm vmm_dst = Vmm(out[0]); + + if (use_broadcast) { + switch (byte_size) { + case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break; + case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break; + case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break; + default: assert(!"unsupported data type"); + } + } else { + if (vmm_src0 != vmm_dst) + h->uni_vmovups(vmm_dst, vmm_src0); + } +} + +ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr(n)->cast_vector()[0]); + push_arg_entry_of("scalar", value, true); + prepare_table(); +} + +void ScalarEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Scalar emitter doesn't support " << host_isa_; + } +} + +template +void ScalarEmitter::emit_isa(const std::vector &in, 
const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Vmm vmm_dst = Vmm(out[0]); + h->uni_vbroadcastss(vmm_dst, table_val("scalar")); +} + + +MemoryEmitter::MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + src_prc = InferenceEngine::details::convertPrecision(n->get_input_element_type(0)); + dst_prc = InferenceEngine::details::convertPrecision(n->get_output_element_type(0)); +} + +StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + if (src_prc != dst_prc) + IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + + count = ov::as_type_ptr(n)->get_count(); + in_out_type_ = emitter_in_out_map::vec_to_gpr; + store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); +} + +void StoreEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Store emitter doesn't support " << host_isa_; + } +} + +template +void StoreEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + if (!store_emitter) + IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; + store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +} + +void StoreEmitter::emit_data() const { + store_emitter->emit_data(); +} + +LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* 
h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + if (src_prc != dst_prc) + IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + + count = ov::as_type_ptr(n)->get_count(); + in_out_type_ = emitter_in_out_map::gpr_to_vec; + load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); +} + +void LoadEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Load emitter doesn't support " << host_isa_; + } +} + +template +void LoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + if (!load_emitter) + IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; + load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +} + +void LoadEmitter::emit_data() const { + load_emitter->emit_data(); +} + +BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + if (src_prc != dst_prc) + IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + + in_out_type_ = emitter_in_out_map::gpr_to_vec; +} + +void BroadcastLoadEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + 
emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "BroadcastLoad emitter doesn't support " << host_isa_; + } +} + +template +void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(in[0]); + Vmm vmm_dst = Vmm(out[0]); + + // It doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, + // key point here is not to add post-increment, it might be fixed by some other approach in future + switch (src_prc.size()) { + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break; + default: assert(!"unsupported data type"); + } +} + +LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) + : MemoryEmitter(h, isa, n) { + count = ov::as_type_ptr(n)->get_count(); + in_out_type_ = emitter_in_out_map::gpr_to_vec; + load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); +} + +void LoadConvertEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "LoadConvert emitter doesn't support " << host_isa_; + } +} + +template +void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + if (!load_emitter) + IE_THROW() << "Load CPU emitter 
isn't initialized for LoadConvertEmitter!"; + load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +} + +void LoadConvertEmitter::emit_data() const { + load_emitter->emit_data(); +} + +StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + count = ov::as_type_ptr(n)->get_count(); + in_out_type_ = emitter_in_out_map::vec_to_gpr; + + const auto mode = ov::as_type_ptr(n)->get_arithmetic_mode(); + store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, mode)); +} + +void StoreConvertEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "StoreConvert emitter doesn't support " << host_isa_; + } +} + +template +void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + if (!store_emitter) + IE_THROW() << "Store CPU emitter isn't initialized for StoreConvertEmitter!"; + store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +} + +void StoreConvertEmitter::emit_data() const { + store_emitter->emit_data(); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index c078fa68003cd7..f23efe19e75dbe 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -6,15 +6,19 @@ #include #include +#include #include "jit_emitter.hpp" +#include "jit_load_store_emitters.hpp" using namespace Xbyak; 
+using ngraph::snippets::AllocatedEmitter; namespace ov { namespace intel_cpu { -#define SNIPPETS_MAX_SNIPPETS_DIMS 7 + +#define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) @@ -30,11 +34,27 @@ struct jit_snippets_compile_args { std::vector output_dims = {}; }; /// -/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets, -/// and invokes enclosed outer Tiles. Only 2d Tiles are currently supported, so the emitters should -/// be organized in the following way: -/// KernelEmitter { /* entry point */ -/// TileEmitter { /* outer tile */ +/// \brief jit_container_emitter is designed to wrap Emitters that contain other Emitters (presently KernelEmitter, +/// TileSchedulerEmitter and TileEmitter). This is needed to provide a common interface for register mapping +/// (abstract to physical) and nested code access. +/// +class jit_container_emitter: public jit_emitter { +public: + jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n); +protected: + // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools + // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the last 2 args). + void map_abstract_registers(const std::vector&, const std::vector&, + std::set&, std::set&); + std::vector body; +}; +/// +/// \brief Kernel is the only entry point to Codegen Jit compilation. Kernel performs abstract-to-physical register +/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) +/// TileSchedulerEmitter. 
In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ /// TileEmitter { /* inner vector tile */ /// ... /* All the necessary Load/Strore/elementwise emitters */ /// } @@ -43,255 +63,110 @@ struct jit_snippets_compile_args { /// } /// } /// } -/// Note that Kernel params are passed directly to the emit_code(). The vector of inputs should contain 2 arguments, the -/// output vector should be empty. Input parameters +/// Note that Kernel doesn't accept any input arguments. /// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// -// Todo: Scheduler dims and offsets are currently calculated in Subgraph node and passed to the KernelEmitter. -// However, it seems more natural to calculate all the offsets right in the Kernel op, because the calculation is -// not device-specific. It is based only on input/output dims (which we already know) and harness num dims -// (which we should pass from the plugin). It seems also better to wrap the enclosed emitters in tiles in the Kernel op -// and avoid creating empty tiles. 
-class KernelEmitter : public jit_emitter { +class KernelEmitter : public jit_container_emitter { public: KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - const auto kernel = ov::as_type_ptr(n); - if (!kernel) - IE_THROW() << "KernelEmitter invoked with invalid op argument"; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; - code = kernel->region; - jcp = *reinterpret_cast(kernel->compile_params); - } + const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} - - void emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); - } + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; private: - void validate_arguments(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. 
Expected 2, got " << in.size(); - if (out.size() != 0) - IE_THROW() << "KernelEmitter got unexpected output arguments."; - const size_t num_params = in[0] + in[1]; - if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS) - IE_THROW() << "KernelEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS << - " parameters, got " << num_params; - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) - IE_THROW() << "KernelEmitter supports harness with up to " << SNIPPETS_MAX_HARNESS_DIMS << - " dims, got " << harness_num_dims; - } + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + + jit_snippets_compile_args jcp; + std::vector gp_regs_pool; + std::vector gp_regs_used; + std::vector vec_regs_pool; +}; +/// +/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets +/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector +/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. 
+/// +/// \param in[0] The number of the node inputs +/// \param in[1] The number of the node outputs +/// \param in[2] The number of elements that fits into vector register +/// + +class TileSchedulerEmitter : public jit_container_emitter { +public: + TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n); + size_t get_inputs_num() const override {return 0;} + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; + +private: + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t num_params = num_inputs + num_outputs; - int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - - Reg64 reg_indexes { dnnl::impl::cpu::x64::abi_param_regs[0] }; - Reg64 reg_const_params { dnnl::impl::cpu::x64::abi_param_regs[1] }; - Xbyak::Reg64 reg_tmp_64 { dnnl::impl::cpu::x64::abi_not_param_reg }; - - h->preamble(); - - std::vector regs(num_params); - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { - h->mov(reg_tmp_64, offsets[j]); - h->imul(reg_tmp_64, h->ptr[reg_indexes + j * sizeof(size_t)]); - h->add(pointer, reg_tmp_64); - } - } - }; - for (auto i = 0; i < num_params; i++) { - regs[i] = Reg64(reg64_tmp_start + i); - if (i < num_inputs) - h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); - else - h->mov(regs[i], h->ptr[reg_const_params + 
GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - init_ptrs_with_offsets(regs[i], &jcp.data_offsets[i * harness_num_dims]); - } - - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, gpr); - } - - h->postamble(); - } + const ov::intel_cpu::emitter_context *emit_context) const override; + + void emit_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; jit_snippets_compile_args jcp; - std::vector, ngraph::snippets::RegInfo>> code; }; + /// /// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it calculates the total number of iterations, performs operations specified by enclosed emitters, advances iteration counters +/// it performs operations specified by enclosed emitters, advances iteration counters /// and breaks when necessary. /// /// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. -/// \param in[1] Increment of the previous Tile in current dimension. Must be 0 if this is the first Tile. -/// So previous_inc is zero for outer and vector tiles (the are the first in dim) and vlen for scalar tiles (they usually go after vector Tiles). -/// \param in[2] sum number inputs and number of outputs of the node. -/// \param in[3] dimension of the tile. Note that only 2d Tile are currently supported, so dim is 0 for outer tiles, 1 for inner tiles. -/// -// Todo: Inner and outer tiles have different semantics. For example, outer tile always has the increment == 1, and it can contain only -// tile emitters (one outer or two inner). So it seems better to create different classes for inner and outer tiles. -// Todo: Currently data pointers incremented after each read/write in Load/Store emitters, so we have to decrement them here -// if the same data needs to be read twice. 
Better to move all the pointer increments to TileEmitter and avoid the increments if necessary. -class TileEmitter : public jit_emitter { +/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. +class TileEmitter : public jit_container_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - if (!tile->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - code = tile->region; - jcp = *reinterpret_cast(tile->compile_params); - } + TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} + std::vector& get_nested_code(); + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; - void emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); - } + void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; + void emit_ptr_increments(const std::vector& data_ptr_regs) const; private: - void validate_arguments(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - if (in.size() != 4) - IE_THROW() << "TileEmitter got invalid number of inputs. 
Expected 4, got " << in.size(); - if (out.size() != 0) - IE_THROW() << "TileEmitter got unexpected output arguments."; - const size_t num_params = in[2]; - if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS) - IE_THROW() << "TileEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS << - " parameters, got " << num_params; - const size_t dim = in[3]; - if (dim >= SNIPPETS_MAX_TILE_RANK) - IE_THROW() << "TileEmitter supports tile ranks up to " << SNIPPETS_MAX_TILE_RANK << - " got " << dim; - } - + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - const size_t inc = in[0]; - const size_t previous_inc = in[1]; // increment of a previous tile in the same dim (0 if the first tile in the dim) - const size_t num_params = in[2]; - const size_t dim = in[3]; // tile dimension: 0 - outer, 1 - inner - const int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 - Reg64 amount = Reg64(reg64_tmp_start + num_params); // amount - std::array for_body; - - // If R15 is not used, reserve it for use in scalar to avoid redundant push-pop's. - // todo: Do we need explicitly check that code contains ScalarEmitter? - std::vector local_gpr = reg64_tmp_start + num_params < 15 ? 
std::vector{15} : std::vector{}; - std::vector regs(num_params); - for (auto i = 0; dim == 0 && i < num_params; i++) - regs[i] = Reg64(reg64_tmp_start + i); - // Loop processing could be simplified in some cases - if (inc > jcp.scheduler_dims[dim]) { - return; - } else if (inc == jcp.scheduler_dims[dim]) { - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, local_gpr); - } - } else { - // The previous tile has done nothing, all the work is ours - if (previous_inc == 0 || previous_inc > jcp.scheduler_dims[dim]) { - h->mov(amount, jcp.scheduler_dims[dim]); - // The previous tile has done all the work - } else if (jcp.scheduler_dims[dim] % previous_inc == 0) { - return; - }// else: the previous tile has already set a proper work amount - h->cmp(amount, inc); - h->jl(for_body[0], CodeGenerator::T_NEAR); - - h->L(for_body[1]); - { - h->push(amount); - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, local_gpr); - } - h->pop(amount); - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; dim == 0 && i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(regs[i], jcp.scheduler_offsets[i]); - } - } - h->sub(amount, inc); - h->cmp(amount, inc); - h->jge(for_body[1], CodeGenerator::T_NEAR); - } - - h->L(for_body[0]); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; - // A = <42, 17> - // B = < 1, 17> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; ptr1 += vlan; // vector/scalar load - // } - // ptr0 -= 0*dom_1; - // ptr1 -= 1*dom_1; - // } - - // broadcast by MVD is extra case - // A = <42, 17> - // B = <42, 1> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; // broadcast load - // } - // ptr0 -= 0*dom_1; - // ptr1 += sizeof(ptr1[0]); //ptr1 -= -sizeof(ptr1[0]); - // } - - // A = <42, 17, 31> - // B = < 1, 17, 31> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // for (auto m = 0; m < dom_2; m++) { // 31 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; ptr1 += vlan; // vector/scalar load - // } - // } - // ptr0 -= 0*dom_1*dom2; - // ptr1 -= 1*dom_1*dom2; - // } - jit_snippets_compile_args jcp; - std::vector, ngraph::snippets::RegInfo>> code; + size_t num_inputs = 0; + size_t num_outputs = 0; + std::vector io_dims {}; + std::vector io_data_size {}; + size_t increment = 0; }; class NopEmitter : public jit_emitter { @@ -311,17 +186,10 @@ class NopEmitter : public jit_emitter { } }; -class FakeBroadcastEmitter : public jit_emitter { +class BroadcastMoveEmitter : public jit_emitter { public: - FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - if (n->get_input_shape(0).empty()) - 
use_broadcast = true; - else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin()) - use_broadcast = true; - else - use_broadcast = false; - } + BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + size_t get_inputs_num() const override {return 1;} private: @@ -329,45 +197,19 @@ class FakeBroadcastEmitter : public jit_emitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_src0 = Vmm(in[0]); - Vmm vmm_dst = Vmm(out[0]); - - if (use_broadcast) { - h->uni_vbroadcastss(vmm_dst, Xmm(in[0])); - } else { - h->uni_vmovups(vmm_dst, vmm_src0); - } - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: bool use_broadcast; + size_t byte_size = 0lu; }; class ScalarEmitter : public jit_emitter { public: - ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr(n)->cast_vector()[0]); - push_arg_entry_of("scalar", value, true); - prepare_table(); - } + ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} @@ -379,26 +221,10 @@ class ScalarEmitter : public jit_emitter { const std::vector& 
out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_dst = Vmm(out[0]); - h->uni_vbroadcastss(vmm_dst, table_val("scalar")); - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: int32_t value; @@ -415,33 +241,16 @@ class ScalarEmitter : public jit_emitter { /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. 
class MemoryEmitter : public jit_emitter { public: - MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n), ea(getEA(n)) { - } - - size_t get_inputs_num() const override {return 1;} + MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); protected: - static auto getEA(const std::shared_ptr& n) -> size_t { - auto& rt = n->get_rt_info(); - size_t ea = 0; - auto it = rt.find("effectiveAddress"); - if (it != rt.end()) { - ea = it->second.as(); - } else { - throw ov::Exception("effective address for Load generation cannot be determined"); - } - return ea; - } - - size_t ea; + Precision src_prc; + Precision dst_prc; }; class StoreEmitter : public MemoryEmitter { public: - StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } + StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 1;} @@ -450,72 +259,42 @@ class StoreEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 out_reg(ea); - Vmm vmm_src0 = Vmm(in[0]); - h->uni_vmovups(h->ptr[out_reg], vmm_src0); - h->add(out_reg, 
dnnl::impl::cpu::x64::cpu_isa_traits::vlen); - } + void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_data() const override; + +private: + size_t count; + std::unique_ptr store_emitter = nullptr; }; -class ScalarStoreEmitter : public MemoryEmitter { +class LoadEmitter : public MemoryEmitter { public: - ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } + LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override {return 0;} private: void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 out_reg(ea); - Xmm vmm_src0 = Xmm(in[0]); - h->uni_vmovss(h->ptr[out_reg], vmm_src0); - h->add(out_reg, sizeof(float)); - } + void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_data() const override; + +private: + size_t count; + std::unique_ptr load_emitter = nullptr; }; -class LoadEmitter : public MemoryEmitter { +class BroadcastLoadEmitter : public MemoryEmitter { public: - LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n), 
shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { - } + BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} @@ -524,115 +303,54 @@ class LoadEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Vmm vmm_src0 = Vmm(out[0]); - h->uni_vmovups(vmm_src0, h->ptr[in_reg]); - - if (shouldPostIncrement) { - h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits::vlen); - } - } - -private: - bool shouldPostIncrement; + void emit_isa(const std::vector &in, const std::vector &out) const; }; -class BroadcastLoadEmitter : public MemoryEmitter { +class LoadConvertEmitter : public MemoryEmitter { public: - BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } + LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + size_t get_inputs_num() const override {return 0;} private: void emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); 
- } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Vmm vmm_src0 = Vmm(out[0]); - - // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future - h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]); - } + void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_data() const override; + +private: + size_t count; + std::unique_ptr load_emitter = nullptr; }; -class ScalarLoadEmitter : public MemoryEmitter { +class StoreConvertEmitter : public MemoryEmitter { public: - ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { - } - size_t get_inputs_num() const override {return 0;} + StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} private: void emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) 
{ - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Xmm vmm_src0 = Xmm(out[0]); - h->uni_vmovss(vmm_src0, h->ptr[in_reg]); - - // Doesn't work if the same pointer comes with multiple load operations - if (shouldPostIncrement) { - h->add(in_reg, sizeof(float)); - } - } + void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_data() const override; private: - bool shouldPostIncrement; + size_t count; + std::unique_ptr store_emitter = nullptr; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 59b7a838ed9fd6..3a846baa390936 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -7,6 +7,7 @@ #include "ngraph_transformations/op/leaky_relu.hpp" #include "ngraph_transformations/op/power_static.hpp" #include "ngraph_transformations/op/swish_cpu.hpp" +#include "snippets_transformations/op/load_store_convert.hpp" #include #include @@ -40,6 +41,8 @@ std::map Extension::getOpSets() { NGRAPH_OP(LeakyReluNode, ov::intel_cpu) NGRAPH_OP(PowerStaticNode, ov::intel_cpu) NGRAPH_OP(SwishNode, ov::intel_cpu) + NGRAPH_OP(LoadConvert, ov::intel_cpu) + NGRAPH_OP(StoreConvert, ov::intel_cpu) #undef NGRAPH_OP return opset; diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 577578914de627..0768dbb34d0821 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ 
-180,6 +180,39 @@ bool isSuitableMatMulParent(const std::shared_ptr &node) { const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); return is_suitable_node && has_only_child; } +// Subtract as ZeroPoints for Convolution +bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr &node) { + const bool is_suitable_node = ov::is_type(node); + const auto out = node->outputs(); + const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); + const bool has_two_parents = node->get_input_size() == 2; + if (!(is_suitable_node && has_only_child && has_two_parents)) + return false; + + const auto child = node->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + const bool is_conv = ov::is_type(child); + const bool is_group_conv = ov::is_type(child); + if (!is_conv && !is_group_conv) + return false; + const auto weight_shape = child->get_input_shape(1); + const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1; + const bool deptwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5); + if (!(is_conv && deptwise_is_suitable)) + return false; + + const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8; + const auto zp_weights = node->get_input_node_shared_ptr(1); + const auto zp_weight_shape = zp_weights->get_output_shape(0); + bool second_input_is_suitable = + ov::is_type(zp_weights) && + zp_weights->get_output_element_type(0) == ov::element::u8 && + zp_weight_shape.size() >= 2; + if (!(first_input_is_suitable && second_input_is_suitable)) + return false; + auto correct_shape = ov::Shape(zp_weight_shape.size(), 1); + correct_shape[1] = zp_weight_shape[1]; + return correct_shape == zp_weight_shape; +} bool isSuitablePoolChild(const std::shared_ptr &node) { const bool is_suitable_node = ov::is_type(node); // has a single output, connected to a single child @@ -225,15 
+258,40 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, Nod // FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation // Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to // eliminate getNumNonConstInputs() check - int fusingAxis; - if (can_be_converted_to_FC) - fusingAxis = matmul_shape.size() == 3 ? 2 : 1; - else - fusingAxis = matmul_shape.size() - 1; + int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1; + if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) { updatedChainType = NodeFusingType::FusedWithMisc; return true; } + + // canFuse() from MatMul for case with rank > 2 + // Algorithm::EltwisePowerStatic is ignored + if (!can_be_converted_to_FC && + node->get_output_shape(0).size() > 2) { + if (ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node)) { + const auto const1 = ov::is_type(node->get_input_node_shared_ptr(0)); + const auto const2 = ov::is_type(node->get_input_node_shared_ptr(1)); + int constPort = -1; + if (const2) { + constPort = 1; + } else if (const1) { + constPort = 0; + } + + if (constPort != -1) { + auto const_shape = node->get_input_shape(constPort); + if (ov::shape_size(const_shape) != 1) { + return false; + } + } + } + } + // FullyConnectedBiasFusion if (!(can_be_converted_to_FC && ov::is_type(node) && bias_shape.back() == matmul_shape.back() && @@ -340,6 +398,9 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { } else if (isSuitableMatMulParent(node)) { SetNodeFusingType(node, NodeFusingType::FusedWithMatMul); continue; + } else if (isSuitableSubtractAsZeroPointsParent(node)) { + SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin); + continue; } for (const auto fusingChainType : getContinuableChains(node)) { if (isSuitableChildForFusingSimple(node, channelAxis)) { diff --git 
a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index a95e3d6634fe82..b16281417ba9d9 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "snippets_transformations/fuse_load_store_and_convert.hpp" using namespace InferenceEngine; using namespace dnnl::impl::utils; @@ -60,7 +61,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const Precision supportedPrecision = Precision::FP32; + const std::set supportedPrecisions = { Precision::FP32, Precision::I32, Precision::BF16, Precision::I8, Precision::U8 }; bool dimRanksAreEqual = true; for (size_t i = 0; dimRanksAreEqual && i < inputShapes.size(); i++) { @@ -125,18 +126,29 @@ void Snippet::initSupportedPrimitiveDescriptors() { config.dynBatchSupport = false; config.inConfs.resize(inputShapes.size()); for (size_t i = 0; i < inputShapes.size(); i++) { + auto precision = getOriginalInputPrecisionAtPort(i); + if (supportedPrecisions.count(precision) == 0) + IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision."; + + const auto equalPrecisions = getOriginalOutputPrecisions().size() == 1 && + precision == getOriginalOutputPrecisionAtPort(0); + BlockedMemoryDesc::CmpMask inputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; PortConfig portConfig; - portConfig.inPlace((!i && canBeInPlace()) ? 0 : -1); + portConfig.inPlace((!i && canBeInPlace() && equalPrecisions) ? 
0 : -1); portConfig.constant(false); if (inputShapes[i].getDims()[0] == 1) { inputMask.reset(0); // accepts any stride on batch axis } - portConfig.setMemDesc(createMemoryDesc(inputShapes[i], supportedPrecision, offset), inputMask); + portConfig.setMemDesc(createMemoryDesc(inputShapes[i], precision, offset), inputMask); config.inConfs[i] = portConfig; } config.outConfs.resize(outputShapes.size()); for (size_t i = 0; i < outputShapes.size(); i++) { + auto precision = getOriginalOutputPrecisionAtPort(i); + if (supportedPrecisions.count(precision) == 0) + IE_THROW() << "Subgraph node with name `" << getName() << "` doesn't support " << precision << " precision."; + BlockedMemoryDesc::CmpMask outputMask = BLOCKED_DESC_SKIP_OFFSET_MASK; PortConfig portConfig; portConfig.inPlace(-1); @@ -144,7 +156,7 @@ void Snippet::initSupportedPrimitiveDescriptors() { if (outputShapes[i].getDims()[0] == 1) { outputMask.reset(0); // accepts any stride on batch axis } - portConfig.setMemDesc(createMemoryDesc(outputShapes[i], supportedPrecision, offset), outputMask); + portConfig.setMemDesc(createMemoryDesc(outputShapes[i], precision, offset), outputMask); config.outConfs[i] = portConfig; } @@ -203,11 +215,27 @@ bool Snippet::created() const { return getType() == Type::Subgraph; } +InferenceEngine::Precision Snippet::getRuntimePrecision() const { + std::vector inputPrecisions; + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto parentEdge = getParentEdgeAt(i); + if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) { + inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToIEPrecision((parentEdge->getMemoryPtr()->GetDataType()))); + } + } + + return getMaxPrecision(inputPrecisions); +} + bool Snippet::canBeInPlace() const { if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { return false; } + if (getChildEdges().size() != 1) { + return false; + } + for (auto& parentEdge : getParentEdges()) { 
auto parent = parentEdge.lock()->getParent(); if (parent->getChildEdges().size() != 1) @@ -271,7 +299,10 @@ void Snippet::define_schedule() { ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; for (size_t i = 0; i < outputShapes.size(); i++) output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0])); - exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); + + const auto supported_exec_type = snippet->get_generator()->get_supported_exec_precision(); + exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes, supported_exec_type); + // initialize by maximum output dimension. Dimensions of outputs should be broadcastable tensorRank = std::max(static_cast(rank6D), exec_domain.size()); // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank @@ -287,8 +318,7 @@ void Snippet::define_schedule() { } const auto config = getSelectedPrimitiveDescriptor()->getConfig(); - const auto dataSize = config.inConfs[0].getMemDesc()->getPrecision().size(); - auto initOffsets = [this, config, dataSize]() { + auto initOffsets = [this, config]() { // find max rank input among all outputs const size_t inputNum = getParentEdges().size(); offsets_in.resize(inputNum); @@ -296,7 +326,7 @@ void Snippet::define_schedule() { offsets_in[i].resize(tensorRank, 1); offset_calculation(offsets_in[i], dims_in[i], exec_domain); for (size_t j = 0; j < tensorRank; j++) { - offsets_in[i][j] *= dataSize; + offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size(); } } @@ -305,7 +335,8 @@ void Snippet::define_schedule() { for (size_t i = 0; i < inputNum; i++) { const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); srcMemPtrs[i] = memPtr; - start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize; + start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * + config.inConfs[i].getMemDesc()->getPrecision().size(); } const 
size_t outputNum = config.outConfs.size(); @@ -314,7 +345,7 @@ void Snippet::define_schedule() { offsets_out[i].resize(tensorRank, 1); offset_calculation(offsets_out[i], dims_out[i], exec_domain); for (size_t j = 0; j < tensorRank; j++) { - offsets_out[i][j] *= dataSize; + offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size(); } } @@ -323,7 +354,8 @@ void Snippet::define_schedule() { for (size_t i = 0; i < outputNum; i++) { const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); dstMemPtrs[i] = memPtr; - start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize; + start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * + config.outConfs[i].getMemDesc()->getPrecision().size(); } }; @@ -373,7 +405,7 @@ void Snippet::define_schedule() { return collapsedDims; }; - auto initSchedulingInfo = [this, dataSize]() -> void { + auto initSchedulingInfo = [this, config]() -> void { // initialize scheduling information sch_offsets_in.resize(offsets_in.size(), 0); sch_offsets_out.resize(offsets_out.size(), 0); @@ -385,19 +417,38 @@ void Snippet::define_schedule() { schedulerWorkAmount /= exec_domain[tensorRank - 2]; exec_domain[tensorRank - 2] = 1; - // update offsets for tile 2D because loaders have ptr shifts in some cases and stores have always ptrs shifts + // update offsets for tile 2D because loaders and stores have ptr shifts in some cases + const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes(); for (size_t i = 0; i < offsets_in.size(); i++) { - int64_t offset = offsets_in[i][tensorRank - 2]; - if ((offset > dataSize) || (offset == 0 && dims_in[i].back() != 1)) { - sch_offsets_in[i] = offset - exec_domain.back() * dataSize; - } else if (offset == dataSize) { + const int64_t offset = offsets_in[i][tensorRank - 2]; + const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size(); + if (offset == data_size || offset == vector_size * data_size) { sch_offsets_in[i] = 
offset; + } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) { + sch_offsets_in[i] = offset - exec_domain.back() * data_size; + + // If scalar tile executes one time, ptr doesn't move on 1 value + // so we should absolutelly decrease offset + if (exec_domain.back() % vector_size == 1) { + sch_offsets_in[i] += data_size; + } } } for (size_t i = 0; i < offsets_out.size(); i++) { - int64_t offset = offsets_out[i][tensorRank - 2]; - sch_offsets_out[i] = offset - exec_domain.back() * dataSize; + const int64_t offset = offsets_out[i][tensorRank - 2]; + const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size(); + if (offset == data_size || offset == vector_size * data_size) { + sch_offsets_out[i] = offset; + } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) { + sch_offsets_out[i] = offset - exec_domain.back() * data_size; + + // If scalar tile executes one time, ptr doesn't move on 1 value + // so we should absolutelly decrease offset + if (exec_domain.back() % vector_size == 1) { + sch_offsets_out[i] += data_size; + } + } } } }; @@ -434,7 +485,28 @@ void Snippet::generate() { auto b = offsets_out[i].begin(); std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]); } - schedule = snippet->generate(reinterpret_cast(&jcp)); + + ov::pass::Manager optManager; + optManager.register_pass(); + optManager.register_pass(); + + // LoadConvert uses Load emitter that support conversion from any type to only f32 + optManager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + if (const auto& convert = std::dynamic_pointer_cast(n)) + return convert->get_destination_type() != ov::element::f32; + return true; + }); + + // StoreConvert uses Store emitter that support conversion from only f32 to any types + optManager.get_pass_config()->set_callback( + [](const std::shared_ptr& 
n) -> bool { + if (const auto& convert = std::dynamic_pointer_cast(n)) + return convert->get_input_element_type(0) != ov::element::f32; + return true; + }); + + schedule = snippet->generate(optManager, reinterpret_cast(&jcp)); } void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index f92b167209e451..fad68e1287dd27 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -30,6 +30,7 @@ class Snippet : public Node { void getSupportedDescriptors() override {}; void initSupportedPrimitiveDescriptors() override; void selectOptimalPrimitiveDescriptor() override; + InferenceEngine::Precision getRuntimePrecision() const override; // Here we convert to canonical for & jit everything void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 945ed2dacfde93..9dfdee5e0a91a6 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -599,7 +599,6 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr postLPTPassManager.register_pass(); postLPTPassManager.run_passes(nGraphFunc); - if (!useLpt && _enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) { ngraph::pass::Manager tokenization_manager; tokenization_manager.register_pass(); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp new file mode 100644 index 00000000000000..397d5f2ce391d0 --- /dev/null +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -0,0 +1,121 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "fuse_load_store_and_convert.hpp" +#include 
"snippets/snippets_isa.hpp" + +#include "snippets_transformations/op/load_store_convert.hpp" + + +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/rt_info.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" + +ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { + MATCHER_SCOPE(FuseLoadConvert); + auto param_pattern = ngraph::pattern::wrap_type(); + auto load_pattern = ngraph::pattern::wrap_type({param_pattern}); + auto convert_pattern = ngraph::pattern::wrap_type({load_pattern}); + + auto callback = [=](ngraph::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseLoadConvert") + auto& pm = m.get_pattern_value_map(); + const auto param = pm.at(param_pattern).get_node_shared_ptr(); + const auto load_shared = pm.at(load_pattern).get_node_shared_ptr(); + if (!load_shared || load_shared->output(0).get_target_inputs().size() != 1) { + return false; + } + + const auto load = std::dynamic_pointer_cast(load_shared); + if (!load) + return false; + + const auto convert = pm.at(convert_pattern).get_node_shared_ptr(); + if (transformation_callback(convert)) + return false; + + std::shared_ptr load_convert = nullptr; + if (const auto convert_saturation = + std::dynamic_pointer_cast(convert)) { + load_convert = std::make_shared(param, + convert_saturation->get_destination_type(), + arithmetic_mode::saturation, + load->get_count()); + } else if (const auto convert_truncation = + std::dynamic_pointer_cast(convert)) { + load_convert = std::make_shared(param, + convert_truncation->get_destination_type(), + arithmetic_mode::truncation, + load->get_count()); + } else { + throw ngraph::ngraph_error( + "Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); + } + + if (!load_convert) + return false; + + ngraph::copy_runtime_info(convert, load_convert); + ngraph::replace_node(convert, load_convert); + + return true; + }; + + auto m = std::make_shared(convert_pattern, matcher_name); + register_matcher(m, callback); +} + + +ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { + MATCHER_SCOPE(FuseStoreConvert); + auto input_pattern = ngraph::pattern::any_input(); + auto convert_pattern = ngraph::pattern::wrap_type({input_pattern}); + auto store_pattern = ngraph::pattern::wrap_type({convert_pattern}); + + auto callback = [=](ngraph::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::FuseStoreConvert") + auto& pm = m.get_pattern_value_map(); + const auto input = pm.at(input_pattern).get_node_shared_ptr(); + + const auto store = std::dynamic_pointer_cast(pm.at(store_pattern).get_node_shared_ptr()); + if (!store) + return false; + + const auto convert = pm.at(convert_pattern).get_node_shared_ptr(); + if (convert->output(0).get_target_inputs().size() != 1 || transformation_callback(convert)) + return false; + + std::shared_ptr store_convert = nullptr; + if (const auto convert_saturation = + std::dynamic_pointer_cast(convert)) { + store_convert = std::make_shared(input, + convert_saturation->get_destination_type(), + arithmetic_mode::saturation, + store->get_count()); + } else if (const auto convert_truncation = + std::dynamic_pointer_cast(convert)) { + store_convert = std::make_shared(input, + convert_truncation->get_destination_type(), + arithmetic_mode::truncation, + store->get_count()); + } else { + throw ngraph::ngraph_error( + "Type of Convert op is undefined. 
Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); + } + + + if (!store_convert) + return false; + + ngraph::copy_runtime_info(store, store_convert); + ngraph::replace_node(store, store_convert); + + return true; + }; + + auto m = std::make_shared(store_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp new file mode 100644 index 00000000000000..fcadb235e62794 --- /dev/null +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pattern/matcher.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +/** + * @interface FuseLoadConvert + * @brief Fuse Load and ConvertSaturation/ConvertTruncation into one op LoadConvert with the corresponding mode + * @ingroup snippets + */ +class FuseLoadConvert: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("FuseLoadConvert", "0"); + FuseLoadConvert(); +}; + +/** + * @interface FuseStoreConvert + * @brief Fuse Store and ConvertSaturation/ConvertTruncation into one op StoreConvert with the corresponding mode + * @ingroup snippets + */ +class FuseStoreConvert: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("FuseStoreConvert", "0"); + FuseStoreConvert(); +}; + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp new file mode 100644 index 00000000000000..49d37d1b803854 --- /dev/null +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.cpp @@ -0,0 +1,56 @@ +// Copyright 
(C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "load_store_convert.hpp" + +#include "ngraph/runtime/host_tensor.hpp" + +using namespace std; +using namespace ov; + +intel_cpu::LoadConvert::LoadConvert(const Output& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count) : + Load(x, count), m_destination_type(destination_type), m_mode(mode) { + constructor_validate_and_infer_types(); +} + +bool intel_cpu::LoadConvert::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(LoadConvert_visit_attributes); + visitor.on_attribute("destination_type", m_destination_type); + return true; +} + +void intel_cpu::LoadConvert::validate_and_infer_types() { + INTERNAL_OP_SCOPE(LoadConvert_validate_and_infer_types); + set_output_type(0, m_destination_type, get_input_partial_shape(0)); +} + +std::shared_ptr intel_cpu::LoadConvert::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_destination_type, m_mode, m_count); +} + +intel_cpu::StoreConvert::StoreConvert(const Output& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count) : + Store(x, count), m_destination_type(destination_type), m_mode(mode) { + constructor_validate_and_infer_types(); +} + +bool intel_cpu::StoreConvert::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(StoreConvert_visit_attributes); + visitor.on_attribute("destination_type", m_destination_type); + return true; +} + +void intel_cpu::StoreConvert::validate_and_infer_types() { + INTERNAL_OP_SCOPE(StoreConvert_validate_and_infer_types); + set_output_type(0, m_destination_type, get_input_partial_shape(0)); +} + +std::shared_ptr intel_cpu::StoreConvert::clone_with_new_inputs(const OutputVector& new_args) const { + 
INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_destination_type, m_mode, m_count); +} diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp new file mode 100644 index 00000000000000..7568003f2c627a --- /dev/null +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_store_convert.hpp @@ -0,0 +1,76 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/op/load.hpp" + +#include "emitters/jit_load_store_emitters.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * @interface LoadConvert + * @brief Fused operation to represent computations equal to consecutive Load and Convert operations. + * The operation is used for peephole optimization during subgraph lowering. + * @ingroup snippets + */ + +class LoadConvert : public ngraph::snippets::op::Load { +public: + OPENVINO_OP("LoadConvert", "SnippetsOpset", ngraph::snippets::op::Load); + + LoadConvert(const Output& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count = 1lu); + LoadConvert() = default; + + ov::element::Type get_destination_type() const { return m_destination_type; } + arithmetic_mode get_arithmetic_mode() const { return m_mode; } + + bool visit_attributes(AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool has_evaluate() const override { return false; } + +protected: + arithmetic_mode m_mode; + ov::element::Type m_destination_type; +}; + +/** + * @interface StoreConvert + * @brief Fused operation to represent computations equal to consecutive Store and Convert operations. 
+ * The operation is used for peephole optimization during subgraph lowering. + * @ingroup snippets + */ +class StoreConvert : public ngraph::snippets::op::Store { +public: + OPENVINO_OP("StoreConvert", "SnippetsOpset", ngraph::snippets::op::Store); + + StoreConvert(const Output& x, const ov::element::Type& destination_type, arithmetic_mode mode, const size_t count = 1lu); + StoreConvert() = default; + + ov::element::Type get_destination_type() const { return m_destination_type; } + arithmetic_mode get_arithmetic_mode() const { return m_mode; } + + bool visit_attributes(AttributeVisitor& visitor) override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + void validate_and_infer_types() override; + + bool has_evaluate() const override { return false; } + +protected: + arithmetic_mode m_mode; + ov::element::Type m_destination_type; +}; + + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index 56ba1a51c2b651..d7bc5d0de7e12e 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -12,23 +12,31 @@ namespace snippets { namespace { - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(1), // one node - Add - ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - Add::getTestCaseName); - - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(3), // Add + 2 converts after 
inputs - ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - AddSinh::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(ov::Shape {1, 42, 16, 1}), + ::testing::Values(1), // one node - Add + ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Add::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(ov::Shape {1, 42, 16, 1}), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinh::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(2), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinhConst::getTestCaseName); } // namespace } // namespace snippets diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp new file mode 100644 index 00000000000000..5c074239077886 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp @@ -0,0 +1,162 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/convert.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const 
std::vector, std::vector>> types_Convert = { + { { ov::element::f32 }, { ov::element::i32 } }, + { { ov::element::f32 }, { ov::element::bf16 } }, + { { ov::element::f32 }, { ov::element::u8 } }, + { { ov::element::f32 }, { ov::element::i8 } }, + + { { ov::element::bf16 }, { ov::element::f32 } }, + { { ov::element::bf16 }, { ov::element::i32 } }, + { { ov::element::bf16 }, { ov::element::i8 } }, + { { ov::element::bf16 }, { ov::element::u8 } }, + + { { ov::element::i8 }, { ov::element::f32 } }, + { { ov::element::i8 }, { ov::element::i32 } }, + { { ov::element::i8 }, { ov::element::bf16 } }, + { { ov::element::i8 }, { ov::element::u8 } }, + + { { ov::element::u8 }, { ov::element::f32 } }, + { { ov::element::u8 }, { ov::element::i32 } }, + { { ov::element::u8 }, { ov::element::bf16 } }, + { { ov::element::u8 }, { ov::element::i8 } }, +}; + +const std::vector> inputShapes_Convert = { + { ov::Shape{2, 16} }, + { ov::Shape{5, 5} }, + { ov::Shape{2, 12, 1} } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_Convert), + ::testing::ValuesIn(types_Convert), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +const std::vector, std::vector>> types_ConvertInput = { + { { ov::element::f32 }, { ov::element::i32 } }, + { { ov::element::f32 }, { ov::element::bf16 } }, + + { { ov::element::bf16 }, { ov::element::f32 } }, + + { { ov::element::i8 }, { ov::element::f32 } }, + { { ov::element::i8 }, { ov::element::i32 } }, + { { ov::element::i8 }, { ov::element::bf16 } }, + + { { ov::element::u8 }, { ov::element::f32 } }, + { { ov::element::u8 }, { ov::element::i32 } }, + { { ov::element::u8 }, { ov::element::bf16 } }, +}; + +const std::vector> inputShapes_ConvertInput = { + { ov::Shape{2, 16}, ov::Shape{1, 16} }, + { ov::Shape{5, 18}, ov::Shape{5, 1} }, + { ov::Shape{3, 1}, ov::Shape{3, 21} } +}; + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_ConvertInput), + ::testing::ValuesIn(types_ConvertInput), + ::testing::Values(3), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertOutput, ConvertOutput, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_ConvertInput), + ::testing::ValuesIn(types_ConvertInput), + ::testing::Values(3), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertStub, ConvertStub, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_ConvertInput), + ::testing::ValuesIn(types_ConvertInput), + ::testing::Values(4), + ::testing::Values(2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +const std::vector, std::vector>> types_ConvertPartialInputsAndResults = { + { { ov::element::i8, ov::element::i32, ov::element::f32 }, { ov::element::f32, ov::element::i8 } }, + { { ov::element::bf16, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::bf16 } }, +}; + +const std::vector> inputShapes_ConvertPartialInputsAndResults = { + { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} }, + { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} }, + { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_ConvertPartialInputsAndResults), + ::testing::ValuesIn(types_ConvertPartialInputsAndResults), + ::testing::Values(6), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +const std::vector, std::vector>> types_ConvertMany = { + { { ov::element::i32, ov::element::u8}, {} }, + { { ov::element::i32, 
ov::element::u8, ov::element::i32 }, {} }, + { { ov::element::i32, ov::element::f32, ov::element::i32, ov::element::i8 }, {} }, + { { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 }, {} }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs, + ::testing::Combine( + ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::ValuesIn(types_ConvertMany), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs, + ::testing::Combine( + ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::ValuesIn(types_ConvertMany), + ::testing::Values(5), // sinh + subgraph + reorders for sinh + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +const std::vector, std::vector>> types_ConvertManyIO = { + { { ov::element::i32, ov::element::u8}, {ov::element::i32} }, + { { ov::element::i32, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 } }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputOutput, ConvertManyOnInputOutput, + ::testing::Combine( + ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::ValuesIn(types_ConvertManyIO), + ::testing::Values(5), // sinh + subgraph + reorders for sinh + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Convert::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp new file mode 100644 index 00000000000000..934a243773a7e8 --- /dev/null +++ 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/eltwise_two_results.cpp @@ -0,0 +1,25 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/eltwise_two_results.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, EltwiseTwoResults, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 64, 10, 10}), + ::testing::Values(ov::Shape {1, 64, 10, 1}), + ::testing::Values(4), + ::testing::Values(2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseTwoResults::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp new file mode 100644 index 00000000000000..20c01c02be8fd3 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/max_num_params_eltwise.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/max_num_params_eltwise.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { +namespace { +// Note that we need these shapes to cover all cases of code emission (none/one/multiple of scalar/vector tiles) +std::vector input_shapes {{1, 64, 10, 10}, {1, 1, 17, 37}, {1, 1, 1, 1}, {1, 1, 1, 7}, + {1, 1, 1, 128}, {1, 1, 1, 14}, {1, 1, 1, 16}, {1, 1, 1, 30}}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, MaxNumParamsEltwiseSinh, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::Values(12), // 10 Sinh after inputs + Subgraph + Concat + ::testing::Values(1), + 
::testing::Values(CommonTestUtils::DEVICE_CPU)), + MaxNumParamsEltwiseSinh::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp index c0c833268898fb..779db741cd258b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/three_inputs_eltwise.cpp @@ -10,25 +10,25 @@ namespace test { namespace snippets { namespace { - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 64, 10, 10}), - ::testing::Values(ov::Shape {1, 64, 10, 1}), - ::testing::Values(ov::Shape {1, 1, 1, 10}), - ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes - ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ThreeInputsEltwise::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 64, 10, 10}), + ::testing::Values(ov::Shape {1, 64, 10, 1}), + ::testing::Values(ov::Shape {1, 1, 1, 10}), + ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes + ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ThreeInputsEltwise::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 64, 10, 10}), - ::testing::Values(ov::Shape {1, 64, 10, 1}), - ::testing::Values(ov::Shape {1, 1, 1, 10}), - ::testing::Values(4), // 
Subgraph + 3 converts after inputs - ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ThreeInputsEltwiseSinh::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 64, 10, 10}), + ::testing::Values(ov::Shape {1, 64, 10, 1}), + ::testing::Values(ov::Shape {1, 1, 1, 10}), + ::testing::Values(4), // Subgraph + 3 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ThreeInputsEltwiseSinh::getTestCaseName); } // namespace } // namespace snippets diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp new file mode 100644 index 00000000000000..fa182cf548a937 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/two_inputs_and_outputs.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { +namespace { + +const std::vector> input_shapes = { + { {5, 5, 256, 1}, {5, 5, 256, 1} }, + { {5, 5, 16, 35}, {5, 5, 16, 35} }, + { {5, 5, 256, 1}, {5, 5, 256, 35} }, + { {5, 5, 256, 1}, {5, 5, 1, 1} }, + + { {5, 5, 16, 35}, {5, 5, 1, 1} }, + { {5, 5, 16, 35}, {5, 5, 16, 1} }, + { {5, 5, 5, 35}, {5, 5, 1, 35} }, + { {5, 5, 16, 1}, {5, 5, 1, 35} }, + + { {5, 5, 35, 16}, {5, 5, 35, 16} }, + { {5, 5, 35, 16}, {5, 5, 1, 16} }, + + { {5, 5, 35, 17}, {5, 5, 35, 17} }, + { {5, 5, 35, 17}, {5, 5, 1, 17} }, + + { {5, 5, 35, 18}, {5, 5, 35, 18} }, + { {5, 5, 35, 18}, {5, 5, 1, 18} }, +}; + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, TwoInputsAndOutputs, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::Values(4), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TwoInputsAndOutputs::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp index 603e78dde07818..4e051b2e715491 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/eltwise.cpp @@ -731,4 +731,4 @@ const auto params_5D_dyn_param = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName); } // namespace -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 2a749c577f8a84..f9e363fc1ff471 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 2a749c577f8a841a396d4bd46eaf311b7e7dc089 +Subproject commit f9e363fc1ff47191c7ddea63b19c7893965a786a diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index a3dbe852cde592..7f7001de94bf5d 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -18,6 +18,13 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + ov::Shape, // Input 0 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> 
AddConstParams; + class Add : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -32,6 +39,14 @@ class AddSinh : public Add { void SetUp() override; }; +class AddSinhConst : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp new file mode 100644 index 00000000000000..bd4d7641711a0a --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp @@ -0,0 +1,76 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // InputShapes + std::pair, std::vector>, // Input and Output data types for Converts + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> ConvertParams; + +using parameters = std::vector>; + +class Convert : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; + virtual parameters generate_params_random() const; + + ov::element::Type output_type = ov::element::f32; +}; + +class ConvertInput : public Convert { +protected: + void SetUp() override; + + parameters generate_params_random() const override; +}; + +class ConvertOutput : public ConvertInput { +protected: + void SetUp() override; +}; + +class ConvertStub : public ConvertInput { 
+protected: + void SetUp() override; +}; + +class ConvertPartialInputsAndResults : public ConvertInput { +protected: + void SetUp() override; +}; + +class ConvertManyOnInputs : public ConvertInput { +protected: + void SetUp() override; +}; + +class ConvertManyOnOutputs : public ConvertInput { +protected: + void SetUp() override; +}; + +class ConvertManyOnInputOutput : public ConvertInput { +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp b/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp new file mode 100644 index 00000000000000..59d3e17e5acb18 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/eltwise_two_results.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + ov::Shape, // Input 1 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> EltwiseTwoResultsParams; + +class EltwiseTwoResults : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp new file mode 100644 index 00000000000000..26640e58910512 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2022 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input Shape All shapes are replicated + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MaxNumParamsEltwiseParams; + +class MaxNumParamsEltwiseSinh : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp new file mode 100644 index 00000000000000..0a209de2fe9244 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input Shape All shapes + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TwoInputsAndOutputsParams; + +class TwoInputsAndOutputs : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index 896f03e78d05a1..1b8d1f8ecdfc8d 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ 
b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -10,38 +10,61 @@ namespace ov { namespace test { namespace snippets { - std::string Add::getTestCaseName(testing::TestParamInfo obj) { - ov::Shape inputShapes0, inputShapes1, newInputShapes; - std::string targetDevice; - size_t num_nodes, num_subgraphs; - std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param; - - std::ostringstream result; - result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; - result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; - result << "#N=" << num_nodes << "_"; - result << "#S=" << num_subgraphs << "_"; - result << "targetDevice=" << targetDevice; - return result.str(); - } - - void Add::SetUp() { - ov::Shape inputShape0, inputShape1; - std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); - - auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1}); - function = f.getOriginal(); - } - - void AddSinh::SetUp() { - ov::Shape inputShape0, inputShape1; - std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); - - auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1}); - function = f.getOriginal(); - } +std::string Add::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, newInputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << 
targetDevice; + return result.str(); +} + +void Add::SetUp() { + ov::Shape inputShape0, inputShape1; + std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); + + auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1}); + function = f.getOriginal(); +} + +void AddSinh::SetUp() { + ov::Shape inputShape0, inputShape1; + std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); + + auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1}); + function = f.getOriginal(); +} + +std::string AddSinhConst::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes, newInputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSinhConst::SetUp() { + ov::Shape inputShape; + std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::AddSinhConstFunction({inputShape}); + function = f.getOriginal(); +} TEST_P(Add, CompareWithRefImpl) { run(); @@ -53,6 +76,11 @@ TEST_P(AddSinh, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddSinhConst, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp new file mode 100644 index 00000000000000..b4c5c840cb6869 
--- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp @@ -0,0 +1,231 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/convert.hpp" +#include "subgraph_converts.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Convert::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShape; + std::pair, std::vector> types; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShape, types, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS="; + for (const auto& sh : inputShape) + result << CommonTestUtils::vec2str(sh) << "_"; + result << "IT=" << CommonTestUtils::vec2str(types.first) << "_"; + result << "OT=" << CommonTestUtils::vec2str(types.second) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Convert::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]); + function = f.getOriginal(); + output_type = types.second.front(); +} + +parameters Convert::generate_params_random() const { + int32_t startFrom, range, resolution = 5; + switch (output_type) { + case ov::element::f32: + case ov::element::i32: + case ov::element::bf16: + startFrom = -10; + range = 20; + break; + case ov::element::u8: + startFrom = -10; + range = 20; + break; + case ov::element::i8: + startFrom = 117; + range = 20; + break; + default: + startFrom = 0; + range = 10; + } + return {{ startFrom, range, resolution }}; +} + 
+void Convert::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& funcInputs = function->inputs(); + const auto params = generate_params_random(); + if (params.size() != funcInputs.size()) { + IE_THROW() << "Incorrect count of parameters for random generation and inputs of function!"; + } + + for (size_t i = 0; i < funcInputs.size(); ++i) { // size_t: same signedness as size(), no sign-compare + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor; + int32_t startFrom, range, resolution; + std::tie(startFrom, range, resolution) = params[i]; + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], + range, startFrom, resolution); + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } +} + +void ConvertInput::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]); + function = f.getOriginal(); +} + +parameters ConvertInput::generate_params_random() const { + parameters params; + const auto& funcInputs = function->inputs(); + for (size_t i = 0; i < funcInputs.size(); ++i) { // one {startFrom, range, resolution} entry per function input + int32_t startFrom, range, resolution = 1; + switch (funcInputs[i].get_element_type()) { + case ov::element::f32: + case ov::element::bf16: + startFrom = -10; + range = 20; + resolution = 7; + break; + case ov::element::i32: + case ov::element::i8: + startFrom = -10; + range = 20; + break; + case ov::element::u8: + startFrom = 10; + range = 20; + break; + default: + startFrom = 0; + range = 10; + } + params.push_back({ startFrom, range, resolution }); + } + return params; +} + +void ConvertOutput::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + 
init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]); + function = f.getOriginal(); + output_type = types.second.front(); +} + +void ConvertStub::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]); + function = f.getOriginal(); + output_type = types.second.front(); +} + +void ConvertPartialInputsAndResults::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second); + function = f.getOriginal(); +} + +void ConvertManyOnInputs::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first); + function = f.getOriginal(); +} + +void ConvertManyOnOutputs::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first); + function = f.getOriginal(); +} + +void ConvertManyOnInputOutput::SetUp() { + std::vector inputShape; + std::pair, std::vector> types; + std::tie(inputShape, 
types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + + auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second); + function = f.getOriginal(); +} + +TEST_P(Convert, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertInput, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertOutput, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertStub, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertPartialInputsAndResults, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertManyOnInputs, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertManyOnOutputs, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ConvertManyOnInputOutput, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp b/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp new file mode 100644 index 00000000000000..f35f0717155e42 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/eltwise_two_results.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/eltwise_two_results.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string EltwiseTwoResults::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << 
CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void EltwiseTwoResults::SetUp() { + ov::Shape inputShape0, inputShape1; + std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); + + auto f = ov::test::snippets::EltwiseTwoResultsFunction({inputShape0, inputShape1}); + function = f.getOriginal(); +} + +TEST_P(EltwiseTwoResults, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp new file mode 100644 index 00000000000000..1140937be63359 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/max_num_params_eltwise.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void MaxNumParamsEltwiseSinh::SetUp() { + ov::Shape inputShape; + std::tie(inputShape, 
ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector expandedShapes(10, inputShape); + std::vector input_shapes; + for (const auto& s : expandedShapes) { + input_shapes.emplace_back(InputShape {{}, {s, }}); + } + + init_input_shapes(input_shapes); + + auto f = ov::test::snippets::EltwiseMaxNumParamsSinhFunction(expandedShapes); + function = f.getOriginal(); +} + +TEST_P(MaxNumParamsEltwiseSinh, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index ad1c3e74255938..276218e6150c57 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -10,42 +10,42 @@ namespace ov { namespace test { namespace snippets { - std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo obj) { - ov::Shape inputShapes0, inputShapes1, inputShapes2; - std::string targetDevice; - size_t num_nodes, num_subgraphs; - std::tie(inputShapes0, inputShapes1, inputShapes2, - num_nodes, num_subgraphs, targetDevice) = obj.param; - - std::ostringstream result; - result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; - result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; - result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; - result << "#N=" << num_nodes << "_"; - result << "#S=" << num_subgraphs << "_"; - result << "targetDevice=" << targetDevice; - return result.str(); - } - - void ThreeInputsEltwise::SetUp() { - ov::Shape inputShape0, inputShape1, inputShape2; - std::tie(inputShape0, inputShape1, inputShape2, - ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); - - 
auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2}); - function = f.getOriginal(); - } - - void ThreeInputsEltwiseSinh::SetUp() { - ov::Shape inputShape0, inputShape1, inputShape2; - std::tie(inputShape0, inputShape1, inputShape2, - ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); - - auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2}); - function = f.getOriginal(); - } +std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, + num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void ThreeInputsEltwise::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + std::tie(inputShape0, inputShape1, inputShape2, + ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); + + auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); +} + +void ThreeInputsEltwiseSinh::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + std::tie(inputShape0, inputShape1, inputShape2, + ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, 
{inputShape2, }}}); + + auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); +} TEST_P(ThreeInputsEltwise, CompareWithRefImpl) { run(); diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp new file mode 100644 index 00000000000000..205587e1a30f97 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/two_inputs_and_outputs.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); i++) // size_t: `auto i = 0` deduced int and mixed signedness with size() + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TwoInputsAndOutputs::SetUp() { + std::vector inputShape; + std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShape)); + auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape); + function = f.getOriginal(); +} + +TEST_P(TwoInputsAndOutputs, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp 
b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp index b7a58321172fd6..4397a88c3157c3 100644 --- a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp +++ b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp @@ -3,11 +3,16 @@ // #include "shared_test_classes/base/snippets_test_utils.hpp" +#include "functional_test_utils/skip_tests_config.hpp" #include "exec_graph_info.hpp" namespace ov { namespace test { void SnippetsTestsCommon::validateNumSubgraphs() { + bool isCurrentTestDisabled = FuncTestUtils::SkipTestsConfig::currentTestIsDisabled(); + if (isCurrentTestDisabled) + GTEST_SKIP() << "Disabled test due to configuration" << std::endl; + const auto& compiled_model = compiledModel.get_runtime_model(); size_t num_subgraphs = 0; size_t num_nodes = 0; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp new file mode 100644 index 00000000000000..a7c6bd34e0f58e --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp @@ -0,0 +1,214 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) with Convert operations that will be used + * to test snippets-specific behavior. All the functions are direct descendants of SnippetsFunctionBase; + * besides inputShapes, their constructors also take the element type(s) (inType/outType or a list of types). + */ + +namespace ov { +namespace test { +namespace snippets { +/// The most trivial graph, just one Convert. +/// Tokenized simply by starting subgraph. 
+// in1 +// Convert +// Result +class ConvertFunction : public SnippetsFunctionBase { +public: + explicit ConvertFunction(const std::vector& inputShapes, + const ov::element::Type inType = ov::element::f32, + const ov::element::Type outType = ov::element::u8) + : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + ov::element::Type inType; + ov::element::Type outType; +}; + + +/// One of the inputs of Add is a Convert. +/// Tokenized simply by starting subgraph. +// in1 +// Convert in2 +// Add +// Result +class ConvertInputFunction : public SnippetsFunctionBase { +public: + explicit ConvertInputFunction(const std::vector& inputShapes, + const ov::element::Type inType = ov::element::f32, + const ov::element::Type outType = ov::element::u8) + : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + ov::element::Type inType; + ov::element::Type outType; +}; + +/// The output of Sub is fed to a Convert. +/// Tokenized simply by starting subgraph. 
+// in1 in2 +// Sub +// Convert +// Result +class ConvertOutputFunction : public SnippetsFunctionBase { +public: + explicit ConvertOutputFunction(const std::vector& inputShapes, + const ov::element::Type inType = ov::element::f32, + const ov::element::Type outType = ov::element::i8) + : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + ov::element::Type inType; + ov::element::Type outType; +}; + + +/// There are 2 subgraphs: Add + Convert(Stub) and Relu +/// Tokenized simply by starting subgraph. +// in1 in2 in1 in2 +// Add Subgraph +// Convert -> | +// Relu Subgraph +// Result Result +class ConvertStubFunction : public SnippetsFunctionBase { +public: + explicit ConvertStubFunction(const std::vector& inputShapes, + const ov::element::Type inType = ov::element::f32, + const ov::element::Type outType = ov::element::i8) + : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + ov::element::Type inType; + ov::element::Type outType; +}; + + +/// Not all Inputs and Results have Convert +/// Tokenized simply by starting subgraph. 
+// in1 in2 +// Convert Convert +// Add +// Relu in3 +// Convert Sub +// Result1 Unsqueeze <- added so that the subgraph does not end up with several Result outputs (a limitation of collapsing) +// Result2 +class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { +public: + explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, + const std::vector& inTypes = {ov::element::f32}, + const std::vector& outTypes = {ov::element::f32}) + : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + std::vector inTypes; + std::vector outTypes; +}; + +/// A sequence of Converts on the input. +/// Tokenized simply by starting subgraph. +// in in +// Stub Stub +// Convert | +// Convert -> Subgraph +// Convert | +// Relu Result +// Result +class ConvertManyOnInputsFunction : public SnippetsFunctionBase { +public: + explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) + : SnippetsFunctionBase(inputShapes), types(types) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + std::vector types; +}; + +/// A sequence of Converts on the output. +/// Tokenized simply by starting subgraph. 
+// in in +// Stub Stub +// Relu | +// Convert -> Subgraph +// Convert | +// Convert | +// Result Result +class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { +public: + explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) + : SnippetsFunctionBase(inputShapes), types(types) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + std::vector types; +}; + +/// Convert Sequence on input and output +/// Tokenized simply by starting subgraph. +// in in +// Stub Stub +// Convert | +// Convert | +// Convert | +// Relu -> Subgraph +// Convert | +// Convert | +// Convert | +// Result Result +class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase { +public: + explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, + const std::vector& inTypes, + const std::vector& outTypes) + : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(inTypes.size() > 1, "Got invalid number of input element types"); + NGRAPH_CHECK(outTypes.size() > 0, "Got invalid number of output element types"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + std::vector inTypes; + std::vector outTypes; +}; + + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index f35e0e1ecd4b33..fad086acf031e1 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -7,6 +7,7 @@ #include "ngraph/ngraph.hpp" #include "snippets_helpers.hpp" #include "subgraph_simple.hpp" +#include "subgraph_converts.hpp" /* This file provides lowered representations (after the generate() was calles) for some simple functions. * This is required to test snippets lowering and optimization passes. All the functions are expected to be direct @@ -45,7 +46,7 @@ class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { protected: std::shared_ptr initLowered() const override; private: - std::vector broadcast_shapes;; + std::vector broadcast_shapes; }; } // namespace snippets diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index f67a86966a4bd8..3623db2873f416 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -29,13 +29,14 @@ class AddFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; }; -/// Add separated from inputs by Sin to WA CPU-specific disabling after inputs. +/// Add separated from inputs by Sinh to WA CPU-specific disabling after inputs. /// Works because Sinh is not supported by tokenization yet. /// Tokenized simply by starting subgraph. 
// in1 in2 -// Sin Sinh +// Sinh Sinh // Add // Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhFunction : public SnippetsFunctionBase { public: explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { @@ -45,6 +46,21 @@ class AddSinhFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; }; +/// Like AddSinh but with a constant second input (and no Sinh on the constant) +// in1 const +// Sinh | +// Add +// Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed +class AddSinhConstFunction : public SnippetsFunctionBase { +public: + explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +// std::shared_ptr initReference() const override; +}; /// Simple Eltwise graph fully convertible to Subgraph. /// Tokenized simply by attaching eltwises. // in1 in2 @@ -77,6 +93,7 @@ class EltwiseThreeInputsFunction : public SnippetsFunctionBase { }; /// EltwiseFunctionThreeInputs with Sinh after inputs to to WA CPU-specific disabling after inputs /// See AddSinh for details. +// todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { public: explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : @@ -86,6 +103,24 @@ class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { protected: std::shared_ptr initOriginal() const override; }; +/// Eltwise graph with 10 inputs and 2 outputs. +/// Needed to test for a max number of inputs+outputs allowed. +// in1 in2 in3 ... in10 +// Sinh Sinh Sinh ...Sinh +// ........................
+// Subtract Power +// \ Sinh +// Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed +class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { +public: + explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : + SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; /// MatMul with two eltwise branches joined with Add just before the Result. /// Tokenized by attaching eltwises to separate subgraphs, and then joining them together. // in1 in2 @@ -125,7 +160,41 @@ class EltwiseLogLoopFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; }; - +/// 2 results. +/// So we have 2 subgraphs - Snippets don't support subgraphs with many results +/// Also Output tensors have names to check correct copying output names +// in1 in2 +// Sinh Sinh +// Add +// HSwish Result +// Relu +// Result +class EltwiseTwoResultsFunction : public SnippetsFunctionBase { +public: + explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; +}; +/// Two different Input and Outputs. 
+/// This function is to check correct Broadcasting +// in1 in2 +// Sin Sin +// HSwish / +// Result Add +// Relu +// Sin +// Result +class TwoInputsAndOutputsFunction : public SnippetsFunctionBase { +public: + explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp new file mode 100644 index 00000000000000..5c743cf2006bb3 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_convert.cpp @@ -0,0 +1,241 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_converts.hpp" +#include "common_test_utils/data_utils.hpp" +#include +#include + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr createRollAsStub(const std::shared_ptr& parent) { + auto shift = std::make_shared(ov::element::i32, Shape{1}, std::vector{1}); + auto axes = std::make_shared(ov::element::i32, Shape{1}, std::vector{0}); + return std::make_shared(parent->output(0), shift, axes); +} + +std::shared_ptr ConvertFunction::initOriginal() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto stub = createRollAsStub(data0); + auto convert = std::make_shared(stub, outType); + return std::make_shared(NodeVector{convert}, ParameterVector{data0}); +} +std::shared_ptr ConvertFunction::initReference() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto stub = createRollAsStub(data0); + auto indata0 = std::make_shared(inType, stub->get_shape()); + auto subgraph = std::make_shared(NodeVector{stub}, + 
std::make_shared(NodeVector{std::make_shared(indata0, outType)}, + ParameterVector{indata0})); + return std::make_shared(NodeVector{subgraph}, ParameterVector{data0}); +} + +std::shared_ptr ConvertInputFunction::initOriginal() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(outType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto convert = std::make_shared(stub0, outType); + auto add = std::make_shared(convert, stub1); + return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); +} +std::shared_ptr ConvertInputFunction::initReference() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(outType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto indata0 = std::make_shared(inType, stub0->get_shape()); + auto indata1 = std::make_shared(outType, stub1->get_shape()); + auto convert = std::make_shared(indata0, outType); + auto subgraph = std::make_shared(NodeVector{stub0, stub1}, + std::make_shared( + NodeVector{std::make_shared(convert, indata1)}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{subgraph}, ParameterVector{data0, data1}); +} + +std::shared_ptr ConvertOutputFunction::initOriginal() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(inType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto add = std::make_shared(stub0, stub1); + auto convert = std::make_shared(add, outType); + return std::make_shared(NodeVector{convert}, ParameterVector{data0, data1}); +} +std::shared_ptr ConvertOutputFunction::initReference() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(inType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + 
auto indata0 = std::make_shared(inType, stub0->get_shape()); + auto indata1 = std::make_shared(inType, stub1->get_shape()); + auto add = std::make_shared(indata0, indata1); + auto convert = std::make_shared(add, outType); + auto subgraph = std::make_shared(NodeVector{stub0, stub1}, + std::make_shared( + NodeVector{convert}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{subgraph}, ParameterVector{data0, data1}); +} + +std::shared_ptr ConvertStubFunction::initOriginal() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(inType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto add = std::make_shared(stub0, stub1); + auto convert = std::make_shared(add, outType); + auto relu = std::make_shared(convert); + return std::make_shared(NodeVector{relu}, ParameterVector{data0, data1}); +} +std::shared_ptr ConvertStubFunction::initReference() const { + auto data0 = std::make_shared(inType, input_shapes[0]); + auto data1 = std::make_shared(inType, input_shapes[1]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto indata0 = std::make_shared(inType, stub0->get_shape()); + auto indata1 = std::make_shared(inType, stub1->get_shape()); + auto add = std::make_shared(indata0, indata1); + auto convert = std::make_shared(add, outType); + auto subgraph0 = std::make_shared( + NodeVector{stub0, stub1}, std::make_shared(NodeVector{convert}, ParameterVector{indata0, indata1})); + auto indata2 = std::make_shared(convert->get_destination_type(), convert->get_shape()); + auto relu = std::make_shared(indata2); + auto subgraph1 = std::make_shared( + NodeVector{subgraph0}, std::make_shared(NodeVector{relu}, ParameterVector{indata2})); + return std::make_shared(NodeVector{subgraph1}, ParameterVector{data0, data1}); +} + +std::shared_ptr ConvertPartialInputsAndResultsFunction::initOriginal() const { + auto data0 = 
std::make_shared(inTypes[0], input_shapes[0]); + auto data1 = std::make_shared(inTypes[1], input_shapes[1]); + auto data2 = std::make_shared(inTypes[2], input_shapes[2]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto stub2 = createRollAsStub(data2); + auto convert0 = std::make_shared(stub0, outTypes[0]); + auto convert1 = std::make_shared(stub1, outTypes[0]); + auto add = std::make_shared(convert0, convert1); + auto relu = std::make_shared(add); + auto sub = std::make_shared(relu, stub2); + auto stub3 = createRollAsStub(sub); + auto convert2 = std::make_shared(relu, outTypes[1]); + return std::make_shared(NodeVector{convert2, stub3}, ParameterVector{data0, data1, data2}); +} +std::shared_ptr ConvertPartialInputsAndResultsFunction::initReference() const { + auto data0 = std::make_shared(inTypes[0], input_shapes[0]); + auto data1 = std::make_shared(inTypes[1], input_shapes[1]); + auto data2 = std::make_shared(inTypes[2], input_shapes[2]); + auto stub0 = createRollAsStub(data0); + auto stub1 = createRollAsStub(data1); + auto stub2 = createRollAsStub(data2); + auto indata0 = std::make_shared(inTypes[0], stub0->get_shape()); + auto indata1 = std::make_shared(inTypes[1], stub1->get_shape()); + auto indata2 = std::make_shared(inTypes[2], stub2->get_shape()); + auto convert0 = std::make_shared(indata0, outTypes[0]); + auto convert1 = std::make_shared(indata1, outTypes[0]); + auto add = std::make_shared(convert0, convert1); + auto relu = std::make_shared(add); + auto sub = std::make_shared(relu, indata2); + auto convert2 = std::make_shared(relu, outTypes[1]); + auto subgraph = std::make_shared( + NodeVector{stub0, stub1, stub2}, std::make_shared(NodeVector{sub, convert2}, ParameterVector{indata0, indata1, indata2})); + auto stub3 = createRollAsStub(subgraph); + return std::make_shared(OutputVector{subgraph->output(1), stub3->output(0)}, + ParameterVector{data0, data1, data2}); +} + +std::shared_ptr 
ConvertManyOnInputsFunction::initOriginal() const { + auto data0 = std::make_shared(types[0], input_shapes[0]); + auto stub0 = createRollAsStub(data0); + std::shared_ptr out = stub0; + for (size_t i = 1; i < types.size(); i++) { // types[0] is the Parameter precision; Converts start at types[1] + auto convert = std::make_shared(out, types[i]); + out = convert; + } + auto relu = std::make_shared(out); + return std::make_shared(NodeVector{relu}, ParameterVector{data0}); +} +std::shared_ptr ConvertManyOnInputsFunction::initReference() const { + auto data0 = std::make_shared(types[0], input_shapes[0]); + auto stub0 = createRollAsStub(data0); + auto indata0 = std::make_shared(types[0], stub0->get_shape()); // body parameter fed by the stub output + std::shared_ptr out = indata0; + for (size_t i = 1; i < types.size(); i++) { // same Convert chain as initOriginal, built inside the subgraph body + auto convert = std::make_shared(out, types[i]); + out = convert; + } + auto relu = std::make_shared(out); + auto subgraph = std::make_shared(NodeVector{stub0}, + std::make_shared(NodeVector{relu}, ParameterVector{indata0})); + return std::make_shared(NodeVector{subgraph}, ParameterVector{data0}); +} + +std::shared_ptr ConvertManyOnOutputsFunction::initOriginal() const { + auto data0 = std::make_shared(types[0], input_shapes[0]); + auto stub0 = std::make_shared(data0); + auto relu = std::make_shared(stub0); + std::shared_ptr out = relu; + for (size_t i = 1; i < types.size(); i++) { // types[0] is the Parameter precision; Converts start at types[1] + auto convert = std::make_shared(out, types[i]); + out = convert; + } + return std::make_shared(NodeVector{out}, ParameterVector{data0}); +} +std::shared_ptr ConvertManyOnOutputsFunction::initReference() const { + auto data0 = std::make_shared(types[0], input_shapes[0]); + auto stub0 = std::make_shared(data0); + auto indata0 = std::make_shared(types[0], stub0->get_shape()); // body parameter fed by the stub output + auto relu = std::make_shared(indata0); + std::shared_ptr out = relu; + for (size_t i = 1; i < types.size(); i++) { // same Convert chain as initOriginal, built inside the subgraph body + auto convert = std::make_shared(out, types[i]); + out = convert; + } + auto subgraph = std::make_shared(NodeVector{stub0}, + std::make_shared(NodeVector{out}, ParameterVector{indata0})); + return
std::make_shared(NodeVector{subgraph}, ParameterVector{data0}); +} + +std::shared_ptr ConvertManyOnInputOutputFunction::initOriginal() const { + auto data0 = std::make_shared(inTypes[0], input_shapes[0]); + auto stub0 = std::make_shared(data0); + std::shared_ptr out = stub0; + for (size_t i = 1; i < inTypes.size(); i++) { // inTypes[0] is the Parameter precision; input Converts start at inTypes[1] + auto convert = std::make_shared(out, inTypes[i]); + out = convert; + } + auto relu = std::make_shared(out); // Relu consumes the end of the input Convert chain (was stub0, which left the Converts unused) + out = relu; + for (size_t i = 0; i < outTypes.size(); i++) { // every outTypes entry adds one Convert after Relu + auto convert = std::make_shared(out, outTypes[i]); + out = convert; + } + return std::make_shared(NodeVector{out}, ParameterVector{data0}); +} +std::shared_ptr ConvertManyOnInputOutputFunction::initReference() const { + auto data0 = std::make_shared(inTypes[0], input_shapes[0]); + auto stub0 = std::make_shared(data0); + auto indata0 = std::make_shared(inTypes[0], stub0->get_shape()); // body parameter fed by the stub output + std::shared_ptr out = indata0; + for (size_t i = 1; i < inTypes.size(); i++) { // same input Convert chain as initOriginal, built inside the subgraph body + auto convert = std::make_shared(out, inTypes[i]); + out = convert; + } + auto relu = std::make_shared(out); // stay inside the body: Relu takes the chain rooted at indata0, not the outer stub0 + out = relu; + for (size_t i = 0; i < outTypes.size(); i++) { // every outTypes entry adds one Convert after Relu + auto convert = std::make_shared(out, outTypes[i]); + out = convert; + } + auto subgraph = std::make_shared(NodeVector{stub0}, + std::make_shared(NodeVector{out}, ParameterVector{indata0})); + return std::make_shared(NodeVector{subgraph}, ParameterVector{data0}); +} +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index 81c267f3745828..6117ffb6c76c68 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -46,6 +46,14 @@ std::shared_ptr AddSinhFunction::initReference() const { ParameterVector{indata0, indata1})); return std::make_shared(NodeVector{add},
ParameterVector{data0, data1}); } +std::shared_ptr AddSinhConstFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + auto sin0 = std::make_shared(data0); + auto add = std::make_shared(sin0, const_data1); + return std::make_shared(NodeVector{add}, ParameterVector{data0}); +} std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); @@ -98,6 +106,28 @@ std::shared_ptr EltwiseThreeInputsSinhFunction::initOriginal() const auto mul = std::make_shared(add, sub); return std::make_shared(NodeVector{mul}, ParameterVector{data0, data1, data2}); } +std::shared_ptr EltwiseMaxNumParamsSinhFunction::initOriginal() const { + ParameterVector params; + std::vector> sinh; // 10 + for (const auto& shape : input_shapes) { + auto param = std::make_shared(precision, shape); + params.push_back(param); + sinh.push_back(std::make_shared(param)); + } + std::vector> add; // 5 + for (size_t i = 0; i < input_shapes.size() / 2; i++) { + add.push_back(std::make_shared(sinh[i * 2], sinh[i * 2 + 1])); + } + std::vector> mul; // 2 + for (size_t i = 0; i < add.size() / 2; i++) { + auto mul_node = std::make_shared(add[i * 2], add[i * 2 + 1]); + mul.push_back(mul_node); + } + auto sub = std::make_shared(mul[0], mul[1]); + auto power = std::make_shared(add.back(), sub); + auto exit_sinh = std::make_shared(power); + return std::make_shared(NodeVector{sub, exit_sinh}, params); +} std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { auto data_1 = std::make_shared(precision, input_shapes[0]); @@ -187,6 +217,69 @@ std::shared_ptr EltwiseLogLoopFunction::initReference() const { return std::make_shared(NodeVector{mul}, ParameterVector{data0, 
data1}); } +std::shared_ptr EltwiseTwoResultsFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto sinh0 = std::make_shared(data0); + auto sinh1 = std::make_shared(data1); + auto add = std::make_shared(sinh0, sinh1); + auto hswish = std::make_shared(add); + auto relu = std::make_shared(hswish); + + NGRAPH_SUPPRESS_DEPRECATED_START + auto& out_tensor0 = add->get_output_tensor(0); + out_tensor0.set_name("add_out"); + out_tensor0.set_names({"add_out", "y0"}); + + auto& out_tensor1 = relu->get_output_tensor(0); + out_tensor1.set_name("relu_out"); + out_tensor1.set_names({"relu_out", "y1"}); + NGRAPH_SUPPRESS_DEPRECATED_END + + return std::make_shared(NodeVector{add, relu}, ParameterVector{data0, data1}); +} +std::shared_ptr EltwiseTwoResultsFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto sinh0 = std::make_shared(data0); + auto sinh1 = std::make_shared(data1); + auto indata0 = std::make_shared(precision, sinh0->get_shape()); + auto indata1 = std::make_shared(precision, sinh1->get_shape()); + auto add = std::make_shared(indata0, indata1); + auto hswish = std::make_shared(add); + auto subgraph0 = std::make_shared(NodeVector{sinh0, sinh1}, + std::make_shared(NodeVector{add, hswish}, + ParameterVector{indata0, indata1})); + auto indata2 = std::make_shared(precision, subgraph0->get_output_shape(1)); + auto relu = std::make_shared(indata2); + auto subgraph1 = std::make_shared(OutputVector{subgraph0->output(1)}, + std::make_shared(NodeVector{relu}, + ParameterVector{indata2})); + NGRAPH_SUPPRESS_DEPRECATED_START + auto& out_tensor0 = subgraph0->get_output_tensor(0); + out_tensor0.set_name("add_out"); + out_tensor0.set_names({"add_out", "y0"}); + + auto& out_tensor1 = subgraph1->get_output_tensor(0); + out_tensor1.set_name("relu_out"); + 
out_tensor1.set_names({"relu_out", "y1"}); + NGRAPH_SUPPRESS_DEPRECATED_END + return std::make_shared(OutputVector{subgraph0->output(0), subgraph1->output(0)}, ParameterVector{data0, data1}); +} + +std::shared_ptr TwoInputsAndOutputsFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto sin0 = std::make_shared(data0); + auto sin1 = std::make_shared(data1); + auto hswish = std::make_shared(sin0); + auto add = std::make_shared(hswish, sin1); + auto relu = std::make_shared(add); + auto sin3 = std::make_shared(relu); + + return std::make_shared(NodeVector{hswish, sin3}, ParameterVector{data0, data1}); +} + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file