diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 7540c950e32253..2ea92983819f1e 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -120,10 +120,16 @@ class Generator { public: // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission. bool m_save_lowered_code = false; + // True if one evaluation optimizations are enabled + bool m_one_evaluation_optimizations = true; + // True if we should check runtime info for nodes to call specific needed transformations + bool m_need_fill_tail_register = false; }; /** * @brief virtual method any specific implementation should implement * @param m model in canonical for for table-based code generation + * @param config config with transformation and optimization parameters + * @param compile_params parameters for generated code * @return pointer to generated code */ code generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params = nullptr); diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp index 83471c04d0553a..2746d974a06400 100644 --- a/src/common/snippets/include/snippets/op/brgemm.hpp +++ b/src/common/snippets/include/snippets/op/brgemm.hpp @@ -19,13 +19,27 @@ namespace op { class Brgemm : public ngraph::op::v0::MatMul { public: OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul); - Brgemm(const Output& A, const Output& B); + Brgemm(const Output& A, const Output& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu); Brgemm() = default; + bool visit_attributes(AttributeVisitor& visitor) override; void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; bool has_evaluate() const override { return false; } + + size_t 
get_offset_a() const { return m_offset_a; } + size_t get_offset_b() const { return m_offset_b; } + size_t get_offset_c() const { return m_offset_c; } + + void set_offset_a(const size_t offset) { m_offset_a = offset; } + void set_offset_b(const size_t offset) { m_offset_b = offset; } + void set_offset_c(const size_t offset) { m_offset_c = offset; } + +private: + size_t m_offset_a = 0lu; // offset for first input + size_t m_offset_b = 0lu; // offset for second input + size_t m_offset_c = 0lu; // offset for output }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000000..658b23b47919e3 --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief The operation is for intermediate data storage + * - m_offset - offset from common Buffer allocated memory. + * Default value is 0. + * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - m_allocation_rank : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. + * Default value is -1 (full shape) + * Notes: + * - All buffers in a graph have the same memory pointer. 
So if we have a few buffers, + * each buffer should have its own offset for common memory + * - Buffer should be a single consumer for operation output port + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + BWDCMP_RTTI_DECLARATION; + + Buffer(const Output& x, const int32_t allocation_rank = -1); + Buffer() = default; + + size_t get_offset() const { return m_offset; } + void set_offset(const size_t offset); + + int32_t get_allocation_rank() const { return m_allocation_rank; } + void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; } + + size_t get_byte_size() const; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + size_t m_offset = 0lu; + int32_t m_allocation_rank = -1; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000000..91aceb49d1e360 --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief Generated in Tail Loop vector representation in code generation step for cases when we should + * refill registers by special numbers. 
+ * For example, for cases with ReduceMax or ReduceSum in Softmax + * Where: + * - offset - is value shift for filling + * - fill_value - hexadecimal filling value + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const size_t offset, const uint32_t fill_value = 0x0); + Fill() = default; + + size_t get_offset() const { return m_offset; } + uint32_t get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +protected: + size_t m_offset = 0lu; + uint32_t m_fill_value = 0x0; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000000..d26c4a8c9e58c6 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief The operation calculates a horizon maximum of a vector register + * @ingroup snippets + */ +class HorizonMax : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} 
// namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000000..2dc25374bc0f70 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief The operation calculates a horizon sum of a vector register + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index fc500e038f6375..157f25f74c7e34 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -13,16 +13,16 @@ namespace op { /** * @interface Load - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading - * where number of elements to load is determined by "count" - * Default value is "1" - to load one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading + * where number of elements to load is determined by "count" (Default value is "1" - to load one element) + * and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element) * @ingroup snippets */ 
class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; @@ -38,7 +38,7 @@ class Load : public MemoryAccess { class LoadReshape : public Load { public: OPENVINO_OP("LoadReshape", "SnippetsOpset"); - LoadReshape(const Output& x, size_t count = 1lu, std::vector order = {}); + LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp index 5971c5cc5ce744..f1b2d8ebb2f00d 100644 --- a/src/common/snippets/include/snippets/op/memory_access.hpp +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -24,14 +24,17 @@ class MemoryAccess : public ngraph::op::Op { OPENVINO_OP("MemoryAccess", "SnippetsOpset"); size_t get_count() const; - void set_count(size_t count); + size_t get_offset() const; + void set_count(const size_t count); + void set_offset(const size_t offset); bool visit_attributes(AttributeVisitor& visitor) override; void validate_and_infer_types() override; protected: - explicit MemoryAccess(const Output& x, size_t count = 1lu); + explicit MemoryAccess(const Output& x, size_t count = 1lu, size_t offset = 0lu); MemoryAccess() = default; size_t m_count = 0lu; + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index b83a4fdcec2b18..48c7466b924cff 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ 
b/src/common/snippets/include/snippets/op/store.hpp @@ -13,16 +13,16 @@ namespace op { /** * @interface Store - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing - * where number of elements to store is determined by "count" - * Default value is "1" - to store one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing + * where number of elements to store is determined by "count" (Default value is "1" - to store one element) + * and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr) * @ingroup snippets */ class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 31975978695c5f..72af822bf1c9ad 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -89,21 +89,13 @@ class Subgraph : public ngraph::op::Op { return m_generator; } - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return config.m_has_type_relaxed_ops; - } - - bool has_domain_sensitive_ops() const { - return config.m_has_domain_sensitive_ops; - } + // Return common memory size for all buffers in body. 
Should be called only after tileRank setting + size_t get_buffer_scratchpad_size() const; + size_t get_virtual_port_count() const { return m_virtual_port_count; } + bool is_buffer_needed() const { return m_buffer_needed; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } + bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); @@ -117,8 +109,9 @@ class Subgraph : public ngraph::op::Op { // plugin sets generator for a snippet to some specific generator. // it's going to be replaced with Jitters table later void set_generator(std::shared_ptr generator); - void set_non_scalar_constants_count(const size_t count); void set_tile_rank(size_t newRank) {tileRank = newRank;} + void set_virtual_port_count(const size_t count); + void buffer_needed(const bool need); void print() const; void print_statistics(bool verbose); @@ -133,11 +126,14 @@ class Subgraph : public ngraph::op::Op { void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); void init_config(); - // Count of potentional non-scalar Consants that will be created after some tranformations - // At the moment it's relevant only for FakeQuantize decomposition - // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), + // Count of Subgraph virtual ports: + // - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition) + // Need Buffer op or not + // - Buffers. All Buffers are considered as one common additional virtual port. 
So we cannot summarize them as potential non-scalar Constants + // NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()), // we should MANUALLY calculate it where it needed. - size_t m_non_scalar_constants_count = 0; + size_t m_virtual_port_count = 0; + bool m_buffer_needed = false; Shape exec_domain = {}; std::shared_ptr m_body = nullptr; std::shared_ptr m_generator = nullptr; @@ -162,11 +158,12 @@ class Subgraph : public ngraph::op::Op { // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; - // True if we should check runtime info for nodes to call specific needed transformations - bool m_need_fill_tail_register = false; // True if body has operations that don't support plugin-side domain optimizations // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) bool m_has_domain_sensitive_ops = false; + // True if we should go through whole body to check for where loops should be explicitly inserted. 
+ // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops + bool m_explicit_loop_insertion = false; } config; }; diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000000..9d93e4c01577bf --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface VectorBuffer + * @brief The operation is for intermediate data storage in vector register + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(const ov::element::Type element_type = ov::element::f32); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + ov::element::Type m_element_type; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_buffer.hpp b/src/common/snippets/include/snippets/pass/insert_buffer.hpp new file mode 100644 index 00000000000000..9dfa3f8be894ac --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_buffer.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertBuffer + * @brief The pass inserts Buffers on Inputs and Outputs of special operations [Softmax, Transpose] if it's needed + * @ingroup snippets + */ +class InsertBuffer: public 
ngraph::pass::MatcherPass { +public: + InsertBuffer(const int32_t allocation_rank = -1); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp index dc1bf6b3e68717..6935c7495cc38f 100644 --- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp +++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InsertLoad - * @brief Inserts explicit load instruction after each parameter. + * @brief Inserts explicit load instruction after each parameter and buffer. * The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ @@ -24,7 +24,7 @@ class InsertLoad: public ngraph::pass::MatcherPass { /** * @interface InsertStore - * @brief Inserts explicit store instruction before each result. + * @brief Inserts explicit store instruction before each result and buffer. 
* The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp index 5b4fe1e3c24fde..8d766df2811028 100644 --- a/src/common/snippets/include/snippets/pass/insert_loops.hpp +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -22,13 +22,17 @@ namespace pass { class InsertLoops: public ngraph::pass::FunctionPass { public: OPENVINO_RTTI("InsertLoops", "0"); - InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size); + InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized = true); bool run_on_model(const std::shared_ptr& m) override; + static std::vector calculate_inner_apply_increments(const ov::PartialShape& master, const std::vector& shapes); + static std::vector calculate_outer_apply_increments(const std::vector& shapes); + static std::vector calculate_finalization_offsets(const ov::PartialShape& master, const std::vector& shapes); private: ov::PartialShape m_master_shape; size_t m_loop_depth; size_t m_vector_size; + bool m_is_optimized; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/reset_buffer.hpp b/src/common/snippets/include/snippets/pass/reset_buffer.hpp new file mode 100644 index 00000000000000..599b533e3ebf1e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/reset_buffer.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ResetBufferState + * @brief If there is Buffer between loops we should reset Buffer pointer after first loop execution (data storing) using finalization offsets + * to have correct buffer data pointer for data loading in the next loop where data was stored in previous 
loop + * @ingroup snippets + */ +class ResetBufferState: public ngraph::pass::MatcherPass { +public: + ResetBufferState(); + + static int64_t calculate_required_finalization_offsets(const size_t inner_master_work_amount, const size_t inner_target_work_amount); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp b/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp new file mode 100644 index 00000000000000..072e978eb5c4a3 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/set_buffer_offset.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SetBufferOffset + * @brief All buffers in body have one common memory pointer. To correct work with them each buffer has own offset for common memory ptr + * The pass consistently set offset in buffers. 
+ * NOTE: Should be called after Load/Store insertion and before LoadMoveBroadcastToBroadcastLoad because + * we cannot fuse Load with non-zero offset and MoveBroadcast + * @ingroup snippets + */ +class SetBufferOffset: public ngraph::pass::MatcherPass { +public: + SetBufferOffset(); + +private: + size_t current_offset = 0lu; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..b640ab35b0bbbc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief The pass decomposes Softmax into explicit Snippets dialects + * Note: + * - At the moment Snippets supports Softmax only in MHA pattern where there are Buffer ops before and after Softmax. + * Also Snippets support Loops with Buffer ops on inputs and outputs if Buffer have the same buffer byte size + * because of work with ptr increment. 
So we have to set Tile rank as buffer allocation rank even if rank 1 is enough + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank = -1); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp new file mode 100644 index 00000000000000..7522f411669dc3 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxReshapeElimination + * @brief The pass removes Reshape operations around Softmax if possible + * @ingroup snippets + */ +class SoftmaxReshapeElimination: public ngraph::pass::MatcherPass { +public: + SoftmaxReshapeElimination(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index 20ce6444682b82..0f091abe956381 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,8 +9,12 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" @@ -19,6 +23,7 @@ #include "op/store.hpp" #include "op/loop.hpp" #include "op/brgemm.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff 
--git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 1d08a786922bfb..770722501eb674 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -29,6 +29,9 @@ ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector get_node_output_layout(const std::shared_ptr& node); std::vector get_node_output_layout(const Node* node); +inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } +inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } + } // namespace utils } // namespace snippets } // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 3d0060b3805925..5bd97e295da3eb 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -40,6 +40,53 @@ auto getRegisters(const std::shared_ptr &n) -> RegInfo { return std::make_pair(rin, rout); } +auto tail_transformations(NodeVector& tail, const size_t tail_size, const ngraph::snippets::Generator::GeneratorConfig& config) -> void { + NodeVector updated_tile; + auto insertFill = [tail_size](const ov::Input& input) -> std::shared_ptr { + auto copyRegInfo = [](const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo = rt.find("reginfo"); + if (reginfo != rt.end()) { + to.get_rt_info()["reginfo"] = reginfo->second; + } + }; + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const auto fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), tail_size, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign 
register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : tail) { + // We should fill vector regs by float_min and zero to have + // correct math calculations for ReduceMax and ReduceSum in scalar case. + // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, + // so they are missed in the `tail` node list + if (config.m_need_fill_tail_register && + (ov::is_type(op) || + ov::is_type(op))) { + for (size_t i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { + if (memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + } + updated_tile.push_back(op); + } + + tail = std::move(updated_tile); +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params) { @@ -107,8 +154,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrset_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); - // force ptr increments if there is tail - optimize_single_evaluation(vector_loop_end, need_tail); + + if (config.m_one_evaluation_optimizations) { + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + } + lower_ops(vector_loop); } OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") @@ -118,14 +169,7 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& n){ - const auto& memory_access = std::dynamic_pointer_cast(n); - if (memory_access && memory_access->get_count() != 1) { - memory_access->set_count(tail_size); - } - return n; - }); + tail_transformations(tail_loop, tail_size, config); tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); tail_loop_end->set_finalization_offsets(tail_finalization_offsets); tail_loop_end->set_increment(tail_size); 
@@ -133,8 
+177,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrupdate_ptr_increments(static_cast(tail_size)); tail_loop_end->set_work_amount(tail_size); tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; - // tail loop is always executed once - optimize_single_evaluation(tail_loop_end); + + if (config.m_one_evaluation_optimizations) { + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + } + lower_ops(tail_loop); } } else { @@ -173,4 +221,4 @@ std::shared_ptr Generator::get_target_machine() const { } }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index e48b599b96a22b..7bf999cb15e423 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -13,12 +13,21 @@ namespace ngraph { namespace snippets { namespace op { -Brgemm::Brgemm(const Output& A, const Output& B) : MatMul() { +Brgemm::Brgemm(const Output& A, const Output& B, const size_t offset_a, const size_t offset_b, const size_t offset_c) + : MatMul(), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) { set_arguments({A, B}); set_output_size(1); constructor_validate_and_infer_types(); } +bool Brgemm::visit_attributes(AttributeVisitor& visitor) { + MatMul::visit_attributes(visitor); + visitor.on_attribute("offset_a", m_offset_a); + visitor.on_attribute("offset_b", m_offset_b); + visitor.on_attribute("offset_c", m_offset_c); + return true; +} + void Brgemm::validate_and_infer_types() { INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); element::Type result_et; @@ -47,7 +56,7 @@ void Brgemm::validate_and_infer_types() { std::shared_ptr Brgemm::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1));; + return 
std::make_shared(new_args.at(0), new_args.at(1), m_offset_a, m_offset_b, m_offset_c); } } // namespace op diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000000..3af8e2858aa202 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include + +using namespace std; +using namespace ngraph; + +BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::Buffer); + +auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { + return allocation_rank < 0 ? allocation_rank + shape_rank : allocation_rank; +} + +snippets::op::Buffer::Buffer(const Output& x, const int32_t allocation_rank) : Op({x}), m_allocation_rank(allocation_rank) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Buffer_visit_attributes); + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("allocation_rank", m_allocation_rank); + return true; +} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_rank); + new_buffer->set_offset(m_offset); + return new_buffer; +} + +void snippets::op::Buffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); + const auto shape_rank = get_input_partial_shape(0).rank(); + if (shape_rank.is_static()) { + const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length()); + NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(), + "Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank)); 
+ } + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +void snippets::op::Buffer::set_offset(const size_t offset) { + m_offset = offset; + + // If Buffer has offset We set this offset in the next Load and Store ops + // to correctly read and write data because all buffers have the one register + // Also if user sets offset to a Buffer It means that the Buffer has the corresponding Load and Store ops + + // Propagate to up: in Store. Buffer can have only one Store + { + auto parent = get_input_node_shared_ptr(0); + auto idx = input(0).get_source_output().get_index(); + auto loop = std::dynamic_pointer_cast(parent); + while (loop) { + parent = loop->get_input_node_shared_ptr(idx); + idx = input(idx).get_source_output().get_index(); + loop = std::dynamic_pointer_cast(parent); + } + if (auto store = std::dynamic_pointer_cast(parent)) { + store->set_offset(m_offset); + } else if (const auto brgemm = std::dynamic_pointer_cast(parent)) { // Brgemm encapsulates work with loading and storing + brgemm->set_offset_c(m_offset); + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation"); + } + } + + // Propagate to down: in Load. Buffer can have several Load and Loops after himself. 
We should go through all target inputs + { + std::function&)> propagate_down; + propagate_down = [&](const Input& target_input) { + const auto child = target_input.get_node()->shared_from_this(); + if (std::dynamic_pointer_cast(child)) { + const auto index = target_input.get_index(); + for (const auto loop_target_output : child->output(index).get_target_inputs()) { + propagate_down(loop_target_output); + } + } else if (const auto load = std::dynamic_pointer_cast(child)) { + load->set_offset(m_offset); + } else if (const auto brgemm = std::dynamic_pointer_cast(child)) { // Brgemm encapsulates work with loading and storing + if (target_input.get_index() == 0) { + brgemm->set_offset_a(m_offset); + } else { + brgemm->set_offset_b(m_offset); + } + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset propagation"); + } + }; + + for (const auto target_output : output(0).get_target_inputs()) { + propagate_down(target_output); + } + } +} + +size_t ngraph::snippets::op::Buffer::get_byte_size() const { + const auto pshape = get_input_partial_shape(0); + NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); + const auto shape = pshape.get_shape(); + const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size()); + return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank) * get_element_type().size(); +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 00000000000000..a4be641f34e5a1 --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const size_t offset, const uint32_t fill_value) + : Op({x}), m_offset(offset), 
m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Fill_visit_attributes); + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Fill_validate_and_infer_types); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000000..003b662200df7b --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonMax::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonMax_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000000..6289e4efee2377 --- /dev/null +++ 
b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonSum_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + new_shape[new_shape.size() - 1] = 1lu; + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index d998afef73b101..f7637fbc7962a5 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -12,19 +12,19 @@ namespace ngraph { namespace snippets { namespace op { -Load::Load(const Output& x, const size_t count) : MemoryAccess({x}, count) { +Load::Load(const Output& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -LoadReshape::LoadReshape(const Output& x, const size_t count, std::vector order) - : Load(x, count), m_order(std::move(order)) { +LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) + : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); 
NGRAPH_CHECK(in_shape.is_static(), "LoadReshape supports only static input shapes"); const auto in_shape_size = in_shape.size(); @@ -45,7 +45,7 @@ void snippets::op::LoadReshape::validate_and_infer_types() { } bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { - visitor.on_attribute("count", m_count); + Load::visit_attributes(visitor); visitor.on_attribute("order", m_order); return true; } @@ -53,9 +53,9 @@ bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadReshape); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count, m_order); + return std::make_shared(new_args.at(0), m_count, m_offset, m_order); } }// namespace op }// namespace snippets -}// namespace ngraph \ No newline at end of file +}// namespace ngraph diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp index 79f6b63a4be691..0166713b5e1c08 100644 --- a/src/common/snippets/src/op/memory_access.cpp +++ b/src/common/snippets/src/op/memory_access.cpp @@ -12,8 +12,7 @@ namespace ngraph { namespace snippets { namespace op { -MemoryAccess::MemoryAccess(const Output& x, const size_t count) : Op({x}), m_count(count) { -} +MemoryAccess::MemoryAccess(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) {} bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { visitor.on_attribute("count", m_count); @@ -24,10 +23,18 @@ size_t MemoryAccess::get_count() const { return m_count; } +size_t MemoryAccess::get_offset() const { + return m_offset; +} + void MemoryAccess::set_count(const size_t count) { m_count = count; } +void MemoryAccess::set_offset(const size_t offset) { + m_offset = offset; +} + void MemoryAccess::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), 
get_input_partial_shape(0)); } diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index 69e1e1643b769b..90750de6b65fec 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -12,16 +12,15 @@ namespace ngraph { namespace snippets { namespace op { -Store::Store(const Output& x, const size_t count) : MemoryAccess({x}, count) { +snippets::op::Store::Store(const Output& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } - -std::shared_ptr Store::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Store); +std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Store_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } } // namespace op } // namespace snippets -} // namespace ngraph \ No newline at end of file +} // namespace ngraph diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 933e05b89fca7c..82f73836b6c2c5 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -20,6 +20,10 @@ #include "snippets/pass/align_element_type.hpp" #include "snippets/pass/matmul_to_brgemm.hpp" #include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/set_buffer_offset.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_buffer.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -41,8 +45,12 @@ void snippets::op::Subgraph::set_generator(std::shared_ptr(op); - config.m_need_fill_tail_register = config.m_need_fill_tail_register || - ov::is_type(op) || - ov::is_type(op); 
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || @@ -60,11 +65,13 @@ void snippets::op::Subgraph::init_config() { has_type_relaxed_ops() || snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops || - ov::is_type(op) || - ov::is_type(op) || - ov::is_type(op) || - ov::is_type(op); + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); } + // Domain sensitive ops are decomposed with explicit Loops. So, we should explicitly insert Loops in Subgraph if it contains these ops + config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops; } snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) @@ -184,9 +191,17 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrget_friendly_name(), body_results, body_parameters); auto subgraph = build_subgraph(node, subgraph_inputs, body); + bool need_buffer = false; + size_t hidden_data_count = 0lu; if (auto fq_node = ov::as_type_ptr(node)) { - subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node)); + hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->buffer_needed(need_buffer); for (size_t i = 0; i < body->get_parameters().size(); i++) { body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); @@ -329,6 +344,17 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& return master_shape; } +size_t snippets::op::Subgraph::get_buffer_scratchpad_size() const { + size_t buffer_size = 0; + const auto ops = m_body->get_ops(); + for (const 
auto& op : ops) { + if (const auto buffer = ov::as_type_ptr(op)) { + buffer_size += buffer->get_byte_size(); + } + } + return buffer_size; +} + void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { // We should insert Convert before Results to set original output element type if needed @@ -384,13 +410,16 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { return p->get_partial_shape().rbegin()->is_dynamic(); }); ngraph::pass::Manager manager; - manager.register_pass(); - manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(tileRank); + manager.register_pass(count, tileRank); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(count); manager.register_pass(count); + manager.register_pass(); // todo: presently dynamic pipeline is activated even if the last two dimension are static // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) // should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required @@ -422,12 +451,13 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { manager.get_pass_config()-> set_callback(skip_matching_domain); } - // todo: get_lanes() assumes fp32. Could there be any int8 issues? 
// Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if // automatic validation will be disabled in the pass manager - if (!has_domain_sensitive_ops()) - manager.register_pass(master_shape, tileRank, - m_generator->get_target_machine()->get_lanes()); + manager.register_pass(master_shape, tileRank, + m_generator->get_target_machine()->get_lanes(), !config.m_explicit_loop_insertion); + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + } } manager.run_passes(m_body); } @@ -462,8 +492,13 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, opt.run_passes(m_body); snippets::pass::AssignRegisters().run_on_model(m_body); + const auto ops = m_body->get_ops(); ngraph::snippets::Generator::GeneratorConfig generatorConfig; generatorConfig.m_save_lowered_code = config.m_has_domain_sensitive_ops; + generatorConfig.m_need_fill_tail_register = config.m_has_domain_sensitive_ops; + generatorConfig.m_one_evaluation_optimizations = std::none_of(ops.begin(), ops.end(), [](const std::shared_ptr& op) { + return ov::is_type(op); + }); // actual code emission ngraph::snippets::code ptr = m_generator->generate(m_body, generatorConfig, compile_params); diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000000..1be69a6d9ad678 --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + 
INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(m_element_type); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types); + set_output_type(0, m_element_type, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/align_element_type.cpp b/src/common/snippets/src/pass/align_element_type.cpp index f2cf4ce5c47de6..fa45c0b5754eba 100644 --- a/src/common/snippets/src/pass/align_element_type.cpp +++ b/src/common/snippets/src/pass/align_element_type.cpp @@ -20,7 +20,8 @@ inline auto is_in_op(const std::shared_ptr& n) -> bool { || ov::is_type(n); } -// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert) +// At the moment Subgraph supports only Eltwise, Convert, FQ (which is decomposed into Eltwises and Convert) and +// Softmax (which is decomposed into Eltwises as well) // And only Eltwises supports execution only in "exec_type". 
So we can check op type from the opposite // NOTE: This check is only for executable which isn't Parameter/Constant/Result inline auto op_supports_only_exec_type(const std::shared_ptr& n) -> bool { diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index dd40f6640a3a10..04cbadf5a608cd 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -7,6 +7,10 @@ #include "snippets/snippets_isa.hpp" #include +namespace { +static constexpr size_t reg_count = 16lu; +} // namespace + bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") @@ -24,7 +28,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) return gpr2gpr; else if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)) @@ -41,22 +46,57 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr size_t counter_gpr = 0; std::map regs_vec, regs_gpr; // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually - // todo: presently it hold only gpr's. 
If you need to manually assign vec's, implement reg_type or create a second map - std::map manually_assigned_regs; + std::map manually_assigned_gprs, manually_assigned_vecs; const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; const auto num_parameters = f->get_parameters().size(); + const auto num_results = f->get_results().size(); + auto accumulator_reg = 0lu; for (const auto& op : ops) { if (const auto& param = ov::as_type_ptr(op)) { - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_parameter_index(param)); } else if (const auto& result = ov::as_type_ptr(op)) { // here we use the fact that Result input & output tensors are identical by construction - manually_assigned_regs[op->output(0).get_tensor_ptr()] = + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = static_cast(f->get_result_index(result) + num_parameters); + } else if (const auto& buffer = ov::as_type_ptr(op)) { + // All buffers have one common data pointer + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + } else if (ov::is_type(op) || ov::is_type(op)) { + // Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer. 
+ // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator + // TODO [96351]: We should rewrite accumulator pattern using another way + const auto input = op->get_input_node_shared_ptr(0); // input - it's accumulator math op: Add or Max + for (size_t i = 0; i < input->get_input_size(); ++i) { + if (ov::is_type(input->get_input_node_shared_ptr(i))) { + manually_assigned_vecs[input->input(i).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + + manually_assigned_vecs[input->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + manually_assigned_vecs[op->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + + // If there is Broadcast, it should have the same register as Horizon op + // because it's a result of the accumulator as well + for (auto& out : op->output(0).get_target_inputs()) { + const auto child = out.get_node()->shared_from_this(); + if (ov::is_type(child)) { + manually_assigned_vecs[child->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + accumulator_reg++; } } - auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG, &manually_assigned_regs] (const std::shared_ptr& op, + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, decltype(regs_vec)& reg_map, + const std::map& manually_assigned_regs, size_t& counter) { for (const auto& output : op->outputs()) { const auto& t = output.get_tensor_ptr(); @@ -71,11 +111,11 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr switch (t_op.first) { case vec2vec: case gpr2vec: - enumerate_out_tensors(t_op.second, regs_vec, counter_vec); + enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec); break; case gpr2gpr: case vec2gpr: - enumerate_out_tensors(t_op.second, regs_gpr, counter_gpr); + enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr); break; } } @@ -96,7 +136,7 @@ bool 
ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return result; }; - for (int i = 0; i < typed_ops.size(); i++) { + for (size_t i = 0; i < typed_ops.size(); i++) { const auto& t_op = typed_ops[i]; std::vector used_tensors, defined_tensors; for (const auto& in : t_op.second->inputs()) @@ -239,15 +279,18 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr }; // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. overloaded generator std::set vec_pool; - for (Reg i = 0; i < 16; i++) + for (Reg i = 0; i < reg_count; i++) vec_pool.insert(i); - auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); - std::set gpr_pool(std::move(vec_pool)); - for (const auto& t_reg : manually_assigned_regs) + std::set gpr_pool(vec_pool); + for (const auto& t_reg : manually_assigned_vecs) + vec_pool.erase(t_reg.second); + for (const auto& t_reg : manually_assigned_gprs) gpr_pool.erase(t_reg.second); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); - std::map assigned_regs(std::move(manually_assigned_regs)); + std::map assigned_regs(std::move(manually_assigned_gprs)); + assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, const std::map& unique2reused) { for (const auto& reg : unique_regs) { diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 4501eb0797467d..5924729efec99f 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -122,8 +122,28 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return 
is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n) || - is_supported_transpose(n) || is_supported_fq_op(n) || is_supported_matmul(n); + + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic()) + return false; + int64_t axis = -1; + const auto rank = n->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(n)) { + axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(n)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + return axis >= 0 && axis == (rank.get_length() - 1); + }; + + return is_supported_fq_op(n) + || is_supported_unary_eltwise_op(n) + || is_supported_binary_eltwise_op(n) + || is_supported_transpose(n) + || is_supported_softmax(n) + || is_supported_matmul(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { @@ -503,18 +523,24 @@ TokenizeSnippets::TokenizeSnippets() { // than the actual number of Constants during tokenization. // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) // we should calculate potentional number of non-scalar Constants that will be moved up from body. 
- size_t hidden_non_scalar_constant_count = 0; + size_t hidden_data_count = 0; + bool need_buffer = false; if (const auto fq_node = ov::as_type_ptr(node)) { - hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops require a Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } ResultVector body_results; std::vector>> subgraph_result_inputs; for (auto subgraph : input_subgraphs) { - // we should summurize non-scalar Constants count from all input subgraphs - // because we will collapse them with our node and we should get total count of non-scalar Constants - hidden_non_scalar_constant_count += ov::as_type_ptr(subgraph)->get_non_scalar_constants_count(); + // we should summarize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs + // because we will collapse them with our node and we should get total count + hidden_data_count += ov::as_type_ptr(subgraph)->get_virtual_port_count(); + need_buffer |= ov::as_type_ptr(subgraph)->is_buffer_needed(); for (auto output : subgraph->outputs()) { bool first_side_consumer = true; @@ -555,13 +581,13 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) { + if (body_parameters.size() + body_results.size() + hidden_data_count + static_cast(need_buffer) > 12) { const std::string message_reset = "new subgraph is created. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers."; return abort_with_strategy(message_reset, message_abort); } @@ -596,7 +622,8 @@ TokenizeSnippets::TokenizeSnippets() { act_body1->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } subgraph->get_rt_info()["originalLayersNames"] = fusedNames; - subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count); + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->buffer_needed(need_buffer); remark(1) << "Replacement (merge) done for: " << subgraph->get_friendly_name() diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index c81ec235bb7ea8..be78a136cc71ae 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -10,6 +10,7 @@ #include "transformations/utils/utils.hpp" #include "snippets/pass/fq_decomposition.hpp" +#include "snippets/pass/softmax_reshape_elimination.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/itt.hpp" @@ -33,6 +34,10 @@ void ConvertConstantsToParameters(const std::shared_ptrget_shape()) != 1ul)) continue; + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + 
if (ov::is_type(child) || ov::is_type(child)) + continue; + auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); parameter->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, parameter); @@ -69,6 +74,7 @@ CommonOptimizations::CommonOptimizations() { if (is_quantized) { manager.register_pass(); } + manager.register_pass(); manager.run_passes(body); // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp new file mode 100644 index 00000000000000..3cc7ca90921464 --- /dev/null +++ b/src/common/snippets/src/pass/insert_buffer.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include + +ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank) { + MATCHER_SCOPE(InsertBuffer); + // The list of operations that require Buffers on their Inputs and Outputs + const auto pattern = ngraph::pattern::wrap_type(); + + register_matcher(std::make_shared(pattern, matcher_name), + [this, allocation_rank](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertBuffer") + auto root = m.get_match_root(); + bool rewritten = false; + + // check if already has Buffer, Parameter or Constant as an input + for (const auto& input : root->inputs()) { + const auto input_node = input.get_source_output().get_node()->shared_from_this(); + if (!ov::is_type(input_node) && + !ov::is_type(input_node) && + !ov::is_type(input_node)) { + const auto buffer = std::make_shared(input_node, allocation_rank); + ngraph::copy_runtime_info(root, buffer); + root->set_argument(input.get_index(), buffer); + 
rewritten |= true; + } + if (ov::is_type(input.get_source_output().get_node_shared_ptr()) && + input.get_source_output().get_target_inputs().size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + + // check if already has Buffer or outputs is Result + for (const auto& output : root->outputs()) { + const auto target_inputs = output.get_target_inputs(); + if (target_inputs.size() > 1) { + for (const auto& consumer : target_inputs) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (ov::is_type(output_node)) { + // If some of children from one common port are different Buffers, + // we should remove them to insert one common Buffer on one common port + replace_output_update_name(output_node->output(0), output_node->input_value(0)); + } else if (ov::is_type(output_node)) { + // TODO: At the moment, an operation which should be wrapped by Buffers doesn't support several children where one of them is Result + // because Result and Buffer from one root port should have the same register. 
It's not supported at the moment + // For example, + // Buffer + // | + // Softmax + // / \ + // Buffer Result + throw ngraph::ngraph_error( + "Operation which is should be wrapped by Buffers has few children from one output port where one of them is Result"); + } + } + } + + const auto buffer = std::make_shared(output, allocation_rank); + for (const auto& consumer : output.get_target_inputs()) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (output_node != buffer && + !ov::is_type(output_node) && + !ov::is_type(output_node)) { + consumer.replace_source_output(buffer); + rewritten |= true; + } + } + + const auto new_target_inputs = output.get_target_inputs(); + const auto has_buffer_on_output = std::any_of(new_target_inputs.begin(), new_target_inputs.end(), [](const ov::Input& consumer) { + const auto child = consumer.get_node()->shared_from_this(); + // We check for count of target inputs of Buffer output because + // we created Buffer op with root input previously for the next possible insertions + // Thus, if Buffer wasn't inserted, this op doesn't have target inputs on output + return ov::is_type(child) && child->output(0).get_target_inputs().size() > 0; + }); + if (has_buffer_on_output && new_target_inputs.size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index d22d094fdd207c..efa0d6396c63fd 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -15,7 +15,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { MATCHER_SCOPE(InsertLoad); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher 
&m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); @@ -57,7 +57,7 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { MATCHER_SCOPE(InsertStore); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); @@ -73,7 +73,7 @@ ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { } } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index bbe93ab95413d6..7d513239765f87 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -5,26 +5,220 @@ #include #include "snippets/pass/insert_loops.hpp" #include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include -ngraph::snippets::pass::InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size) -: m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size) { +namespace ngraph { +namespace snippets { +namespace pass { + +InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized) + : m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size), m_is_optimized(is_optimized) { if (m_master_shape.size() < m_loop_depth) throw ngraph_error("InsertLoops can't insert loops: master shape 
rank is too small"); } -bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr &model) { +std::vector InsertLoops::calculate_inner_apply_increments(const ov::PartialShape& master, + const std::vector& shapes) { + // Inner Loop applies increments if a dimension is not broadcasted + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_inner_dim(ps) != 1 && utils::get_inner_dim(master) != 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_outer_apply_increments(const std::vector& shapes) { + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_outer_dim(ps) != 1 && utils::get_inner_dim(ps) == 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_finalization_offsets(const ov::PartialShape& master, + const std::vector& shapes) { + const auto inner_work_amount = utils::get_inner_dim(master).get_length(); + std::vector inner_finalization_offsets(shapes.size(), 0); + std::transform(shapes.begin(), shapes.end(), inner_finalization_offsets.begin(), + [=](const ov::PartialShape& ps) { + return utils::get_outer_dim(ps) == 1 && utils::get_inner_dim(ps) != 1 ? 
-inner_work_amount : 0; + }); + return inner_finalization_offsets; +} + +void insert_explicitly_loops(const ov::NodeVector& ops, const ov::PartialShape& master_shape, const size_t vector_size) { + ov::NodeVector body; + ov::NodeVector body_remainder; + ov::OutputVector body_parameters; + std::vector> body_results; + + // check for potential parameters for new Loop + auto add_body_parameters = [](const std::shared_ptr& op, ov::OutputVector& body_parameters) { + for (auto input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + if (ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent)) { + body_parameters.push_back(input.get_source_output()); + } + } + }; + + // check for potential results for new Loop + auto add_body_results = [](const std::shared_ptr& op, std::vector>& body_results) { + for (auto output : op->outputs()) { + for (auto target_input : output.get_target_inputs()) { + auto child = target_input.get_node()->shared_from_this(); + if (ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child)) { + body_results.push_back(target_input); + } + } + } + }; + + // check for potential missing body ops for new loop + std::function& op, ov::NodeVector& body)> add_missing_body_ops; + add_missing_body_ops = [&](const std::shared_ptr& op, ov::NodeVector& body) { + if (body_remainder.size()) { + for (auto input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + auto iter = std::find(body_remainder.begin(), body_remainder.end(), parent); + if (iter != body_remainder.end()) { + *std::back_inserter(body) = std::move(*iter); + add_missing_body_ops(parent, body); + add_body_parameters(parent, body_parameters); + add_body_results(op, body_results); + } + } + } + }; + + auto wrapBodyByLoop = [&](const ov::NodeVector& body, const ov::OutputVector& body_parameters, const std::vector>& body_results) { + NGRAPH_CHECK(body_parameters.size() > 
0, "The count of parameters for loop should be more than zero to create loop"); + NGRAPH_CHECK(body_results.size() > 0, "The count of results for loop should be more than zero to create loop"); + std::vector body_shapes; + const auto count_io = body_parameters.size() + body_results.size(); + body_shapes.reserve(count_io); + std::transform(body_parameters.begin(), body_parameters.end(), std::back_inserter(body_shapes), + [](const ov::Output& out) { return out.get_partial_shape(); }); + std::transform(body_results.begin(), body_results.end(), std::back_inserter(body_shapes), + [](const ov::Input& in) { return in.get_partial_shape(); }); + + auto body_master_shape = body_shapes.front(); + for (const auto& shape : body_shapes) + PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY); + const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length(); + const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length(); + + auto apply_increments = InsertLoops::calculate_inner_apply_increments(master_shape, body_shapes); + std::vector inner_finalization_offsets(body_shapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(master_shape, body_shapes); + } + + const auto& inner_loop_begin = op::insertLoopBeginAfterOutputs(body_parameters); + const auto& inner_loop_end = op::insertLoopEndBeforeInputs( + body_results, inner_loop_begin, inner_work_amount, vector_size, + apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). 
Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : body) { + if (auto c = std::dynamic_pointer_cast(n)) { + c->add_control_dependency(inner_loop_begin); + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(body_shapes); + std::vector outer_finalization_offsets(body_shapes.size(), 0); + const auto& outer_loop_begin = op::insertLoopBegin(body_parameters); + op::insertLoopEnd(body_results, outer_loop_begin, outer_work_amount, 1lu, + apply_increments, outer_finalization_offsets); + } + }; + + auto op_is_outside_loop = [](const std::shared_ptr& op) -> bool { + if (ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op)) + return true; + auto& rt = op->get_rt_info(); + auto outside_rt = rt.find("outside_loop"); + bool is_outside = false; + // If rt info isn't setted it means that op should be inside loop by default + if (outside_rt != rt.end()) { + is_outside = outside_rt->second.as(); + } + return is_outside; + }; + + for (auto iter = ops.begin(); iter < ops.end(); iter++) { + const auto op = *iter; + // Need to check for that op should be inside or outside loop + if (op_is_outside_loop(op)) { + continue; + } + + // If we meet loopBegin or Brgemm, it means that all previous nodes from ordered body + // should be in one body. 
It's like stop signal + const auto& loop_begin = ov::as_type_ptr(op); + const auto& brgemm = ov::as_type_ptr(op); + if (loop_begin || brgemm) { + if (body.size() > 0) { + if (body_results.size() > 0) { + wrapBodyByLoop(body, body_parameters, body_results); + } else { + // If there aren't body results, it means that the current body ops are inputs of the next some operations in ordered_ops + // So this set of the current body ops is part of the future body loop. + // We should save them to add in body ops in the future + std::move(body.begin(), body.end(), std::back_inserter(body_remainder)); + } + } + + // we should skip the next existing Loop body + if (loop_begin) { + const auto &loop_end = loop_begin->get_loop_end(); + iter = std::find(iter, ops.end(), loop_end); + } + + // clear loop body to create the next + body.clear(); + body_parameters.clear(); + body_results.clear(); + } else { + add_missing_body_ops(op, body); + add_body_parameters(op, body_parameters); + add_body_results(op, body_results); + + body.push_back(op); + } + } + + if (body.size() > 0) { + wrapBodyByLoop(body, body_parameters, body_results); + } +} + +bool InsertLoops::run_on_model(const std::shared_ptr &model) { RUN_ON_FUNCTION_SCOPE(InsertLoops); if (m_master_shape.is_dynamic()) throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); - const auto inner_dim = m_master_shape.size() - 1; - // Note: outer_dim will not be used if m_master_shape.size() < 2 - const auto outer_dim = m_loop_depth == 2 ? m_master_shape.size() - 2 : -1; - const auto inner_work_amount = m_master_shape[inner_dim].get_length(); - const auto outer_work_amount = m_loop_depth == 2 ? m_master_shape[outer_dim].get_length() : 1; + const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length(); + const auto outer_work_amount = m_loop_depth == 2 ? 
utils::get_outer_dim(m_master_shape).get_length() : 1; + auto ops = model->get_ordered_ops(); ParameterVector commonParams = model->get_parameters(); // Note that topological sort parses node arguments in reversed order, but results are added - in direct order // So ve need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter @@ -48,50 +242,42 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr 0) { - std::vector apply_increments; - apply_increments.reserve(ioShapes.size()); - // Inner Loop applies increments if a dimension is not broadcasted - std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), - [=](const PartialShape& ps) { - return ps[inner_dim] != 1 && m_master_shape[inner_dim] != 1; - }); - std::vector inner_finalization_offsets(ioShapes.size(), 0); - if (outer_work_amount > 1) { - // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not - std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(), - [=](const PartialShape& ps) { - return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_work_amount : 0; - }); - } - const auto& inner_loop_begin = op::insertLoopBegin(commonParams); - const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, - m_vector_size, apply_increments, inner_finalization_offsets); - // set internal flag to enable scalar vs vector loop optimizations - inner_loop_end->has_outer_loop = outer_work_amount > 1; - // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in - // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called - // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg - // assigned to scalar will get corrupted inside the loop body). 
To avoid such cases, we add control dependency - // on LoopBegin to guarantee that the constants are executed inside the Loop. - for (const auto& n : model->get_ordered_ops()) { - if (auto c = std::dynamic_pointer_cast(n)) - c->add_control_dependency(inner_loop_begin); - else if (n == inner_loop_begin) - break; - } - } + if (m_is_optimized) { + const auto apply_increments = InsertLoops::calculate_inner_apply_increments(m_master_shape, ioShapes); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(m_master_shape, ioShapes); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, + m_vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. 
+ for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } - if (outer_work_amount > 1) { - std::vector apply_increments; - apply_increments.reserve(ioShapes.size()); - // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) - std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), - [=](const PartialShape& ps) { - return ps[outer_dim] != 1 && ps[inner_dim] == 1; - }); - const auto& outer_loop_begin = op::insertLoopBegin(commonParams); - insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1, apply_increments); + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(ioShapes); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + op::insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1lu, apply_increments); + } + } else { + insert_explicitly_loops(ops, m_master_shape, m_vector_size); + } } return true; -} \ No newline at end of file +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 499be69e67f062..f42bc06844262d 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -7,6 +7,7 @@ #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include #include @@ -30,6 +31,10 @@ std::shared_ptr broadcast_node_last_dim(const ngraph::Output(broadcasted_node, broadcasted_shape); + // BroadcastMove should be immediately executed after its input op (input op is node with output which should be broadcasted). 
+ // For example, to execute Broadcast outside of a Loop We transfer control dependents and copy rt info + broadcasted_node->add_node_control_dependents(value.get_node_shared_ptr()); + ov::copy_runtime_info(value.get_node_shared_ptr(), broadcasted_node); } return broadcasted_node; @@ -64,23 +69,25 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ - if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { - if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { - return true; - } + auto is_ignored_node = [](const ov::Output& v){ + if (utils::is_scalar_constant(v.get_node_shared_ptr())) { + return true; + } else if (ov::is_type(v.get_node_shared_ptr())) { + // VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion. + // So we shouldn't insert BroadcastMove + return true; } return false; }; std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector is_ignored; for (const auto& val : values) { input_shapes.emplace_back(val.get_partial_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + is_ignored.push_back(is_ignored_node(val)); // Do not insert MoveBroadcast if any of the last dims is dynamic, // since we don't know if we really need it. In these cases, broadcasting will be performed // by outer Loop based on runtime shapes. 
- if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) return false; } @@ -89,7 +96,7 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index f3765e471971a2..7499b14f1989fb 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -28,9 +28,12 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro const auto param = pm.at(param_pattern).get_node_shared_ptr(); // Cannot rewrite Broadcast + Load if load has more than 1 user - // or more than one input, or if Broadcast has several inputs + // or more than one input, + // or if Broadcast has several inputs, + // or if Load has offset (TODO [96353]: It's CPU Plugin limitation) if (input->output(0).get_target_inputs().size() != 1 || - root->inputs().size() != 1 || input->inputs().size() != 1) { + root->inputs().size() != 1 || input->inputs().size() != 1 || + ov::as_type_ptr(input)->get_offset() > 0) { return false; } diff --git a/src/common/snippets/src/pass/reset_buffer.cpp b/src/common/snippets/src/pass/reset_buffer.cpp new file mode 100644 index 00000000000000..6e09f21e9a7019 --- /dev/null +++ b/src/common/snippets/src/pass/reset_buffer.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include 
"snippets/pass/reset_buffer.hpp" +#include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" + + +namespace { +void normalize_ptr_and_offsets(const ov::NodeVector &io, std::vector &ptr_increments, std::vector &finalization_offsets) { + bool there_is_buffer = false; + // Iterations are from end because before we correct finalization offsets for Loop outputs (io = inputs + outputs) + for (int i = static_cast(io.size()) - 1; i >= 0; --i) { + if (ov::is_type(io[i])) { + if (there_is_buffer) { + ptr_increments[i] = 0; + finalization_offsets[i] = 0; + } else { + there_is_buffer = true; + } + } + } +} +} // namespace + +int64_t ngraph::snippets::pass::ResetBufferState::calculate_required_finalization_offsets(const size_t back_step, const size_t target_work_amount) { + return target_work_amount != 1 ? -static_cast(back_step) : 0; +} + +ngraph::snippets::pass::ResetBufferState::ResetBufferState() { + MATCHER_SCOPE(ResetBufferState); + + auto m_loop_end = ngraph::pattern::wrap_type(); + auto m_buffer = ngraph::pattern::wrap_type({m_loop_end}); + + register_matcher(std::make_shared(m_buffer, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ResetBufferState") + auto& pattern_to_output = m.get_pattern_value_map(); + + const auto loop_end = ngraph::as_type_ptr(pattern_to_output.at(m_loop_end).get_node_shared_ptr()); + const auto loop_begin = loop_end->get_loop_begin(); + const auto parent_loop_end = ngraph::as_type_ptr(loop_end->input_value(0).get_node_shared_ptr()); + std::shared_ptr inner_loop_end = parent_loop_end ? parent_loop_end : loop_end; + std::shared_ptr outer_loop_end = parent_loop_end ? loop_end : nullptr; + std::shared_ptr inner_loop_begin = inner_loop_end->get_loop_begin(); + std::shared_ptr outer_loop_begin = outer_loop_end ? 
outer_loop_end->get_loop_begin() : nullptr; + const bool case_2d = outer_loop_end != nullptr; + + const auto i_size = loop_begin->get_input_size(); + const auto o_size = loop_end->get_output_size(); + + const auto count_io = i_size + o_size; + std::vector body_shapes(count_io); + ov::NodeVector io(count_io); + for (size_t i = 0; i < i_size; ++i) { + body_shapes[i] = loop_begin->input_value(i).get_partial_shape(); + io[i] = loop_begin->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_begin->input_value(i).get_index(); + while (std::dynamic_pointer_cast(io[i])) { + port_idx = io[i]->input_value(port_idx).get_index(); + io[i] = io[i]->input_value(port_idx).get_node_shared_ptr(); + } + } + for (size_t i = 0; i < o_size; ++i) { + body_shapes[i_size + i] = loop_end->output(i).get_partial_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + io[i_size + i] = loop_end->output(i).get_target_inputs().begin()->get_node()->shared_from_this(); + } + + const size_t inner_work_amount = inner_loop_end->get_work_amount(); + auto inner_ptr_increments = inner_loop_end->get_ptr_increments(); + auto inner_finalization_offsets = inner_loop_end->get_finalization_offsets(); + // We should reset Buffer ptr after data storing + // If there isn't outer_work_amount for buffer, we should reset this ptr for inner loop + // otherwise we should reset it for outer loop + if (!case_2d) { + for (size_t i = 0; i < o_size; ++i) { + const auto result_pshape = loop_end->output(i).get_partial_shape(); + if (ov::is_type(io[i_size + i])) { + inner_finalization_offsets[i_size + i] = + calculate_required_finalization_offsets(inner_work_amount, utils::get_inner_dim(result_pshape).get_length()); + } + } + } + // If there are several Buffers on I/O we should remember that all Buffer have the register, + // so we should update ptr for only one Buffer + normalize_ptr_and_offsets(io, inner_ptr_increments, 
inner_finalization_offsets); + inner_loop_end->set_finalization_offsets(inner_finalization_offsets); + inner_loop_end->set_ptr_increments(inner_ptr_increments); + + if (case_2d) { + auto outer_ptr_increments = outer_loop_end->get_ptr_increments(); + auto outer_finalization_offsets = outer_loop_end->get_finalization_offsets(); + for (size_t i = 0; i < o_size; ++i) { + const auto result_pshape = loop_end->output(i).get_partial_shape(); + if (ov::is_type(io[i_size + i])) { + outer_finalization_offsets[i_size + i] = + calculate_required_finalization_offsets( + utils::get_outer_dim(result_pshape).get_length() * utils::get_inner_dim(result_pshape).get_length(), + utils::get_outer_dim(result_pshape).get_length()); + } + } + normalize_ptr_and_offsets(io, outer_ptr_increments, outer_finalization_offsets); + outer_loop_end->set_finalization_offsets(outer_finalization_offsets); + outer_loop_end->set_ptr_increments(outer_ptr_increments); + } + + return true; + }); +} diff --git a/src/common/snippets/src/pass/set_buffer_offset.cpp b/src/common/snippets/src/pass/set_buffer_offset.cpp new file mode 100644 index 00000000000000..4f4627231b78ff --- /dev/null +++ b/src/common/snippets/src/pass/set_buffer_offset.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/set_buffer_offset.hpp" +#include "snippets/op/subgraph.hpp" + + +ngraph::snippets::pass::SetBufferOffset::SetBufferOffset() { + MATCHER_SCOPE(SetBufferOffset); + register_matcher(std::make_shared( + ngraph::pattern::wrap_type(), matcher_name), + [&](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetBufferOffset") + auto root = m.get_match_root(); + const auto buffer = ov::as_type_ptr(root); + buffer->set_offset(current_offset); + current_offset += ngraph::shape_size(buffer->get_shape()) * 
buffer->get_element_type().size(); + return true; + }); +} diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..fb5ece8c363b1c --- /dev/null +++ b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include +#include + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank) { + MATCHER_SCOPE(SoftmaxDecomposition); + register_matcher(std::make_shared( + ngraph::pattern::wrap_type(), matcher_name), + [this, vector_size, buffer_allocation_rank](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto root = m.get_match_root(); + const auto master_pshape = root->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (rank.is_dynamic() || master_pshape.is_dynamic()) + return false; + + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + const auto shape_rank = static_cast(rank.get_length()); + if (axis != shape_rank - 1) + return false; + + const auto data = root->get_input_node_shared_ptr(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = shape_rank - 1; + const auto work_amount = 
master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = shape_rank - 1; + const auto inner_master_work_amount = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? static_cast(shape_rank - 2) : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + auto apply_increments_max = + InsertLoops::calculate_inner_apply_increments(master_shape, {data->get_shape(), data->get_shape(), data->get_shape()}); + // Input of softmax is Input and Output of this loop, which isn't used inside (it's just to have one output in Loop at least) + // So we shouldn't increment pointer after each loop iteration + apply_increments_max[1] = false; + apply_increments_max[2] = false; + // we should always reset data ptr after this loop because in the next Loop this ptr is used + const auto finalization_offsets_max = + std::vector{ ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]), 0, 0 }; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + const auto horizon_max = std::make_shared(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto 
sub = std::make_shared(load_sub, horizon_max); + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + auto apply_increments_sum = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + std::vector finalization_offsets_sum(2, 0); + if (has_outer_loop) { + finalization_offsets_sum = + InsertLoops::calculate_finalization_offsets(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + } + // we should always reset buffer ptr after loop because in the next Loop this buffer ptr is used + finalization_offsets_sum[1] = ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]); + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + + const auto horizon_sum = std::make_shared(sum); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0), buffer_allocation_rank); + + /* =========================================== */ + + /* ================== Div ==================== */ + + // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop + const auto pow = std::make_shared(horizon_sum, + ngraph::op::Constant::create(ov::element::f32, ngraph::Shape{}, {-1})); + + const auto loop_div_begin = op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + auto apply_increments_div = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_div->get_shape(), store_div->get_shape()}); + std::vector finalization_offsets_div(2, 0); + if (has_outer_loop) { + 
finalization_offsets_div = + InsertLoops::calculate_finalization_offsets(master_shape, {load_div->get_shape(), store_div->get_shape()}); + } + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(loop_max_end); + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(loop_sum_end); + loop_div_begin->add_control_dependency(pow); + + /* =========================================== */ + + /* ============= Runtime Info ================ */ + + // For tail loop we should fill input of Max by float min and + // input of Sum by zero to avoid math incorrect calculations + max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff); + sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000); + + // These nodes should be executed outside loops + ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp }; + for (const auto& op : ops_outside_loop) { + op->get_rt_info()["outside_loop"] = true; + } + + ngraph::copy_runtime_info(root, + {vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end, + vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, pow, + loop_div_begin, load_div, mul, store_div, loop_div_end}); + + /* =========================================== */ + + ngraph::replace_node(root, loop_div_end); + + /* ============== Outer loop ================= */ + if (has_outer_loop) { + std::vector apply_increments = + 
InsertLoops::calculate_outer_apply_increments({root->get_input_shape(0), root->get_output_shape(0)}); + const auto softmax_parameters = + std::vector>{loop_max_begin->input(0).get_source_output()}; + const auto output_set = loop_div_end->output(0).get_target_inputs(); + const auto softmax_results = std::vector>{output_set.begin(), output_set.end()}; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(softmax_parameters); + const auto outer_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs( + softmax_results, outer_loop_begin, master_shape[outer_dim], 1, apply_increments); + + vector_buffer_max->add_control_dependency(outer_loop_begin); + + ngraph::copy_runtime_info(root, {outer_loop_begin, outer_loop_end}); + } + /* =========================================== */ + + return true; + }); +} diff --git a/src/common/snippets/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000000..f770f4e80668cd --- /dev/null +++ b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include + +ngraph::snippets::pass::SoftmaxReshapeElimination::SoftmaxReshapeElimination() { + MATCHER_SCOPE(SoftmaxReshapeElimination); + const auto m_reshape0 = pattern::wrap_type(pattern::has_static_shape()); + const auto m_softmax = pattern::wrap_type({m_reshape0}); + const auto m_reshape1 = pattern::wrap_type({m_softmax, pattern::wrap_type()}); + + register_matcher(std::make_shared(m_reshape1, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxReshapeElimination") + auto& pattern_to_output = m.get_pattern_value_map(); + 
auto reshape0 = pattern_to_output[m_reshape0].get_node_shared_ptr(); + auto softmax = pattern_to_output[m_softmax].get_node_shared_ptr(); + auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr(); + + const auto input_shape = reshape0->get_input_partial_shape(0); + const auto output_shape = reshape1->get_output_partial_shape(0); + if (input_shape.is_dynamic() || output_shape.is_dynamic() || input_shape.get_shape() != output_shape.get_shape()) + return false; + + const auto softmax_rank = softmax->get_input_partial_shape(0).rank(); + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + axis = ngraph::normalize_axis(softmax->get_friendly_name(), softmax_v8->get_axis(), softmax_rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + // Supports only last axis + if (axis != softmax_rank.get_length() - 1) + return false; + + // Dimensions by reduction axis should be equal + if (input_shape.get_shape().back() != softmax->get_input_shape(0).back()) + return false; + + // Eliminate Reshape before Softmax + reshape0->output(0).replace(reshape0->input_value(0)); + copy_runtime_info({reshape0->input_value(0).get_node_shared_ptr(), reshape0->output(0).get_node_shared_ptr()}, + reshape0->input_value(0).get_node_shared_ptr()); + + // Eliminate Reshape after Softmax with name saving + replace_output_update_name(reshape1->output(0), reshape1->input_value(0)); + + // update axis + const auto new_axis = input_shape.rank().get_length() - 1; + if (auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + softmax_v8->set_axis(new_axis); + } else if (auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + softmax_v1->set_axis(new_axis); + } + + return true; + }); +} diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index db9b00bf5b8f2a..5dc6960b2fd71a 100644 --- 
a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,7 +60,7 @@ ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { auto loop_C_begin = std::make_shared(OutputVector{loop_W_begin->output(0)}); // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. - auto load = std::make_shared(loop_C_begin->output(0), 1, access_pattern); + auto load = std::make_shared(loop_C_begin->output(0), 1, 0, access_pattern); auto store = std::make_shared(load, 1); const std::vector ptr_increments_C {size_H * size_W, 1}; const std::vector finalization_offsets_C {1 - size_H * size_W * size_C, 0}; diff --git a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp index 20c2fa1b272958..8b886ef9876b06 100644 --- a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp +++ b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp @@ -16,6 +16,7 @@ namespace snippets { typedef std::tuple< std::vector, // Input shapes + PartialShape, // Master shape size_t // Transpose position > fuseTransposeBrgemmParams; diff --git a/src/common/snippets/tests/include/pass/softmax_decomposition.hpp b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000000..3943bd641bf8bb --- /dev/null +++ b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + int // Axis +> SoftmaxParams; + +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + 
int // Axis +> AddSoftmaxParams; + +class SoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +class AddSoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index ef5b74a08b910d..110c2052bd8399 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -21,7 +21,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Add::get_type_info_static()] = dummy_functor; jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; - jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; + jitters[op::v1::Divide::get_type_info_static()] = dummy_functor; + jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor; + jitters[op::v0::Exp::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::PowerStatic::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; @@ -33,6 +38,9 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = 
dummy_functor; jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } void LoweringTests::SetUp() { diff --git a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp index a3f60e4656abc1..1962bb610db3a3 100644 --- a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp +++ b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp @@ -14,11 +14,13 @@ namespace snippets { std::string FuseTransposeBrgemmTests::getTestCaseName(testing::TestParamInfo obj) { std::vector input_shapes(2); + PartialShape master_shape; size_t transpose_position; - std::tie(input_shapes, transpose_position) = obj.param; + std::tie(input_shapes, master_shape, transpose_position) = obj.param; std::ostringstream result; result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "MS=" << CommonTestUtils::partialShape2str({master_shape}) << "_"; result << "Pos=" << transpose_position << "_"; return result.str(); } @@ -27,7 +29,7 @@ void FuseTransposeBrgemmTests::SetUp() { LoweringTests::SetUp(); std::vector input_shapes(2); size_t transpose_position; - std::tie(input_shapes, transpose_position) = this->GetParam(); + std::tie(input_shapes, master_shape, transpose_position) = this->GetParam(); snippets_function = std::make_shared(input_shapes, transpose_position); } @@ -41,9 +43,9 @@ TEST_P(FuseTransposeBrgemmTests, FuseTransposeMatmul) { namespace FuseTransposeBrgemmTestsInstantiation { using ov::Shape; std::vector test_params{ - {{{1, 49, 2, 23}, {2, 2, 23, 39}}, 0}, - {{{1, 2, 49, 23}, {2, 23, 1, 39}}, 1}, - {{{1, 2, 49, 23}, {2, 
2, 23, 39}}, 2}, + {{{1, 49, 2, 23}, {2, 2, 23, 39}}, {2, 2, 49, 23}, 0}, + {{{1, 2, 49, 23}, {2, 23, 1, 39}}, {2, 2, 49, 39}, 1}, + {{{1, 2, 49, 23}, {2, 2, 23, 39}}, {2, 2, 49, 39}, 2}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FuseTransposeMatMul, FuseTransposeBrgemmTests, diff --git a/src/common/snippets/tests/src/pass/softmax_decomposition.cpp b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000000..c016e1c8b2467a --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp @@ -0,0 +1,122 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass/softmax_decomposition.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_softmax.hpp" +#include "subgraph_lowered.hpp" + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/insert_load_store.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/set_buffer_offset.hpp" +#include "snippets/pass/convert_power_to_powerstatic.hpp" + + +namespace ov { +namespace test { +namespace snippets { + +std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape; + int axis; + std::tie(inputShape, axis) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void SoftmaxTests::SetUp() { + const size_t count = 10; + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape; + int axis; + std::tie(inputShape, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape}, axis); + master_shape = inputShape; +} + +std::string AddSoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + 
Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = obj.param; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShape0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShape1) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void AddSoftmaxTests::SetUp() { + const size_t count = 10; + manager.register_pass(); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(); + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape0, inputShape1}, axis); + + ov::PartialShape master_pshape(inputShape0); + ov::PartialShape::broadcast_merge_into(master_pshape, inputShape1, op::AutoBroadcastType::NUMPY); + master_shape = master_pshape.get_shape(); +} + +TEST_P(SoftmaxTests, SoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->get_body(); + function_ref = snippets_function->getLowered(); +} + +TEST_P(AddSoftmaxTests, AddSoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->get_body(); + function_ref = snippets_function->getLowered(); +} + +namespace SoftmaxTestsInstantiation { +std::vector inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1)), + 
SoftmaxTests::getTestCaseName); + +} // namespace SoftmaxTestsInstantiation + +namespace AddSoftmaxTestsInstantiation { +std::vector inputShape0{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; +std::vector inputShape1{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmaxDecomposition, AddSoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape0), + ::testing::ValuesIn(inputShape1), + ::testing::Values(-1)), + AddSoftmaxTests::getTestCaseName); + +} // namespace AddSoftmaxTestsInstantiation + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000000..3f2f731c781331 --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, SoftmaxV1ReshapeElimination) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{6, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, 1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{2, 3, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + f = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto softmax_v1 = 
std::make_shared(data, 2); + f_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, SoftmaxV8ReshapeElimination) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{680, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + f = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto softmax_v1 = std::make_shared(data, 3); + f_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, SoftmaxReshapeElimination_IncorrectReshape) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{2, 81600}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + f = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + { + auto data = std::make_shared(element::f32, 
Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{2, 81600}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + f_ref = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index 1438fc286ce4e4..13178aed44fcb9 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -46,6 +46,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(ZeroEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); @@ -123,6 +125,10 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported jitters[ngraph::op::v0::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = 
CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 327e6acd258438..f3fb02cef9a6c2 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -128,10 +128,13 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: } return layout; }; + const auto& ops = model->get_ordered_ops(); auto params = model->get_parameters(); auto results = model->get_results(); num_inputs = params.size(); num_outputs = results.size(); + is_buffer_needed = std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } ); NodeVector io_nodes; std::copy(params.begin(), params.end(), std::back_inserter(io_nodes)); std::copy(results.begin(), results.end(), std::back_inserter(io_nodes)); @@ -210,15 +213,15 @@ void KernelEmitter::validate_arguments(const std::vector &in, IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 0, got " << in.size(); if (!out.empty()) IE_THROW() << "KernelEmitter got invalid number of outputs. 
Expected 0, got " << out.size(); - const auto num_params = num_inputs + num_outputs; + const auto num_params = num_inputs + num_outputs + static_cast(is_buffer_needed); // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount if (data_ptr_regs_idx.size() != num_params) IE_THROW() << "KernelEmitter: number of inputs and outputs is inconsisnent with the number of allocated registers" << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter const size_t offset_rank = jcp.master_shape.size() - 1; //const size_t tile_rank = jcp.tile_rank; @@ -277,7 +280,13 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, return reg != reg_indexes_idx && reg != reg_const_params_idx; }); const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); - Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(static_cast(*spare_corruptable_gpr)); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); + // Vector "data_ptr_regs" is sorted by abstract regs. 
+ // It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] + // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_params], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad)]); + } size_t i = 0; for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) @@ -286,7 +295,7 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], reg_tmp); } - // a rare case when num_params is maximal, so we have no spare gprs + // a rare case when num_io is maximal, so we have no spare gprs // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since // it won't be used anymore // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to @@ -310,7 +319,7 @@ void KernelEmitter::emit_impl(const std::vector& in, std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; @@ -535,7 +544,9 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; 
store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -562,7 +573,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -574,7 +585,12 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = std::dynamic_pointer_cast(n)->get_count(); + const auto load = std::dynamic_pointer_cast(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load snippets op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -601,7 +617,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -651,7 +667,9 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, 
count)); } @@ -676,7 +694,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -685,7 +703,9 @@ void LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -715,7 +735,7 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { @@ -814,6 +834,10 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: } } } + + load_offset_a = brgemm_node->get_offset_a(); + load_offset_b = brgemm_node->get_offset_b(); + store_offset_c = brgemm_node->get_offset_c(); } void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const { @@ -854,8 +878,9 @@ void BrgemmEmitter::emit_impl(const std::vector& in, } template void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs, - Reg64 addr_A, Reg64 addr_B, - const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const { + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, 
Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const { using Vmm = typename dnnl::impl::utils::conditional3::type; size_t gpr_size = 8; Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, @@ -905,8 +930,15 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in // todo: Windows ABI : requires different num of arguments passed in regs and on the stack. Need to align. h->mov(abi_param1, reinterpret_cast(brgKernel)); h->mov(abi_param2, bs); - h->uni_vmovq(abi_param3, Xmm(0)); - h->uni_vmovq(abi_param4, Xmm(1)); + + const auto data_ptr = [&](Xmm xmm, Xbyak::Reg64 reg, size_t memory_bytes_offset, size_t kernel_bytes_offset) { + h->uni_vmovq(reg, xmm); + if (memory_bytes_offset) h->add(reg, memory_bytes_offset); + if (kernel_bytes_offset) h->add(reg, kernel_bytes_offset); + }; + data_ptr(Xmm(0), abi_param3, load_offset_a, in0_kernel_offset); + data_ptr(Xmm(1), abi_param4, load_offset_b, in1_kernel_offset); + size_t num_args_passed_on_stack = 1; #ifdef _WIN32 num_args_passed_on_stack = 3; @@ -915,14 +947,16 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, in h->mov(h->qword[h->rsp], reinterpret_cast(scratch)); h->mov(h->qword[h->rsp + gpr_size], reinterpret_cast(batch)); h->mov(h->qword[h->rsp + 2 * gpr_size], Xmm(2)); + if (store_offset_c) h->add(h->qword[h->rsp + 2 * gpr_size], store_offset_c); + if (out0_kernel_offset) h->add(h->qword[h->rsp + 2 * gpr_size], out0_kernel_offset); #else h->mov(abi_param5, reinterpret_cast(batch)); - h->uni_vmovq(abi_param6, Xmm(2)); + data_ptr(Xmm(2), abi_param6, store_offset_c, out0_kernel_offset); h->sub(h->rsp, gpr_size); h->mov(h->qword[h->rsp], reinterpret_cast(scratch)); #endif - // align stack on 16-byte as ABI requires - // note that RBX must not be changed by the callee + // align stack on 16-byte as ABI requires + // note that RBX must not be changed by the callee 
h->mov(h->rbx, h->rsp); h->and_(h->rbx, 0xf); h->sub(h->rsp, h->rbx); @@ -978,29 +1012,210 @@ void BrgemmEmitter::emit_isa(const std::vector &in, const std::vectoradd(input_0, in0_offset); - if (in1_offset != 0) - h->add(input_1, in1_offset); - if (out0_offset != 0) - h->add(output_0, out0_offset); + emit_brgemm_kernel_call(brgKernels0[getBrgIdx(mIdx, k, n)].get(), 1, input_0, input_1, nullptr, output_0, - nullptr); - if (in0_offset != 0) - h->sub(input_0, in0_offset); - if (in1_offset != 0) - h->sub(input_1, in1_offset); - if (out0_offset != 0) - h->sub(output_0, out0_offset); + nullptr, + in0_offset, + in1_offset, + out0_offset); } } } } } + +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + prepare_table(); +} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vmovups(dst_xmm, table_val("float_min")); + 
for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +void HorizonMaxEmitter::register_table_entries() { + push_arg_entry_of("float_min", 0xff7fffff, true); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { +} + + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +ZeroEmitter::ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, 
Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void ZeroEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void ZeroEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (fill->get_element_type().size() != 4) { + IE_THROW() << "Fill emitter supports only 4 Byte element types but gets: " << fill->get_element_type(); + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + // + 1 reg for mask on avx512 + return one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core) ? 
2 : 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val("value")); + } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val("value"), imm); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("value", fill_value, true); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index c559f2421f0235..7baee1f92789e7 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -33,6 +33,7 @@ namespace intel_cpu { 
struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void *buffer_scratchpad = nullptr; }; struct jit_snippets_compile_args { @@ -94,12 +95,13 @@ class KernelEmitter : public jit_container_emitter { const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; size_t num_inputs; size_t num_outputs; + bool is_buffer_needed; // Vector of indices (lenght = input tensor rank) per every input and output that describes in which order // corresponding tensor dimensions are accessed (default: consecutive dense, e.g. 0,1,2,3 for 4D tensor). // Needed to calc i/o offsets. @@ -254,6 +256,9 @@ class MemoryEmitter : public jit_emitter { protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -274,7 +279,6 @@ class StoreEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -296,7 +300,6 @@ class LoadEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -335,7 +338,6 @@ class LoadConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -357,7 +359,6 @@ class StoreConvertEmitter : public MemoryEmitter { void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -391,8 +392,9 @@ class BrgemmEmitter : public jit_emitter { size_t getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const; template void 
emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs, - Reg64 addr_A, Reg64 addr_B, - const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch) const; + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const; static constexpr size_t BRGEMM_KERNELS_NUM = 8; static constexpr size_t matmulOptimalM = 32; @@ -403,6 +405,96 @@ class BrgemmEmitter : public jit_emitter { size_t K, K_blk, K_tail; size_t N, N_blk, N_tail; size_t brg0VnniFactor; + + size_t load_offset_a = 0lu; + size_t load_offset_b = 0lu; + size_t store_offset_c = 0lu; +}; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; +}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector 
&in, const std::vector &out) const; +}; + +class ZeroEmitter : public jit_emitter { +public: + ZeroEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + uint32_t fill_value = 0x0; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index d2a8f5381c9174..34ddc06577ff58 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -345,6 +345,8 @@ void Snippet::createPrimitive() { jcp.master_shape = masterShape; jcp.tile_rank = tileRank; generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); } std::vector Snippet::shapeInfer() const { @@ -468,28 +470,6 @@ bool Snippet::needPrepareParams() const { return inputShapesModified() || !schedule.ptr; } -void Snippet::updateSrcDstPtrs(jit_snippets_call_args& call_args) 
const { - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - updateSrcDstPtrs(call_args); - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - bool Snippet::canBeInPlace() const { if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { return false; @@ -543,6 +523,47 @@ void Snippet::generate(const jit_snippets_compile_args* jcp) { schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } +void Snippet::updateSrcDstPtrs(jit_snippets_call_args& call_args) const { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; + } + jit_snippets_call_args call_args; + updateSrcDstPtrs(call_args); + + if (buffer_scratchpad_size > 0) { + schedule_with_buffer_scratchpad(call_args); + return; + } + + if (tensorRank == rank6D) { + schedule_6d(call_args); + } else { + schedule_nt(call_args); + } +} + +void Snippet::schedule_with_buffer_scratchpad(const jit_snippets_call_args& call_args) { + std::vector per_thread_call_args(parallel_get_max_threads(), call_args); + if (buffer_scratchpad_size > 0) { + for (size_t i = 0; i < per_thread_call_args.size(); ++i) + 
per_thread_call_args[i].buffer_scratchpad = reinterpret_cast(buffer_scratchpad.data()) + i * buffer_scratchpad_size; + } + + if (tensorRank == rank6D) { + schedule_6d_per_thread(per_thread_call_args); + } else { + schedule_nt_per_thread(per_thread_call_args); + } +} + void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { const auto& dom = exec_domain; // < N, C, H, W > < 1, 1, N, C*H*W> @@ -553,6 +574,16 @@ void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { }); } +void Snippet::schedule_6d_per_thread(const std::vector& call_args) const { + const auto& dom = exec_domain; + // < N, C, H, W > < 1, 1, N, C*H*W> + parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], + [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { + int64_t indexes[] = {d0, d1, d2, d3, d4}; + schedule.get_callable()(indexes, &call_args[parallel_get_thread_num()]); + }); +} + void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { @@ -572,6 +603,25 @@ void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { }); } +void Snippet::schedule_nt_per_thread(const std::vector& call_args) const { + const auto& work_size = exec_domain; + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(harnessWorkAmount, nthr, ithr, start, end); + + std::vector indexes(work_size.size() - 1, 0); + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = work_size.size() - 2; j >= 0; j--) { + indexes[j] = tmp % work_size[j]; + tmp /= work_size[j]; + } + + schedule.get_callable()(indexes.data(), &call_args[ithr]); + } + }); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9be60ddc8f4c98..8cb426c625f980 100644 --- 
a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -66,7 +66,10 @@ class Snippet : public Node { void updateSrcDstPtrs(jit_snippets_call_args&) const; // Evaluates generated snippet using parallel backend void schedule_6d(const jit_snippets_call_args& const_args) const; + void schedule_6d_per_thread(const std::vector& const_args) const; void schedule_nt(const jit_snippets_call_args& const_args) const; + void schedule_nt_per_thread(const std::vector& const_args) const; + void schedule_with_buffer_scratchpad(const jit_snippets_call_args& const_args); // Original subgraph node std::shared_ptr original_snippet; @@ -107,6 +110,10 @@ class Snippet : public Node { std::vector start_offset_in = {}; std::vector start_offset_out = {}; + + // Buffer scratchpad + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index 2db9fd9f010de8..021b3f6c1293ec 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index 731c0cb1e1b24a..675c214ed7ae2b 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return 
std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 572cbf00f521d4..1b1b8988c16784 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: 
OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index e58b5bc678d1f8..6a4180c54299c5 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { 
+intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d0c4a947433b7c..3697af21540915 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, 
const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000000..a35587aed7887d --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 16}, + ov::Shape{1, 32}, + ov::Shape{1, 1}, + ov::Shape{1, 9}, + ov::Shape{1, 17}, + ov::Shape{1, 19}, + ov::Shape{1, 49}, + ov::Shape{1, 50}, + ov::Shape{5, 16}, + ov::Shape{5, 32}, + ov::Shape{5, 1}, + ov::Shape{5, 9}, + ov::Shape{5, 17}, + ov::Shape{5, 19}, + ov::Shape{5, 49}, + ov::Shape{5, 50}, + ov::Shape{1, 3, 128, 128}, + ov::Shape{1, 3, 128, 129}, + ov::Shape{1, 3, 128, 130}, + ov::Shape{1, 3, 128, 1}, + ov::Shape{1, 3, 128, 9}, + ov::Shape{1, 3, 128, 16}, + ov::Shape{1, 3, 128, 17}, + ov::Shape{1, 3, 128, 20}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + +const std::vector> inputShapesPair = { + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 1}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 1}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 35}}, + 
std::pair{ov::Shape{1, 5, 1, 35}, ov::Shape{1, 5, 1, 35}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, + ::testing::Combine( + ::testing::ValuesIn(inputShapesPair), + ::testing::Values(-1), + ::testing::Values(3), // Subgraph + Sin * 2 + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000000..76dbb58f5b4644 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 128, 3, 16}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(2), // Subgraph + Sin + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // 
namespace ov
\ No newline at end of file
diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp
new file mode 100644
index 00000000000000..ca3f77e43197eb
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        ov::Shape,                   // Input 0 Shape
+        int,                         // Axis
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> SoftmaxParams;
+
+typedef std::tuple<
+        std::pair, // Input Shapes (first = input 0, second = input 1)
+        int,                         // Axis
+        size_t,                      // Expected num nodes
+        size_t,                      // Expected num subgraphs
+        std::string                  // Target Device
+> AddSoftmaxParams;
+
+class Softmax : public testing::WithParamInterface,  // Parameterized fixture; SetUp builds the model from SinhSoftmaxFunction
+                virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo obj);  // Formats the parameter tuple into the test-case name
+
+protected:
+    void SetUp() override;
+};
+
+class AddSoftmax : public testing::WithParamInterface,  // Parameterized fixture; SetUp builds the model from SinhAddSoftmaxFunction
+                   virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo obj);  // Formats the parameter tuple into the test-case name
+
+protected:
+    void SetUp() override;
+};
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp
new file mode 100644
index 00000000000000..952b7528a00375
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/snippets_test_utils.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+typedef std::tuple<
+        std::vector,      // Input shapes
+        std::vector,      // Transpose Order
+        int64_t,          // Softmax Axis
+        size_t,           // Expected num nodes
+        size_t,           // Expected num subgraphs
+        std::string       // Target Device
+> TransposeSoftmaxParams;
+
+
+class TransposeSoftmax : public testing::WithParamInterface,  // Parameterized fixture; SetUp builds the model from TransposeSoftmaxFunction
+                         virtual public ov::test::SnippetsTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo obj);  // Formats the parameter tuple into the test-case name
+
+protected:
+    void SetUp() override;
+};
+
+class TransposeSoftmaxEltwise : public TransposeSoftmax {  // Same parameters; SetUp builds TransposeSoftmaxEltwiseFunction instead
+protected:
+    void SetUp() override;
+};
+
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file
diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp
new file mode 100644
index 00000000000000..be0fc59ef3c50a
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp
@@ -0,0 +1,91 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/softmax.hpp"
+#include "subgraph_softmax.hpp"
+#include "ngraph_functions/builders.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string Softmax::getTestCaseName(testing::TestParamInfo obj) {  // Builds "IS=..._Axis=..._#N=..._#S=..._targetDevice=..."
+    ov::Shape inputShapes;
+    int axis;
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param;
+
+    std::ostringstream result;
+    result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+    result << "Axis=" << axis << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void Softmax::SetUp() {  // Unpacks the parameters, registers the static input shape and builds the Sinh + Softmax model
+    ov::Shape inputShape;
+    int axis;
+    std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes({{{}, {inputShape, }}});
+
+    auto f = ov::test::snippets::SinhSoftmaxFunction({inputShape}, axis);
+    function = f.getOriginal();
+
+    if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) {  // only set the snippets mode when the configuration does not already contain it
+        configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE,
+                              InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK});
+    }
+}
+
+std::string AddSoftmax::getTestCaseName(testing::TestParamInfo obj) {  // Builds "IS[0]=..._IS[1]=..._Axis=..._#N=..._#S=..._targetDevice=..."
+    std::pair inputShapes;
+    int axis;
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param;
+
+    std::ostringstream result;
+    result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes.first) << "_";
+    result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes.second) << "_";
+    result << "Axis=" << axis << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void AddSoftmax::SetUp() {  // Unpacks the parameters, registers both static input shapes and builds the Sinh + Add + Softmax model
+    std::pair inputShapes;
+    int axis;
+    std::tie(inputShapes, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes({{{}, {inputShapes.first, }}, {{}, {inputShapes.second, }}});
+
+    auto f = ov::test::snippets::SinhAddSoftmaxFunction({inputShapes.first, inputShapes.second}, axis);
+    function = f.getOriginal();
+
+    if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) {  // only set the snippets mode when the configuration does not already contain it
+        configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE,
+                              InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK});
+    }
+}
+
+TEST_P(Softmax, CompareWithRefImpl) {
+    run();
+    validateNumSubgraphs();
+}
+
+TEST_P(AddSoftmax, CompareWithRefImpl) {
+    run();
+    validateNumSubgraphs();
+}
+
+} // namespace snippets
+} // 
namespace test
+} // namespace ov
diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp
new file mode 100644
index 00000000000000..ae6ca2e6790201
--- /dev/null
+++ b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/common_utils.hpp"
+#include "snippets/transpose_softmax.hpp"
+#include "subgraph_softmax.hpp"
+#include "ngraph_functions/builders.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::string TransposeSoftmax::getTestCaseName(testing::TestParamInfo obj) {  // Builds "IS[i]=..._TO=..._Axis=..._#N=..._#S=..._targetDevice=..."
+    std::vector inputShapes;
+    std::vector order;
+    int64_t axis;  // same type as the "Softmax Axis" tuple element and as in SetUp(), so std::tie does not narrow it
+    std::string targetDevice;
+    size_t num_nodes, num_subgraphs;
+    std::tie(inputShapes, order, axis, num_nodes, num_subgraphs, targetDevice) = obj.param;
+
+    std::ostringstream result;
+    for (size_t i = 0; i < inputShapes.size(); ++i)
+        result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
+    result << "TO=" << CommonTestUtils::vec2str(order) << "_";
+    result << "Axis=" << axis << "_";
+    result << "#N=" << num_nodes << "_";
+    result << "#S=" << num_subgraphs << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void TransposeSoftmax::SetUp() {  // Unpacks the parameters, registers the static input shapes and builds the TransposeSoftmaxFunction model
+    std::vector inputShapes;
+    std::vector order;
+    int64_t axis;
+    std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(inputShapes));
+
+    auto f = ov::test::snippets::TransposeSoftmaxFunction(inputDynamicShapes, order, axis);
+    function = f.getOriginal();
+
+    if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) {  // only set the snippets mode when the configuration does not already contain it
+        configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE,
+                              InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK});
+    }
+}
+
+void TransposeSoftmaxEltwise::SetUp() {  // Same as TransposeSoftmax::SetUp, but builds the TransposeSoftmaxEltwiseFunction model
+    std::vector inputShapes;
+    std::vector order;
+    int64_t axis;
+    std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam();
+    init_input_shapes(static_shapes_to_test_representation(inputShapes));
+
+    auto f = ov::test::snippets::TransposeSoftmaxEltwiseFunction(inputDynamicShapes, order, axis);
+    function = f.getOriginal();
+
+    if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) {  // only set the snippets mode when the configuration does not already contain it
+        configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE,
+                              InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK});
+    }
+}
+
+TEST_P(TransposeSoftmax, CompareWithRefImpl) {
+    run();
+    validateNumSubgraphs();
+}
+
+TEST_P(TransposeSoftmaxEltwise, CompareWithRefImpl) {
+    run();
+    validateNumSubgraphs();
+}
+
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
index 7218f192a8dbcf..57756d8c734bfe 100644
--- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp
@@ -9,6 +9,7 @@
 #include "subgraph_simple.hpp"
 #include "subgraph_converts.hpp"
 #include "subgraph_matmul.hpp"
+#include "subgraph_softmax.hpp"
 
 /* This file provides lowered representations (after the generate() was called) for some simple functions.
  * This is required to test snippets lowering and optimization passes. 
All the functions are expected to be direct
@@ -57,6 +58,21 @@ class Transpose0213MatMulSinhLoweredFunction : public Transpose0213MatMulSinhFun
     explicit Transpose0213MatMulSinhLoweredFunction(const std::vector& inputShapes, size_t position = 0) :
             Transpose0213MatMulSinhFunction(inputShapes, position, false) {
     }
+protected:
+    std::shared_ptr initLowered() const override;
+};
+
+class SoftmaxLoweredFunction : public SoftmaxFunction {  // SoftmaxFunction that additionally provides a lowered representation
+public:
+    explicit SoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : SoftmaxFunction(inputShapes, axis) {}
+
+protected:
+    std::shared_ptr initLowered() const override;
+};
+
+class AddSoftmaxLoweredFunction : public AddSoftmaxFunction {  // AddSoftmaxFunction that additionally provides a lowered representation
+public:
+    explicit AddSoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : AddSoftmaxFunction(inputShapes, axis) {}
 protected:
     std::shared_ptr initLowered() const override;
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp
new file mode 100644
index 00000000000000..6250a0d8eba128
--- /dev/null
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/ngraph.hpp"
+#include "./snippets_helpers.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+class SoftmaxFunction : public SnippetsFunctionBase {  // Model builder: single input -> Softmax(axis)
+public:
+    explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape is required
+    }
+protected:
+    std::shared_ptr initOriginal() const override;
+    int axis;  // softmax axis forwarded to the Softmax node
+};
+
+class SinhSoftmaxFunction : public SnippetsFunctionBase {  // Model builder: single input -> sinh -> Softmax(axis)
+public:
+    explicit SinhSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) {
+        NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes");  // exactly one input shape is required
+    }
+protected:
+    std::shared_ptr initOriginal() const override;
+    int axis;  // softmax axis forwarded to the Softmax node
+};
+
+class AddSoftmaxFunction : public SnippetsFunctionBase {  // Model builder: two inputs -> add -> Softmax(axis)
+public:
+    explicit AddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes are required
+    }
+protected:
+    std::shared_ptr initOriginal() const override;
+    int axis;  // softmax axis forwarded to the Softmax node
+};
+
+class SinhAddSoftmaxFunction : public SnippetsFunctionBase {  // Model builder: two inputs -> sinh on each -> add -> Softmax(axis)
+public:
+    explicit SinhAddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) {
+        NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes");  // exactly two input shapes are required
+    }
+protected:
+    std::shared_ptr initOriginal() const override;
+    int axis;  // softmax axis forwarded to the Softmax node
+};
+
+class TransposeSoftmaxFunction : public SnippetsFunctionBase {  // Model builder: input -> sinh -> transpose(order) -> Softmax(axis)
+public:
+    explicit TransposeSoftmaxFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis)
+            : SnippetsFunctionBase(inputShapes), m_order(order), m_axis(axis) {
+        NGRAPH_CHECK(input_shapes.size() > 0, "Got invalid number of input shapes");  // at least one input shape is required; only input_shapes[0] is used by initOriginal
+    }
+protected:
+    std::shared_ptr initOriginal() const override;
+
+    std::vector m_order;  // transpose order, turned into an i64 constant by initOriginal
+    int64_t m_axis;  // softmax axis forwarded to the Softmax node
+};
+
+class TransposeSoftmaxEltwiseFunction : public TransposeSoftmaxFunction {  // Variant that multiplies the transposed data by a constant before Softmax and applies one more op (named hswish) after it
+public:
+    explicit TransposeSoftmaxEltwiseFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis)
+            : TransposeSoftmaxFunction(inputShapes, order, axis) {}
+protected:
+    std::shared_ptr initOriginal() const override;
+};
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
index 86d07b912f9ea2..6de4c8b0d5f32d 100644
--- 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp
@@ -108,21 +108,296 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons
 
 std::shared_ptr Transpose0213MatMulSinhLoweredFunction::initLowered() const {
     ParameterVector data{std::make_shared(precision, input_shapes[0]),
                          std::make_shared(precision, input_shapes[1])};
     std::vector layout{0, 2, 1, 3};
     // Note: validity of transpose_position values is checked in Transpose0213MatMulSinhFunction constructor
     if (transpose_position <= 1) {
         auto &rt_info = data[transpose_position]->get_rt_info();
         rt_info["Layout"] = layout;
     }
     auto matmul = std::make_shared(data[0], data[1]);
     if (transpose_position == 2) {
         auto &rt_info = matmul->get_rt_info();
         rt_info["Layout"] = layout;
         matmul->validate_and_infer_types();
     }
     return std::make_shared(NodeVector{matmul}, data);
 }
+
+std::shared_ptr SoftmaxLoweredFunction::initLowered() const {  // Builds the expected lowered graph: three inner loops (max, sub/exp/sum, div) plus an optional outer loop
+    auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape()});
+
+    const auto data = input_params.front();
+
+    const auto master_shape = input_shapes[0].get_shape();
+    const auto shape_rank = master_shape.size();
+    const auto dimension = shape_rank - 1;
+    const auto work_amount = master_shape[dimension];  // number of elements along the last dimension
+    const auto increment = 10;
+    const auto inner_dim = shape_rank - 1;
+    const auto inner_master_wa = static_cast(master_shape[inner_dim]);
+    const int outer_dim = shape_rank > 1 ? shape_rank - 2 : -1;  // -1 means there is no outer dimension
+    const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1;
+    const bool is_scalar = work_amount == 1;  // the last dimension holds a single element
+
+    /* ====== ReduceMax decomposition ====== */
+
+    const auto vector_buffer_max = std::make_shared();
+    const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data});
+
+    // Fill is deliberately not inserted after load_max here: generate() inserts the Fill op only on the vector representation
+    const auto load_max = std::make_shared(loop_max_begin->output(0), increment);
+    const auto max = std::make_shared(load_max, vector_buffer_max);
+
+    std::vector apply_increments_max(3, false);
+    std::vector finalization_offsets_max(3, 0);
+    apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
+        work_amount, increment, apply_increments_max, finalization_offsets_max);
+
+    std::shared_ptr horizon_max = std::make_shared(max);
+    horizon_max->add_control_dependency(loop_max_end);
+    const auto prev_horizon_max = horizon_max;  // keeps the node created above, since horizon_max may be reassigned below
+    if (!is_scalar) {
+        horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0));
+    }
+
+    loop_max_begin->add_control_dependency(vector_buffer_max);
+    loop_max_end->add_control_dependency(max);
+
+    /* =========================================== */
+
+    /* === Sub + Exp + ReduceSum decomposition === */
+
+    const auto vector_buffer_sum = std::make_shared();
+    const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)});
+
+    const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment);
+    const auto sub = std::make_shared(load_sub, horizon_max);
+    // Fill is deliberately not inserted after Exp here: generate() inserts the Fill op only on the vector representation
+    const auto exp = std::make_shared(sub);
+    const auto sum = std::make_shared(exp, vector_buffer_sum);
+    const auto store_exp = std::make_shared(exp, increment);
+
+    std::vector apply_increments_sum(2, false);
+    std::vector finalization_offsets_sum(2, 0);
+    apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    const auto loop_sum_end = std::make_shared(
+        ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment,
+        apply_increments_sum, finalization_offsets_sum);
+    loop_sum_end->add_control_dependency(sum);
+
+    const auto horizon_sum = std::make_shared(sum);
+    horizon_sum->add_control_dependency(loop_sum_end);
+
+    const auto buffer_exp = std::make_shared(loop_sum_end->output(0));
+
+    loop_sum_begin->add_control_dependency(vector_buffer_sum);
+    loop_sum_begin->add_control_dependency(horizon_max);
+    loop_sum_begin->add_control_dependency(prev_horizon_max);
+
+    /* =========================================== */
+
+    /* ================== Div ==================== */
+
+    std::shared_ptr pow = std::make_shared(horizon_sum, -1);  // the division is expressed as multiplication by horizon_sum raised to -1
+    const auto prev_pow = pow;  // keeps the node created above, since pow may be reassigned below
+    if (!is_scalar) {
+        pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0));
+    }
+
+    const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp});
+
+    const auto load_div = std::make_shared(loop_div_begin->output(0), increment);
+    const auto mul = std::make_shared(load_div, pow);
+    const auto store_div = std::make_shared(mul, increment);
+
+    std::vector apply_increments_div(2, false);
+    std::vector finalization_offsets_div(2, 0);
+    apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    const auto loop_div_end = std::make_shared(
+        ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment,
+        apply_increments_div, finalization_offsets_div);
+
+    loop_div_begin->add_control_dependency(pow);
+    loop_div_begin->add_control_dependency(prev_pow);
+
+    /* =========================================== */
+
+    const auto result = std::make_shared(loop_div_end);
+    if (has_outer_loop) {
+        const auto need_increment = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1;
+        const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params);
+        const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment});
+        vector_buffer_max->add_control_dependency(outer_loop_begin);
+    }
+
+    return std::make_shared(ResultVector{result}, input_params);
+}
+std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const {  // Builds the expected lowered graph: an Add loop feeding a buffer, then the same three softmax loops, plus optional outer loops
+    auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape(), input_shapes[1].get_shape()});
+
+    auto master_pshape = input_shapes[0];
+    ov::PartialShape::broadcast_merge_into(master_pshape, input_shapes[1], op::AutoBroadcastType::NUMPY);
+    const auto master_shape = master_pshape.get_shape();  // NUMPY-broadcast merge of both input shapes
+    const auto shape_rank = master_shape.size();
+    const auto dimension = shape_rank - 1;
+    const auto work_amount = master_shape[dimension];  // number of elements along the last dimension
+    const auto increment = 10;
+    const auto inner_dim = shape_rank - 1;
+    const auto inner_master_wa = static_cast(master_shape[inner_dim]);
+    const int outer_dim = shape_rank > 1 ? shape_rank - 2 : -1;  // -1 means there is no outer dimension
+    const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1;
+    const bool is_scalar = work_amount == 1;  // the last dimension holds a single element
+
+    /* ================== Add ==================== */
+
+    const auto loop_add_begin = ngraph::snippets::op::insertLoopBegin(input_params);
+
+    std::shared_ptr load0 = std::make_shared(loop_add_begin->output(0), increment);
+    if (!is_scalar && input_shapes[0].get_shape().back() == 1) {  // input 0 has a unit last dimension: use a node created with the widened shape instead
+        auto new_shape = input_shapes[0].get_shape();
+        new_shape[new_shape.size() - 1] = static_cast(inner_master_wa);
+        load0 = std::make_shared(loop_add_begin->output(0), new_shape);
+    }
+    std::shared_ptr load1 = std::make_shared(loop_add_begin->output(1), increment);
+    if (!is_scalar && input_shapes[1].get_shape().back() == 1) {  // input 1 has a unit last dimension: use a node created with the widened shape instead
+        auto new_shape = input_shapes[1].get_shape();
+        new_shape[new_shape.size() - 1] = static_cast(inner_master_wa);
+        load1 = std::make_shared(loop_add_begin->output(1), new_shape);
+    }
+    const auto add = std::make_shared(load0, load1);
+    const auto store = std::make_shared(add, increment);
+
+    std::vector apply_increments_add(3, false);
+    std::vector finalization_offsets_add(3, 0);
+    apply_increments_add[0] = input_shapes[0].get_shape()[inner_dim] != 1 && inner_master_wa != 1;  // NOTE(review): inner_dim comes from the broadcast shape; this indexing assumes both inputs have the same rank - confirm
+    apply_increments_add[1] = input_shapes[1].get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    apply_increments_add[2] = master_shape[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_add[0] = input_shapes[0].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_add[1] = input_shapes[1].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_add[2] = master_shape[inner_dim] != 1 ? -inner_master_wa : 0;
+    auto loop_add_end = std::make_shared(ngraph::OutputVector{store, loop_add_begin->output(2)},
+        work_amount, increment, apply_increments_add, finalization_offsets_add);
+
+    /* =========================================== */
+
+    const auto buffer_add = std::make_shared(loop_add_end->output(0));
+
+    /* ====== ReduceMax decomposition ====== */
+
+    const auto vector_buffer_max = std::make_shared();
+    const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_add, buffer_add});
+
+    // Fill is deliberately not inserted after load_max here: generate() inserts the Fill op only on the vector representation
+    const auto load_max = std::make_shared(loop_max_begin->output(0), increment);
+    const auto max = std::make_shared(load_max, vector_buffer_max);
+
+    std::vector apply_increments_max(3, false);
+    std::vector finalization_offsets_max(3, 0);
+    apply_increments_max[0] = master_shape[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_max[0] = !has_outer_loop && master_shape[inner_dim] != 1 ? -inner_master_wa : 0;  // !has_outer_loop replaces master_shape[outer_dim] == 1, which indexed with -1 for rank-1 shapes
+    const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
+        work_amount, increment, apply_increments_max, finalization_offsets_max);
+
+    std::shared_ptr horizon_max = std::make_shared(max);
+    horizon_max->add_control_dependency(loop_max_end);
+    const auto prev_horizon_max = horizon_max;  // keeps the node created above, since horizon_max may be reassigned below
+    if (!is_scalar) {
+        horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0));
+    }
+
+    loop_max_begin->add_control_dependency(vector_buffer_max);
+    loop_max_end->add_control_dependency(max);
+
+    /* =========================================== */
+
+    /* === Sub + Exp + ReduceSum decomposition === */
+
+    const auto vector_buffer_sum = std::make_shared();
+    const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)});
+
+    const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment);
+    const auto sub = std::make_shared(load_sub, horizon_max);
+    // Fill is deliberately not inserted after exp here: generate() inserts the Fill op only on the vector representation
+    const auto exp = std::make_shared(sub);
+    const auto sum = std::make_shared(exp, vector_buffer_sum);
+    const auto store_exp = std::make_shared(exp, increment);
+
+    std::vector apply_increments_sum(2, false);
+    std::vector finalization_offsets_sum(2, 0);
+    apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    const auto loop_sum_end = std::make_shared(
+        ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment,
+        apply_increments_sum, finalization_offsets_sum);
+    loop_sum_end->add_control_dependency(sum);
+
+    const auto horizon_sum = std::make_shared(sum);
+    horizon_sum->add_control_dependency(loop_sum_end);
+
+    const auto buffer_exp = std::make_shared(loop_sum_end->output(0));
+
+    loop_sum_begin->add_control_dependency(vector_buffer_sum);
+    loop_sum_begin->add_control_dependency(horizon_max);
+    loop_sum_begin->add_control_dependency(prev_horizon_max);
+
+    /* =========================================== */
+
+    /* ================== Div ==================== */
+
+    std::shared_ptr pow = std::make_shared(horizon_sum, -1);  // the division is expressed as multiplication by horizon_sum raised to -1
+    const auto prev_pow = pow;  // keeps the node created above, since pow may be reassigned below
+    if (!is_scalar) {
+        pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0));
+    }
+
+    const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp});
+
+    const auto load_div = std::make_shared(loop_div_begin->output(0), increment);
+    const auto mul = std::make_shared(load_div, pow);
+    const auto store_div = std::make_shared(mul, increment);
+
+    std::vector apply_increments_div(2, false);
+    std::vector finalization_offsets_div(2, 0);
+    apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1;
+    finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0;
+    const auto loop_div_end = std::make_shared(
+        ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment,
+        apply_increments_div, finalization_offsets_div);
+    loop_div_begin->add_control_dependency(pow);
+    loop_div_begin->add_control_dependency(prev_pow);
+
+    /* =========================================== */
+
+    const auto result = std::make_shared(loop_div_end);
+    if (has_outer_loop) {  // two outer loops: one around the Add part (ending at buffer_add), one around the softmax part
+        const auto need_increment0 = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1;
+        const auto need_increment1 = input_shapes[1].get_shape()[outer_dim] != 1 && input_shapes[1].get_shape()[inner_dim] == 1;
+        const auto need_increment2 = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1;
+        const auto outer_loop_add_begin = ngraph::snippets::op::insertLoopBegin(input_params);
+        const auto outer_loop_add_end =
+                insertLoopEnd(NodeVector{buffer_add}, outer_loop_add_begin, 1, 1, std::vector{need_increment0, need_increment1, need_increment2});
+
+        const auto need_increment = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1;
+        const auto outer_loop_begin = ngraph::snippets::op::insertLoopBegin(NodeVector{buffer_add});
+        const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment});
+        vector_buffer_max->add_control_dependency(outer_loop_begin);
+    }
+
+    return std::make_shared(ResultVector{result}, input_params);
+}
 }  // namespace snippets
 }  // namespace test
 }  // namespace ov
diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp
new file mode 100644
index 00000000000000..aba0301993dd06
--- /dev/null
+++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "subgraph_softmax.hpp"
+#include "common_test_utils/data_utils.hpp"
+#include 
+#include "ngraph_functions/builders.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+std::shared_ptr SoftmaxFunction::initOriginal() const {  // input -> Softmax(axis)
+    auto data = std::make_shared(precision, input_shapes[0]);
+    auto softmax = std::make_shared(data, axis);
+    return std::make_shared(NodeVector{softmax}, ParameterVector{data});
+}
+
+std::shared_ptr SinhSoftmaxFunction::initOriginal() const {  // input -> sinh -> Softmax(axis)
+    auto data = std::make_shared(precision, input_shapes[0]);
+    auto sinh = std::make_shared(data);
+    auto softmax = std::make_shared(sinh, axis);
+    return std::make_shared(NodeVector{softmax}, ParameterVector{data});
+}
+
+std::shared_ptr AddSoftmaxFunction::initOriginal() const {  // (input 0, input 1) -> add -> Softmax(axis)
+    auto data0 = std::make_shared(precision, input_shapes[0]);
+    auto data1 = std::make_shared(precision, input_shapes[1]);
+    auto add = std::make_shared(data0, data1);
+    auto softmax = std::make_shared(add, axis);
+    return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr SinhAddSoftmaxFunction::initOriginal() const {  // sinh on each input -> add -> Softmax(axis)
+    auto data0 = std::make_shared(precision, input_shapes[0]);
+    auto data1 = std::make_shared(precision, input_shapes[1]);
+    auto sinh0 = std::make_shared(data0);
+    auto sinh1 = std::make_shared(data1);
+    auto add = std::make_shared(sinh0, sinh1);
+    auto softmax = std::make_shared(add, axis);
+    return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1});
+}
+
+std::shared_ptr TransposeSoftmaxFunction::initOriginal() const {  // input -> sinh -> transpose(m_order) -> Softmax(m_axis)
+    const auto transpose0Param = std::make_shared(precision, input_shapes[0]);
+    const auto sinh0 = std::make_shared(transpose0Param);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, m_order);
+    const auto transpose2 = std::make_shared(sinh0, transpose0Const);
+    const auto softMax = std::make_shared(transpose2, m_axis);
+    return std::make_shared(ov::NodeVector{softMax}, ov::ParameterVector {transpose0Param}, "softmax_transpose");
+}
+
+std::shared_ptr TransposeSoftmaxEltwiseFunction::initOriginal() const {  // input -> sinh -> transpose -> mul by random constant -> Softmax(m_axis) -> hswish
+    const auto transpose0Param = std::make_shared(precision, input_shapes[0]);
+    const auto sinh0 = std::make_shared(transpose0Param);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()},
+                                                               m_order);
+    const auto transpose2 = std::make_shared(sinh0, transpose0Const);
+    const auto mulConst = ngraph::builder::makeConstant(precision, transpose2->get_shape(),  // same element type as the parameter (was hard-coded f32)
+                                                        std::vector{}, true);
+    const auto mul = std::make_shared(transpose2, mulConst);
+    const auto softMax = std::make_shared(mul, m_axis);
+    const auto hswish = std::make_shared(softMax);
+    return std::make_shared(ov::NodeVector{hswish}, ov::ParameterVector{transpose0Param},
+                            "softmax_transpose");
+}
+
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file