diff --git a/src/bindings/python/tests_compatibility/test_onnx/test_backend.py b/src/bindings/python/tests_compatibility/test_onnx/test_backend.py index 87f53223c2d672..c1ad04a6fe44a5 100644 --- a/src/bindings/python/tests_compatibility/test_onnx/test_backend.py +++ b/src/bindings/python/tests_compatibility/test_onnx/test_backend.py @@ -565,9 +565,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None ), ( xfail_issue_99955, - "OnnxBackendNodeModelTest.test_group_normalization_epsilon_cpu", "OnnxBackendNodeModelTest.test_group_normalization_epsilon_expanded_cpu", - "OnnxBackendNodeModelTest.test_group_normalization_example_cpu", "OnnxBackendNodeModelTest.test_group_normalization_example_expanded_cpu", ), ( diff --git a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp index 551ef1907037ab..2c74867d8436d6 100644 --- a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp +++ b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp @@ -65,6 +65,16 @@ class PortDescriptor { VectorDims m_subtensor_shape{}; /// \brief The corresponding abstract/physical register size_t m_reg = 0; + + /// Notes: + /// - `m_tensor_shape` is a dense shape that is controlled by the expression outputs. + /// It means that the data written by the expression outputs should be read using this shape by the next expression inputs. + /// - `m_layout` is the order of data reading or writing by MemoryAccess ops. Note that only MemoryAccess ops may have `m_layout`. + /// For other expressions this order parameter is simply ignored for now. + /// If it's an input port of a MemoryAccess expression: + /// - `m_layout` shows how the data should be read (by which strides) using `m_tensor_shape`. + /// If it's an output port of a MemoryAccess expression: + /// - `m_layout` shows how the data should be written (by which strides) to get `m_tensor_shape`.
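+ ///
+ /// Editor's illustration (hypothetical values): for an input port of a MemoryAccess expression with
+ /// `m_tensor_shape` = {1, 16, 32, 64} and `m_layout` = {0, 2, 1, 3}, the data is read in the planar
+ /// shape {1, 32, 16, 64}, i.e. `planar_shape[i]` = `m_tensor_shape[m_layout[i]]`.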
}; class PortDescriptorUtils { diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index b17031e2a67d1c..b642bbd7a23ccb 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -10,7 +10,7 @@ #include #include "openvino/op/op.hpp" #include "openvino/core/rt_info.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" #include "snippets/shape_inference/shape_inference.hpp" #include "snippets/lowered/pass/pass.hpp" diff --git a/src/common/snippets/include/snippets/pass/common_optimizations.hpp b/src/common/snippets/include/snippets/pass/common_optimizations.hpp index 30ec301eb92c43..aba1ef9fb919df 100644 --- a/src/common/snippets/include/snippets/pass/common_optimizations.hpp +++ b/src/common/snippets/include/snippets/pass/common_optimizations.hpp @@ -5,7 +5,6 @@ #pragma once #include "openvino/pass/graph_rewrite.hpp" -#include "snippets/op/subgraph.hpp" #include "snippets/pass/tokenization.hpp" namespace ov { @@ -13,22 +12,15 @@ namespace snippets { namespace pass { class CommonOptimizations : public ov::pass::MatcherPass { + class SubgraphPass; + class SubgraphManager; + friend class ExtractConstants; + friend class ExtractUnsupportedTransposes; + friend class SplitDimensionM; + public: OPENVINO_RTTI("CommonOptimizations", "0"); CommonOptimizations(const SnippetsTokenization::Config& config = {}); - - // Returns True if parallelism work amount can be increased using SplitDimensionM optimization - static bool CanOptimizeParallelWA(const std::shared_ptr& node, size_t concurrency); - -private: - // Move up Constants which aren't scalars from body to Subgraph and replace them with Parameters inside body - void ExtractConstants(const std::shared_ptr& subgraph); - // Move up unsupported Transposes on Parameter outputs from body - void ExtractUnsupportedTransposes(const std::shared_ptr& subgraph); - // Insert Reshape nodes after and before Parameters and Results in Subgraphs with MatMul inside - // to split dimension M for MatMuls to increase work amount for parallelism - // Note: works only with 3D MHA patterns - void SplitDimensionM(const std::shared_ptr& subgraph, size_t concurrency); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/extract_constants.hpp b/src/common/snippets/include/snippets/pass/extract_constants.hpp new file mode 100644 index 00000000000000..17db3101c95138 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/extract_constants.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface ExtractConstants + * @brief Moves up Constants which aren't scalars outside of the Subgraph's body and replaces them with Parameters inside body + * @ingroup snippets + */ +class ExtractConstants: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("ExtractConstants", "0"); + ExtractConstants() = default; + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp b/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp new file mode 100644 index 00000000000000..48b1c2fed88ad1 --- /dev/null +++ 
b/src/common/snippets/include/snippets/pass/extract_unsupported_transposes.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface ExtractUnsupportedTransposes + * @brief Moves up unsupported Transposes on Parameter outputs from body + * @ingroup snippets + */ +class ExtractUnsupportedTransposes: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("ExtractUnsupportedTransposes", "0"); + ExtractUnsupportedTransposes() = default; + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp index 69266fc90ffc62..faf320a8d8c7e7 100--- --- a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp +++ b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp @@ -18,18 +18,17 @@ namespace pass { /** * @interface FuseTransposeBrgemm * @brief Fuses Transpose with Brgemm node, fusing on both Brgemm inputs and output is supported. Applicable to - * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o), - * but only 0213 Transpose is currently supported. + * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o). + * Any Transpose order whose last index is equal to [rank - 1] is supported; it means that the last dimension isn't moved. * @ingroup snippets */ class FuseTransposeBrgemm: public ov::pass::MatcherPass { public: OPENVINO_RTTI("FuseTransposeBrgemm", "0"); FuseTransposeBrgemm(); - static const std::set> supported_cases; -private: - static bool is_supported_transpose(const Output& transpose_port); + static bool is_supported_transpose(const Output& transpose_out); + static bool is_supported_transpose_order(const std::vector& order); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass_manager.hpp b/src/common/snippets/include/snippets/pass/manager.hpp similarity index 99% rename from src/common/snippets/include/snippets/pass_manager.hpp rename to src/common/snippets/include/snippets/pass/manager.hpp index 04d6ad57c9a6e0..d83a102acec313 100644 --- a/src/common/snippets/include/snippets/pass_manager.hpp +++ b/src/common/snippets/include/snippets/pass/manager.hpp @@ -3,15 +3,18 @@ // #pragma once + #include "openvino/pass/manager.hpp" #include "openvino/pass/pass.hpp" #include "openvino/pass/validate.hpp" + #include namespace ov { namespace snippets { namespace pass { + /** * @brief Manager is like ov::pass::Manager, but allows to insert new passes at arbitrary places in the pipeline * @ingroup snippets diff --git a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp index acd887b0f4a2a0..f5d637f1abb15a 100644 --- a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp @@ -43,6 +43,9 @@ class TokenizeMHASnippets: public ov::pass::MatcherPass { public: OPENVINO_RTTI("TokenizeMHASnippets", "0"); TokenizeMHASnippets(const SnippetsTokenization::Config& config = {}); + + static std::vector get_fusion_transpose_order(size_t rank); + static std::vector get_decomposed_transpose_order(size_t
rank); static bool is_matmul0_supported(const std::shared_ptr& matmul); }; diff --git a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp new file mode 100644 index 00000000000000..b57841a5e9cf0f --- /dev/null +++ b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "subgraph_pass.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface SplitDimensionM + * @brief Inserts Reshape nodes before inputs and after outputs of Subgraphs with MatMul inside + * to split dimension M for MatMuls. It allows increasing the work amount for parallelism + * @ingroup snippets + */ +class SplitDimensionM: public CommonOptimizations::SubgraphPass { +public: + OPENVINO_RTTI("SplitDimensionM", "0"); + SplitDimensionM(size_t concurrency) : m_concurrency(concurrency) {} + + bool run_on_subgraph(const std::shared_ptr& subgraph) override; + + // Returns True if the MatMul node is supported by this optimization + static bool is_supported_matmul(const std::shared_ptr& node); + // Returns True if parallelism work amount (concurrency) can be increased by this optimization + static bool can_be_optimized(const std::shared_ptr& node, size_t concurrency); + +private: + static std::shared_ptr get_matmul(const std::shared_ptr& subgraph); + static std::pair get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + static bool split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim); + + void reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim); + + size_t m_concurrency; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/subgraph_manager.hpp b/src/common/snippets/include/snippets/pass/subgraph_manager.hpp new file mode 100644 index 00000000000000..2aeea775987352 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/subgraph_manager.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "snippets/pass/common_optimizations.hpp" + +#include "snippets/pass/subgraph_pass.hpp" +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace snippets { +namespace pass { +/** + * @brief Manager class that manages transformation passes (SubgraphPasses) on Subgraph ops. + * See the SubgraphPass description for more details. + * It's a lightweight version of the ov::pass::Manager implementation whose purpose is to change only the Subgraph as a separate node in the model. + * @ingroup snippets + */ +class CommonOptimizations::SubgraphManager { +public: + SubgraphManager() = default; + + /// @brief Registers the given transformation class type in the execution list + /// @return shared_ptr to the transformation instance + template + std::shared_ptr register_pass(Args&&...
args) { + static_assert(std::is_base_of::value, "pass not derived from SubgraphPass base"); + auto pass = std::make_shared(std::forward(args)...); + m_pass_list.push_back(std::static_pointer_cast(pass)); + return pass; + } + + /// @brief Runs registered transformations on a given subgraph + /// @param subgraph Input subgraph + /// @return Returns true if the subgraph was changed by transformations, false otherwise. + bool run_passes(std::shared_ptr subgraph); + +protected: + std::vector> m_pass_list; +}; +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/subgraph_pass.hpp b/src/common/snippets/include/snippets/pass/subgraph_pass.hpp new file mode 100644 index 00000000000000..c8d65f0bc536bc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/subgraph_pass.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "snippets/pass/common_optimizations.hpp" + + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @brief Base class for Subgraph passes. + * A pass runs on a `Subgraph` op, which allows users to transform the + * `Subgraph` as a node and the `body` of this `Subgraph` as a model at the same time. + * These passes may change the `Subgraph` as a node, its `body`, and other ops around the `Subgraph` in the model. + * To avoid unsafe changes to other ops in the model, SubgraphPass is not derived from ov::Pass, + * so it is never registered on an ov::Model + * @ingroup snippets + */ +class CommonOptimizations::SubgraphPass { +public: + SubgraphPass() = default; + virtual ~SubgraphPass() = default; + + virtual bool run_on_subgraph(const std::shared_ptr& subgraph) = 0; + + void set_name(const std::string& name) { m_name = name; } + std::string get_name() const { return m_name; } + + using type_info_t = DiscreteTypeInfo; + virtual const type_info_t& get_type_info() const = 0; + +private: + std::string m_name; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp b/src/common/snippets/include/snippets/pass/tokenization.hpp index 9b070fb13c3445..a222bd72ef4f54 100644 --- a/src/common/snippets/include/snippets/pass/tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -51,6 +51,11 @@ class EnumerateNodes : public ov::pass::ModelPass { * 2. MHA tokenization * 3. Common tokenization * 4. Some common transformations for Subgraphs. For example, FakeQuantize decomposition + * Naming policy: + * - During tokenization, the new Subgraph op takes the name of the last tokenized op. + * This is needed to preserve the output names of the model when the tokenized op was right before a model Result. + * - If some transformation (for example, SplitDimensionM) inserts a new op after a Subgraph, + * the new op should take this Subgraph's name to preserve the output name. The Subgraph name is updated with the suffix "_original".
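+ *   For example (editor's illustration, the name is hypothetical): when SplitDimensionM inserts a Reshape after a Subgraph named "MHA",
+ *   the Reshape takes the friendly name "MHA" (preserving the model output name) and the Subgraph is renamed to "MHA_original".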
* @ingroup snippets */ class SnippetsTokenization : public ov::pass::ModelPass { @@ -61,9 +66,9 @@ class SnippetsTokenization : public ov::pass::ModelPass { * @ingroup snippets */ struct Config { - Config(size_t concurrency = 1, bool split_m_dimension = true, bool enable_transpose_on_output = true) + Config(size_t concurrency = 1, bool split_m_dimension = true, bool enable_transpose_on_output = true, std::set mha_transpose_ranks = {3, 4}) : concurrency(concurrency), split_m_dimension(split_m_dimension), - mha_token_enable_transpose_on_output(enable_transpose_on_output) {} + mha_token_enable_transpose_on_output(enable_transpose_on_output), mha_supported_transpose_ranks(std::move(mha_transpose_ranks)) {} size_t concurrency = 1; // True if "SplitDimensionM" optimization is enabled. Otherwise, it's disabled. @@ -72,6 +77,10 @@ class SnippetsTokenization : public ov::pass::ModelPass { // Otherwise, it may be fused into Subgraph if possible // TODO [111813]: Remove please when the ticket 111813 is implemented bool mha_token_enable_transpose_on_output = true; + // Set of supported Transpose shape ranks for tokenization in the MHATokenization pass. + // Note that in general Snippets support Transposes of any rank. + // But at the moment Transpose is used only in the MHA pattern, where 3D and 4D tensors are supported. + std::set mha_supported_transpose_ranks = { 3, 4 }; }; OPENVINO_RTTI("SnippetsTokenization", "0"); diff --git a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp index 013a538172ac7e..e9bd1506b93c60 100644 --- a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp +++ b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp @@ -20,7 +20,9 @@ class TransposeDecomposition: public ov::pass::MatcherPass { public: OPENVINO_RTTI("TransposeDecomposition", "0"); TransposeDecomposition(); - static const std::set> supported_cases; + + static bool is_supported_transpose(const Output& transpose_out); + static bool is_supported_transpose_order(const std::vector& order); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index d10930125e0ed0..c77eecd8bb15b0 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -25,12 +25,6 @@ inline auto is_scalar_constant(const std::shared_ptr& source_output_no return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; } -ov::PartialShape get_planar_pshape(const Input& out); -ov::PartialShape get_planar_pshape(const Output& out); -ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& layout); -VectorDims pshape_to_vdims(const PartialShape&); -ov::PartialShape vdims_to_pshape(const VectorDims&); - inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { return allocation_rank < 0 ? allocation_rank + static_cast(shape_rank) + 1 : allocation_rank; } @@ -55,11 +49,87 @@ constexpr inline bool implication(bool cause, bool cond) { return !cause || !!cond; } -VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& layout); -VectorDims get_planar_vdims(const snippets::lowered::PortDescriptorPtr& port_desc); +template +inline T div_up(const T a, const U b) { + return static_cast((a + b - 1) / b); +} + +/* ----- Shape `getters` ----- */ +/** + * @brief Returns a dense shape after applying the order.
+ * It means that the shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param shape preordered (original) partial shape + * @param order order + * @return reordered partial shape: `planar_shape[i]` = `shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * planar_shape = [32, 16, 2, 64] + */ +ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& order); +/** + * @brief Returns original shape before applying the order. + * It means that the shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param shape planar (ordered) partial shape + * @param order order + * @return preordered partial shape `preordered_shape` (the shape before applying the order) such that `shape[i]` = `preordered_shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * preordered_shape = [2, 32, 16, 64] + */ +ov::PartialShape get_preordered_pshape(const ov::PartialShape& shape, const std::vector& order); +/** + * @brief Returns a dense shape of node input. + * It means that the node input shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param in input of node + * @return new reordered partial shape: `planar_shape[i]` = `shape[order[i]]` + */ +ov::PartialShape get_planar_pshape(const Input& in); +/** + * @brief Returns original shape of node output before applying the order. + * It means that the preordered output shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param out output of node + * @return preordered partial shape `preordered_shape` (the shape before applying the order) such that `planar_shape[i]` = `preordered_shape[order[i]]` + */ +ov::PartialShape get_preordered_pshape(const Output& out); +/** + * @brief Returns a dense shape after applying the order. + * It means that the shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param shape preordered (original) shape + * @param order order + * @return reordered shape: `planar_shape[i]` = `shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * planar_shape = [32, 16, 2, 64] + */ +VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& order); +/** + * @brief Returns original shape before applying the order. + * It means that the preordered shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param shape planar (ordered) shape + * @param order order + * @return preordered shape `preordered_shape` (the shape before applying the order) such that `shape[i]` = `preordered_shape[order[i]]` + * Example, shape = [16, 2, 32, 64], order = [2, 0, 1, 3] + * preordered_shape = [2, 32, 16, 64] + */ +VectorDims get_preordered_vdims(const VectorDims& shape, const std::vector& order); +/** + * @brief Returns a dense shape of expression input port. + * It means that the input shape dimensions will be reordered in accordance with order indices to produce planar shape + * @param expr_port input expression port + * @return new reordered shape: `planar_shape[i]` = `shape[order[i]]` + */ VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port); +/** + * @brief Returns original shape before applying the order of expression output port.
+ * It means that the preordered output shape dimensions have been already reordered in accordance with order indices to produce planar shape + * @param expr_port output expression port + * @return preordered shape `preordered_shape` (the shape before applying the order) such that `planar_shape[i]` = `preordered_shape[order[i]]` + */ +VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port); + bool is_dynamic_vdims(const VectorDims& shape); +VectorDims pshape_to_vdims(const PartialShape&); +ov::PartialShape vdims_to_pshape(const VectorDims&); +/* --------------------------- */ + } // namespace utils } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index adf3894f71b8b7..4e1f730db6c428 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -365,10 +365,10 @@ VectorDims LinearIR::get_master_shape() const { } // Note: Snippets would benefit from a more generic master_shape calculation approach. // It will be implemented in the scope of ROI propagation activity (ticket 120505) - const auto& result_parent = out_exprs[0]->get_input_port_connector(0)->get_source().get_expr(); + const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 && - ov::is_type(result_parent->get_node())) { - master_shape = utils::get_planar_vdims(out_exprs[0]->get_input_port_descriptor(0)); + ov::is_type(source.get_expr()->get_node())) { + master_shape = utils::get_preordered_vdims(source); } else { for (const auto& oe : out_exprs) { const auto& port_desc = oe->get_input_port_descriptor(0); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 2bef20bb54e9d5..da8da2c2376f1f 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -181,9 +181,8 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, std::vector loop_subtensor; std::vector loop_tensor(loop_depth, 1); for (const auto& exit_point : loop_exit_points) { - const auto& desc = exit_point.get_descriptor_ptr(); - const auto shape = utils::get_planar_vdims(desc); - auto subtensor = desc->get_subtensor(); + const auto shape = utils::get_preordered_vdims(exit_point); + auto subtensor = exit_point.get_descriptor_ptr()->get_subtensor(); if (subtensor.empty()) { subtensor.resize(loop_depth, 1); subtensor[subtensor.size() - 1] = vector_size; diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 47a77df23401e2..8128ea0253d2a7 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -16,7 +16,7 @@ namespace pass { using LoopPort = LinearIR::LoopManager::LoopPort; namespace { -int64_t get_dim_stride(size_t dim, const std::vector& layout, const std::vector& shape) { +int64_t get_input_stride(size_t dim, const std::vector& layout, const VectorDims& shape) { int64_t stride = 1; for (int i = static_cast(layout.size()) - 1; i >= 0; i--) { if (layout[i] == dim) { @@ -26,6 +26,13 @@ } return stride; } +int64_t get_output_stride(size_t dim, const VectorDims& shape) { + int64_t stride = 1; + for (size_t i = dim + 1; i < shape.size(); ++i) { + stride *= static_cast(shape[i]); + } + return stride; +} } // namespace
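+// Editor's sketch (hypothetical values): for an already written (planar) output shape {2, 3, 4},
+// get_output_stride(0, shape) == 3 * 4 == 12 and get_output_stride(1, shape) == 4,
+// i.e. the usual row-major strides of the output data.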
InitLoops::InitLoops() : Pass() {} @@ -42,7 +49,8 @@ void InitLoops::init_ptr_increments(std::vector& loop_inputs, std::vec const auto& dim = *(layout.rbegin() + dim_idx); // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout if (!(shape[dim] == 1 && work_amount != 1)) { - loop_input.ptr_increment = get_dim_stride(dim, source.get_descriptor_ptr()->get_layout(), shape); + // The input layout shows in which order and with which strides the data should be read + loop_input.ptr_increment = get_input_stride(dim, source.get_descriptor_ptr()->get_layout(), shape); } } } @@ -54,15 +62,12 @@ const auto loop_ids = port->get_expr()->get_loop_ids(); const auto& layout = port->get_descriptor_ptr()->get_layout(); const auto& shape = port->get_descriptor_ptr()->get_shape(); - const auto& dim = *(layout.rbegin() + dim_idx); - // Ticket: 113106 - // WA: the current logic doesn't support the case with transposed output shape for brgemm layer - but for all existing cases planar layout can be used - std::vector planar(layout.size()); - std::iota(planar.begin(), planar.end(), 0); + const auto original_dim = layout.size() - 1 - dim_idx; + const auto& dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), original_dim)); // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout if (!(shape[dim] == 1 && work_amount != 1)) { - loop_output.ptr_increment = get_dim_stride(dim, planar, shape); + // The output layout shows in which order and with which strides the data has already been written + loop_output.ptr_increment = get_output_stride(dim, shape); } } } diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index da5ffc11c3169d..aefaca42f4094e 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -37,7 +37,7 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, const std::vector& parent_loop_ids, const ExpressionPort& expr_port, const int allocation_rank) { - const auto& planar_shape = utils::get_planar_vdims(expr_port); + const auto planar_shape = utils::get_preordered_vdims(expr_port); const size_t rank = allocation_rank >= 0 ? std::min(static_cast(allocation_rank), planar_shape.size()) : planar_shape.size(); ov::Shape allocation_shape(rank); diff --git a/src/common/snippets/src/lowered/pass/optimize_domain.cpp b/src/common/snippets/src/lowered/pass/optimize_domain.cpp index f2d2fd43baf96c..09dadc77efe6e5 100644 --- a/src/common/snippets/src/lowered/pass/optimize_domain.cpp +++ b/src/common/snippets/src/lowered/pass/optimize_domain.cpp @@ -98,7 +98,7 @@ bool OptimizeDomain::run(snippets::lowered::LinearIR& linear_ir) { const ExpressionPtr& shape_producing_expr = blocked_input_shapes ?
first_consumer : io_expr; - const auto& shape = utils::get_planar_vdims(shape_producing_expr->get_output_port_descriptor(0)); + const auto& shape = utils::get_preordered_vdims(shape_producing_expr->get_output_port(0)); OPENVINO_ASSERT(std::none_of(shape.begin(), shape.end(), [](size_t d) {return d == snippets::IShapeInferSnippets::DYNAMIC_DIMENSION; }), "OptimizeDomain pass does not support dynamic shapes"); diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index 5cce5d85c13a82..6ea77e447c449b 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -114,7 +114,7 @@ ov::element::Type Brgemm::get_output_type() const { std::vector Brgemm::get_planar_input_shapes(const std::vector>& inputs) const { OPENVINO_ASSERT(inputs.size() == 2, "Brgemm::get_planar_input_shapes() expects 2 inputs"); - return {utils::get_planar_pshape(inputs[0]), utils::get_planar_pshape(inputs[1]) }; + return { utils::get_planar_pshape(inputs[0]), utils::get_planar_pshape(inputs[1]) }; } ov::PartialShape Brgemm::get_planar_output_shape(const ov::PartialShape& output_shape) const { diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 868ed4294e6dab..065372f7a76747 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -79,7 +79,6 @@ IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector @@ -329,7 +329,7 @@ VectorDims Subgraph::infer_master_shape() { const auto& res_input = res->input(0); OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result have dynamic shape in static pipeline"); // We need to account to the shape's layout stored in Output rt_info - const auto& planar_shape = utils::get_planar_pshape(res_input.get_source_output()); + const auto& planar_shape = utils::get_preordered_pshape(res_input.get_source_output()); output_dims.emplace_back(planar_shape.get_shape()); } } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 6ed1054adac40c..7ce3d658e56a58 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -79,8 +79,8 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); if (order) { const auto order_value = order->cast_vector(); - return (TransposeDecomposition::supported_cases.count(order_value) != 0) || - (is_brgemm_case && FuseTransposeBrgemm::supported_cases.count(order_value) != 0); + return (TransposeDecomposition::is_supported_transpose_order(order_value)) || + (is_brgemm_case && FuseTransposeBrgemm::is_supported_transpose_order(order_value)); } } return false; diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index 609496cd0265e5..1e10d2dc6dfe6e 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -11,6 +11,10 @@ #include "snippets/pass/fuse_transpose_brgemm.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/validate.hpp" +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/pass/extract_constants.hpp" +#include "snippets/pass/extract_unsupported_transposes.hpp" +#include "snippets/pass/subgraph_manager.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/itt.hpp" @@ -21,343 +25,9 @@ namespace ov { namespace 
snippets { namespace pass { -namespace { -size_t get_lcm(size_t a, size_t b) { - std::function get_gcd; - get_gcd = [&get_gcd](size_t a, size_t b) { - if (b == 0) - return a; - return get_gcd(b, a % b); - }; - return a / get_gcd(a, b) * b; -} - -bool is_supported_matmul_for_split_dim_m_optimization(const std::shared_ptr& node) { - const auto matmul = ov::as_type_ptr(node); - return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic() && node->get_shape().size() == 3; // It's needed only for 3D MHA patterns -} -} // namespace - -bool CommonOptimizations::CanOptimizeParallelWA(const std::shared_ptr& node, size_t concurrency) { - if (!is_supported_matmul_for_split_dim_m_optimization(node)) - return false; - const auto mm_shape = node->get_shape(); - const auto current_parallel_work_amount = - std::accumulate(mm_shape.rbegin() + 2, mm_shape.rend(), size_t(1), std::multiplies()); - const auto dim_M = *(mm_shape.rbegin() + 1); - return (current_parallel_work_amount < concurrency) && - (current_parallel_work_amount * dim_M >= concurrency); -} - -void CommonOptimizations::SplitDimensionM(const std::shared_ptr& subgraph, size_t concurrency) { - // To increase parallelism work in 3D cases for MHA pattern, - // we split 1st dimension (starting from 0th) into 2 new dimensions to get 4D Shapes where - // - 0th and 1st dimensions are used in parallel scheduling, - // - 2nd and 3rd dimensions are used in kernel - // Note: 3D Patterns don't contain Transpose inside so the reshaping is valid - - // It's needed only for MHA patterns. Need to add support for common patterns - if (!subgraph->has_domain_sensitive_ops()) - return; - - const auto& body = subgraph->body_ptr(); - const auto& parameters = body->get_parameters(); - // [107806]: If count of Parameters isn't equal to Subgraph inputs (it's possible case in general), - // we cannot garantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. 
- OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), - "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); - - // Need to find MatMul0 and check output shape - const auto& ops = body->get_ordered_ops(); - const auto mm_it = std::find_if(ops.begin(), ops.end(), - [](const std::shared_ptr& node){ return ov::is_type(node); }); - if (mm_it == ops.end()) - return; - - const auto matmul0 = *mm_it; - if (!is_supported_matmul_for_split_dim_m_optimization(matmul0)) - return; - - auto get_dim_M = [](const ov::Shape& shape) { - return *(shape.rbegin() + 1); - }; - - const auto mm_shape = matmul0->get_shape(); - const auto m_dim = get_dim_M(mm_shape); // M - const auto batch_dim = - std::accumulate(mm_shape.rbegin() + 2, mm_shape.rend(), size_t(1), std::multiplies()); // B (batch) - - // We skip optimization if the current batch is optimal for concurrency - const auto optimal_parallelism_work_amount = concurrency; - if (batch_dim % optimal_parallelism_work_amount == 0) - return; - - size_t batch_m_dim = 1; - size_t new_m_dim = m_dim; - - auto is_optimized = [&](size_t batch_m_dim) { - return batch_m_dim > 1; - }; - - // [ First Step ] - // Need to find optimized dimension splitting: [b1..bk, m, n] -> [b1..bk, batch_m_dim, new_m_dim, n] - // The work amount for parallelism should be divided by max thread count in ideal case - // that all threads have the same full work amount (avoid of thread downtime) - // If it's impossible, we select such values so that as many threads as possible have work (see [ Second Step ]) - // For example, there are 16 threads and shape [6, 512, 32] - // LCM(6, 16) = 48 <- ideal work amount for parallelism - // new_shape [6, 48 / 6, 512 / (48 / 6), 32 ] => [6, 8, 64, 32] - // Each thread has parallelism_work_amount = 6 * 8 / nthrs = 3 - const auto lcm = get_lcm(batch_dim, optimal_parallelism_work_amount); // LCM(b, nthrs) - const auto batch_dim_multiplier = lcm / batch_dim; // LCM(b, nthrs) / b - const auto needed_new_dim = m_dim / batch_dim_multiplier; // m / (LCM(b, nthrs) / b) - needed factors of dimension m - if (batch_dim_multiplier * needed_new_dim == m_dim && is_optimized(batch_dim_multiplier)) { - batch_m_dim = batch_dim_multiplier; - new_m_dim = needed_new_dim; - } else { - // [ Second Step ] - // If we couldn't optimally split on the previous step, try the second step. - // The algorithm finds the more optimal parallelism work amount [batch_dim * batch_m_dim], - // where batch_m_dim is divisor of dimension M. - // The optimal parallelism work amount means the case when as many threads as possible have work - // For example, there are 8 threads and shape [5, 384, 32] - // 768 = [2 x 192] = [3 x 128] = [4 x 96] = [6 x 64] - // - [5, 2, 192, 32] - WA = 10 = 8 + 2 (6 threads calculates once and 2 threads twice) - // - [5, 3, 128, 32] - WA = 15 = 8 + 7 (all threads have 2 kernel except one thread) <- the most optimal case - // - [5, 4, 96, 32] - WA = 20 = 8 x 2 + 4 - // - [5, 6, 64, 32] - WA = 30 = 8 x 3 + 6 - // The most optimal and possible case is [5, 3, 128, 32] - almost all threads executes kernel twice - // Heuristic value for a quick exit from the algorithm. 
- // The value shows the number of threads in percentages that perform the most equal work - const auto optimal_thread_num_percent = 0.8; - size_t optimal_remainder = 1; - auto get_remainder = [batch_dim, optimal_parallelism_work_amount](const size_t potential_batch_dim) { - return (batch_dim * potential_batch_dim) % optimal_parallelism_work_amount; - }; - - auto update_optimal_params = [&](size_t divisor_0, size_t divisor_1) { - const auto remainder = batch_dim * divisor_0 % optimal_parallelism_work_amount; - if (remainder > optimal_remainder || remainder == 0) { - optimal_remainder = remainder; - batch_m_dim = divisor_0; - new_m_dim = divisor_1; - } - }; - - // Firstly we have shape [batch, 1, m_dim, smth]. - // So at the beginning we have parallel_work_amount = batch x 1 - optimal_remainder = get_remainder(1); - const auto root = std::sqrt(m_dim) + 1; - for (size_t divisor_0 = 2; divisor_0 < root; ++divisor_0) { - const size_t divisor_1 = m_dim / divisor_0; - if (divisor_0 * divisor_1 != m_dim) - continue; - - update_optimal_params(divisor_0, divisor_1); - update_optimal_params(divisor_1, divisor_0); - if ((static_cast(optimal_remainder) / static_cast(optimal_parallelism_work_amount) > optimal_thread_num_percent) || - (optimal_remainder == 0)) { - break; - } - } - } - - OPENVINO_ASSERT(batch_m_dim * new_m_dim == m_dim, "Incorrect dimension M splitting!"); - // nothing to split - if (!is_optimized(batch_m_dim)) - return; - - /***** Reshape insertion *****/ - - // There are two Parameter variants: - // - Parameter on branches for Second input of MatMul - the shape should be only unsqueezed (add just 1) - // - Other Parameters (on First input of MatMuls and between) - the shape should be splitted on M dimension - - bool updated = false; - std::set> reshaped_params; - - auto insert_reshape = [&](const std::shared_ptr& param, const ov::Shape& new_shape) { - const auto index = std::distance(parameters.begin(), std::find(parameters.begin(), parameters.end(), param)); - const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{new_shape.size()}, new_shape); - const auto reshape = std::make_shared(subgraph->input_value(index), shape_const, false); - subgraph->input(index).replace_source_output(reshape); - param->set_partial_shape(new_shape); - reshaped_params.insert(param); - updated = true; - }; - - auto get_updated_shape = [&](const ov::Shape& shape, bool split_m_dim) { - const auto current_m_dim = get_dim_M(shape); - OPENVINO_ASSERT(!split_m_dim || current_m_dim == 1 || current_m_dim == m_dim, "Incorrect shape for splitting!"); - ov::Shape new_shape = shape; - if ((split_m_dim && current_m_dim == 1) || !split_m_dim) { - new_shape.insert((new_shape.rbegin() + 2).base(), 1); - } else { - new_shape.insert((new_shape.rbegin() + 2).base(), batch_m_dim); - *(new_shape.rbegin() + 1) = new_m_dim; - } - OPENVINO_ASSERT(ov::shape_size(new_shape) == ov::shape_size(shape), "Incorrect shape splitting!"); - return new_shape; - }; - - auto reshape_parameter = [&](const std::shared_ptr& node, bool split_m_dim = true) { - const auto param = ov::as_type_ptr(node); - if (!param || reshaped_params.count(param) > 0) - return; - insert_reshape(param, get_updated_shape(param->get_partial_shape().get_shape(), split_m_dim)); - }; - - auto update_matmul_second_branch = [&](const std::shared_ptr& node) { - auto parent = node->get_input_node_shared_ptr(1); - while (!ov::is_type(parent)) { - if (parent->get_input_size() > 1) { - for (const auto& input_source : parent->input_values()) { - 
reshape_parameter(input_source.get_node_shared_ptr(), false); - } - } - - // [107731]: It's covered my MHA tokenization - parent = parent->get_input_node_shared_ptr(0); - } - reshape_parameter(parent, false); - }; - - // Firstly, Unsqueeze parameters on second branches of MatMuls - for (const auto& op : ops) { - if (ov::is_type(op)) { - update_matmul_second_branch(op); - } - } - - // Secondly, Update All M dimensions for remaining parameters - for (const auto& param : parameters) { - if (reshaped_params.count(param) == 0) - reshape_parameter(param, true); - } - - // Return the previous shape on outputs - for (size_t i = 0; i < subgraph->get_output_size() && updated; ++i) { - const auto output_shape = subgraph->get_output_shape(i); - if (is_scalar(output_shape)) - continue; - - const auto& target_inputs = subgraph->get_output_target_inputs(i); - const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{output_shape.size()}, output_shape); - const auto reshape = std::make_shared(subgraph->output(i), shape_const, false); - // Save output name - const auto original_output = body->get_results()[i]->get_input_node_shared_ptr(0); - const auto original_name = original_output->get_friendly_name(); - reshape->set_friendly_name(original_name); - original_output->set_friendly_name(original_name + "_original"); - - for (const auto& input : target_inputs) { - input.replace_source_output(reshape); - // Result input tensor name was changed, the name has to be restored - if (ov::is_type(input.get_node())) { - input.get_tensor_ptr()->add_names(subgraph->output(i).get_tensor_ptr()->get_names()); - } - } - subgraph->output(i).get_tensor_ptr()->set_names({}); - updated = true; - } - subgraph->set_friendly_name(subgraph->get_friendly_name() + "_original"); - - // Need to update inner Shapes and Softmax Axis - if (updated) { - for (const auto &op : ops) { - if (const auto softmax_v8 = ov::as_type_ptr(op)) { - softmax_v8->set_axis(-1); - } else if (const auto softmax_v1 = ov::as_type_ptr(op)) { - softmax_v1->set_axis(softmax_v1->get_output_partial_shape(0).size()); // since new_shape.size() = old_shape.size() + 1 - } else if (const auto broadcast = ov::as_type_ptr(op)) { - // Broadcast is tokenized only between MatMuls -> Split M dimension - const auto shape_const = ov::as_type_ptr(broadcast->input_value(1).get_node_shared_ptr()); - OPENVINO_ASSERT(shape_const, "SplitDimensionM expects Broadcast with Constant output shape"); - const auto new_shape = get_updated_shape(shape_const->cast_vector(), true); - broadcast->set_argument(1, std::make_shared(shape_const->get_element_type(), ov::Shape{new_shape.size()}, new_shape)); - } - } - subgraph->validate_and_infer_types(); - } -} - -void CommonOptimizations::ExtractConstants(const std::shared_ptr& subgraph) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractConstants"); - auto body = subgraph->body_ptr(); - - ParameterVector new_parameters; - OutputVector new_external_inputs = subgraph->input_values(); - - for (auto& op : body->get_ops()) { - auto constant = ov::as_type_ptr(op); - if (!constant || ov::shape_size(constant->get_shape()) == 1ul) - continue; - - const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); - if (op::Subgraph::constant_input_should_be_inside_body(child)) - continue; - - auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); - parameter->set_friendly_name(constant->get_friendly_name()); - 
ov::copy_runtime_info(constant, parameter); - constant->output(0).replace(parameter->output(0)); - - new_external_inputs.push_back(constant); - new_parameters.push_back(parameter); - } - - if (new_parameters.size() != 0) { - body->add_parameters(new_parameters); - body->validate_nodes_and_infer_types(); - subgraph->set_arguments(new_external_inputs); - } -} - -void CommonOptimizations::ExtractUnsupportedTransposes(const std::shared_ptr& subgraph) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractUnsupportedTransposes"); - const auto& body = subgraph->body_ptr(); - const auto parameters = body->get_parameters(); - // [107806]: If count of Parameters isn't equal to Subgraph inputs, - // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. - OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), - "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); - - bool updated = false; - for (size_t i = 0; i < parameters.size(); ++i) { - const auto& parameter = parameters[i]; - const auto& consumers = parameter->get_output_target_inputs(0); - if (consumers.size() != 1) - continue; - - const auto transpose = ov::as_type_ptr(consumers.begin()->get_node()->shared_from_this()); - if (!transpose) - continue; - - const auto& order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); - if (!order) - continue; - - const auto order_value = order->cast_vector(); - const auto transpose_child = *(transpose->get_output_target_inputs(0).begin()); - const auto is_brgemm_case = ov::is_type(transpose_child.get_node()->shared_from_this()); - // If Transpose is supported (can be decomposed or fused into Brgemm), skip - if ((is_brgemm_case && FuseTransposeBrgemm::supported_cases.count(order_value) != 0) || - (TransposeDecomposition::supported_cases.count(order_value) != 0)) - continue; - - // If the transpose isn't supported - we have to extract it from Subgraph - transpose->set_argument(0, subgraph->input_value(i)); - subgraph->set_argument(i, transpose); - transpose_child.replace_source_output(parameter); - // Update shape - parameter->set_partial_shape(transpose->get_output_partial_shape(0)); - updated = true; - } - - if (updated) { - subgraph->validate_and_infer_types(); - } -} +#define REGISTER_SNIPPETS_PASS(manager, pass, enabled, ...) \ + if (enabled) \ + manager.register_pass(__VA_ARGS__); CommonOptimizations::CommonOptimizations(const SnippetsTokenization::Config& config) { MATCHER_SCOPE(CommonOptimizations); @@ -371,29 +41,24 @@ CommonOptimizations::CommonOptimizations(const SnippetsTokenization::Config& con const auto& body = subgraph->body_ptr(); const auto is_quantized = subgraph->is_quantized(); + const auto is_domain_sensitive = subgraph->has_domain_sensitive_ops(); // Firstly, we should transform all original Converts inside body to ConvertTruncation to save original behavior. // Then if Subgraph contains FakeQuantize we enable specific transformation for quantized subgraphs. 
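    // Editor's note: REGISTER_SNIPPETS_PASS(manager, pass, enabled, ...) defined above expands to
    // `if (enabled) manager.register_pass<pass>(__VA_ARGS__);` (the `pass` argument is the pass type),
    // so each pass below is registered only when its enabling condition holds.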
ov::pass::Manager manager(get_pass_config()); - manager.register_pass(); - manager.register_pass(); - if (is_quantized) { - manager.register_pass(); - } - manager.register_pass(); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::TransformConvertToConvertTruncation, true); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::ExplicitTransposeMatMulInputs, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::CommonFakeQuantizeDecomposition, is_quantized); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::SoftmaxReshapeElimination, is_domain_sensitive); manager.run_passes(body); + ov::snippets::pass::CommonOptimizations::SubgraphManager subgraph_manager; // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph // so we can enable ExtractConstants pass for quantized models - if (is_quantized) { - ExtractConstants(subgraph); - } - // Extract unsupported Transposes from body - if (subgraph->has_domain_sensitive_ops()) { - ExtractUnsupportedTransposes(subgraph); - if (config.split_m_dimension) - SplitDimensionM(subgraph, config.concurrency); - } + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::ExtractConstants, is_quantized); + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::ExtractUnsupportedTransposes, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(subgraph_manager, ov::snippets::pass::SplitDimensionM, is_domain_sensitive && config.split_m_dimension, config.concurrency); + subgraph_manager.run_passes(subgraph); // Validate the body after all common optimizations ov::snippets::pass::Validate(get_pass_config()).run_on_model(body); diff --git a/src/common/snippets/src/pass/extract_constants.cpp b/src/common/snippets/src/pass/extract_constants.cpp new file mode 100644 index 00000000000000..54a2a56cd27cf5 --- /dev/null +++ b/src/common/snippets/src/pass/extract_constants.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/extract_constants.hpp" + +#include "openvino/opsets/opset1.hpp" +#include "snippets/itt.hpp" + + +bool ov::snippets::pass::ExtractConstants::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractConstants"); + auto body = subgraph->body_ptr(); + + ParameterVector new_parameters; + OutputVector new_external_inputs = subgraph->input_values(); + + for (auto& op : body->get_ops()) { + auto constant = ov::as_type_ptr(op); + if (!constant || ov::shape_size(constant->get_shape()) == 1ul) + continue; + + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + if (ov::snippets::op::Subgraph::constant_input_should_be_inside_body(child)) + continue; + + auto parameter = std::make_shared(constant->get_element_type(), constant->get_shape()); + ov::replace_output_update_name(constant->output(0), parameter->output(0)); + + new_external_inputs.push_back(constant); + new_parameters.push_back(parameter); + } + + if (new_parameters.size() != 0) { + body->add_parameters(new_parameters); + body->validate_nodes_and_infer_types(); + subgraph->set_arguments(new_external_inputs); + return true; + } + + return false; +} diff --git a/src/common/snippets/src/pass/extract_unsupported_transposes.cpp b/src/common/snippets/src/pass/extract_unsupported_transposes.cpp new file mode 100644 index 00000000000000..4cc87b3810c1ae --- /dev/null +++ b/src/common/snippets/src/pass/extract_unsupported_transposes.cpp @@ -0,0 +1,57 @@ 
+// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/extract_unsupported_transposes.hpp" + +#include "openvino/opsets/opset1.hpp" +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/itt.hpp" + + +bool ov::snippets::pass::ExtractUnsupportedTransposes::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ExtractUnsupportedTransposes"); + const auto& body = subgraph->body_ptr(); + const auto parameters = body->get_parameters(); + // [107806]: If count of Parameters isn't equal to Subgraph inputs, + // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O. + OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), + "Failed to extract unsupported transposes: the count of Parameters isn't equal to Subgraph inputs"); + + bool updated = false; + for (size_t i = 0; i < parameters.size(); ++i) { + const auto& parameter = parameters[i]; + const auto& consumers = parameter->get_output_target_inputs(0); + if (consumers.size() != 1) + continue; + + const auto transpose = ov::as_type_ptr(consumers.begin()->get_node()->shared_from_this()); + if (!transpose) + continue; + + const auto& order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + OPENVINO_ASSERT(order, "ExtractUnsupportedTransposes expects Transposes with constant order"); + + const auto order_value = order->cast_vector(); + const auto transpose_child = *(transpose->get_output_target_inputs(0).begin()); + const auto is_brgemm_case = ov::is_type(transpose_child.get_node()->shared_from_this()); + // If Transpose is supported (can be decomposed or fused into Brgemm), skip + // [116568]: It should be covered by TransposeDecomposition::is_supported or FuseTransposeBrgemm::is_supported + if ((is_brgemm_case && TokenizeMHASnippets::get_fusion_transpose_order(order_value.size()) == order_value) || + (TokenizeMHASnippets::get_decomposed_transpose_order(order_value.size()) == order_value)) + continue; + + // If the transpose isn't supported - we have to extract it from Subgraph + transpose->set_argument(0, subgraph->input_value(i)); + subgraph->set_argument(i, transpose); + transpose_child.replace_source_output(parameter); + parameter->set_partial_shape(transpose->get_output_partial_shape(0)); + updated = true; + } + + if (updated) { + subgraph->validate_and_infer_types(); + } + + return updated; +} diff --git a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp index 24a4141916e189..4492c1f7466505 100644 --- a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp +++ b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp @@ -17,24 +17,19 @@ namespace ov { namespace snippets { namespace pass { -const std::set> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}}; - -bool FuseTransposeBrgemm::is_supported_transpose(const Output& transpose_port) { - const auto transpose_node = transpose_port.get_node_shared_ptr(); - // it's safe to do so because of the patterns we used. 
alternatively we can do it through pattern_values_map - const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); - // if Transpose in and out layout is not empty => something was already fused on this port - auto default_layout = std::vector(transpose_port.get_shape().size()); - std::iota(default_layout.begin(), default_layout.end(), 0);// NCHW layout by default - if (lowered::PortDescriptorUtils::get_port_descriptor_ptr(transpose_port)->get_layout() != default_layout || - lowered::PortDescriptorUtils::get_port_descriptor_ptr(transpose_node->input_value(0))->get_layout() != default_layout) +bool FuseTransposeBrgemm::is_supported_transpose(const Output& transpose_out) { + const auto transpose = ov::as_type_ptr(transpose_out.get_node_shared_ptr()); + if (!transpose) return false; - const auto& transpose_order = constant->cast_vector(); - // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way - // to calc them non-default way is to set Parameter rt_info field. This limitation can be removed if - // the rt_info is properly propagated to the corresponding parameter - return is_type(transpose_node->get_input_node_shared_ptr(0)) && - supported_cases.count(transpose_order) != 0; + const auto order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + if (!order) + return false; + return is_supported_transpose_order(order->cast_vector()); +} + +bool FuseTransposeBrgemm::is_supported_transpose_order(const std::vector& order) { + const auto size = order.size(); + return order.size() > 0 && order.back() == (static_cast(size) - 1); } FuseTransposeBrgemm::FuseTransposeBrgemm() { @@ -51,7 +46,7 @@ FuseTransposeBrgemm::FuseTransposeBrgemm() { // Pattern 2: Transpose on output of MatMul auto brgemm_out = ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), ov::pass::pattern::any_input()}); - auto transpose2 = ov::pass::pattern::wrap_type({brgemm_out, constant}); + auto transpose2 = ov::pass::pattern::wrap_type({brgemm_out, constant}, is_supported_transpose); auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, transpose2}); diff --git a/src/common/snippets/src/pass_manager.cpp b/src/common/snippets/src/pass/manager.cpp similarity index 97% rename from src/common/snippets/src/pass_manager.cpp rename to src/common/snippets/src/pass/manager.cpp index bc9237c1ec8ab1..af59a99e348e5e 100644 --- a/src/common/snippets/src/pass_manager.cpp +++ b/src/common/snippets/src/pass/manager.cpp @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" + namespace ov { namespace snippets { @@ -77,5 +78,5 @@ std::shared_ptr Manager::insert_pass_instance(const PassPosit } } // namespace pass -}// namespace snippets -}// namespace ov +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index e9f939e8d72d75..67957c286a9e66 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -18,11 +18,11 @@ namespace { -auto is_supported_tensor(const ov::descriptor::Tensor& t) -> bool { +bool is_supported_tensor(const ov::descriptor::Tensor& t) { return t.get_partial_shape().is_static() && ov::snippets::utils::one_of(t.get_shape().size(), 3lu, 4lu); } -auto is_supported_intermediate_op(const std::shared_ptr& node) -> bool { +bool is_supported_intermediate_op(const std::shared_ptr& 
node) { const auto is_intermediate_op = [](const std::shared_ptr& node) { return ov::is_type(node) || ov::is_type(node) || @@ -32,22 +32,23 @@ auto is_supported_intermediate_op(const std::shared_ptr& node) -> bool return is_intermediate_op(node) && ov::snippets::pass::TokenizeSnippets::AppropriateForSubgraph(node); } -auto is_valid_transpose(const std::shared_ptr& node, std::vector expected_order) -> bool { - auto valid_transpose_order = [expected_order](const std::shared_ptr& node) -> bool { +bool is_valid_transpose(const std::shared_ptr& node, const std::set& supported_ranks, std::vector expected_order) { + auto is_valid_transpose_order = [expected_order, supported_ranks](const std::shared_ptr& node) -> bool { const auto transpose_pattern = ov::as_type_ptr(node); if (!transpose_pattern) return false; - return transpose_pattern->cast_vector() == expected_order; + const auto existing_order = transpose_pattern->cast_vector(); + return existing_order == expected_order && supported_ranks.count(existing_order.size()) != 0; }; auto is_supported_transpose_tensor = [](const ov::descriptor::Tensor& t) { return is_supported_tensor(t) && ov::snippets::pass::TokenizeSnippets::get_supported_element_types().count(t.get_element_type()) != 0; }; - return node && node->get_output_target_inputs(0).size() == 1 && node->get_shape().size() == 4 && - valid_transpose_order(node->get_input_node_shared_ptr(1)) && is_supported_transpose_tensor(node->get_input_tensor(0)); + return node && node->get_output_target_inputs(0).size() == 1 && is_valid_transpose_order(node->get_input_node_shared_ptr(1)) && + is_supported_transpose_tensor(node->get_input_tensor(0)); } -auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) -> void { +void tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) { // We can tokenize a Broadcast op only when the output shape of its child doesn't depend on the Broadcast shape in any dimension except the last one. // Snippets remove the Broadcast op and insert BroadcastMove if the last dimensions before and after Broadcast differ. // Otherwise, the original shape may be lost.
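For illustration, a minimal self-contained sketch (not part of the patch; an integer order type is assumed) of the check that the reworked `is_valid_transpose` above performs: the existing order must match the expected permutation, and its rank must be in the explicitly supported set, so a matching order of an unsupported rank is now rejected.

#include <cstdint>
#include <set>
#include <vector>

// Sketch: mirrors the new order check in is_valid_transpose.
bool order_is_acceptable(const std::vector<int64_t>& existing_order,
                         const std::vector<int64_t>& expected_order,
                         const std::set<size_t>& supported_ranks) {
    return existing_order == expected_order && supported_ranks.count(existing_order.size()) != 0;
}

// e.g. with supported_ranks = {3, 4} (matching the 3D/4D tensors supported here):
//   order_is_acceptable({0, 2, 1, 3}, {0, 2, 1, 3}, {3, 4}) -> true
//   order_is_acceptable({1, 0},       {1, 0},       {3, 4}) -> false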
@@ -95,9 +96,7 @@ auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVect } } -auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, - std::shared_ptr& reshape, - ov::NodeVector& ordered_ops) -> bool { +bool tokenize_reshape_around_softmax(std::shared_ptr& interm_op, std::shared_ptr& reshape, ov::NodeVector& ordered_ops) { reshape = ov::as_type_ptr(interm_op); if (reshape) { const auto in_shape = reshape->get_input_shape(0); @@ -110,7 +109,7 @@ auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, return true; } -auto get_potential_body_params(const std::shared_ptr& op) -> size_t { +size_t get_potential_body_params(const std::shared_ptr& op) { size_t count = 0; for (size_t i = 1; i < op->get_input_size(); ++i) { const auto input = op->input_value(i); @@ -125,8 +124,8 @@ auto get_potential_body_params(const std::shared_ptr& op) -> size_t { return count; } -auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ov::NodeVector& ordered_ops, - size_t& hidden_virtual_ports_count, size_t& potential_body_params_count) -> bool { +bool update_intermediate_supported_ops(std::shared_ptr& interm_op, ov::NodeVector& ordered_ops, + size_t& hidden_virtual_ports_count, size_t& potential_body_params_count) { while (is_supported_intermediate_op(interm_op)) { // All supported intermediate ops have only one output port if (interm_op->get_output_target_inputs(0).size() != 1) @@ -176,8 +175,26 @@ auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ov: } return true; } + +std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) { + OPENVINO_ASSERT(rank > 2, "Incorrect order rank for Transpose tokenization"); + auto order = std::vector(rank); + std::iota(order.begin(), order.end(), 0); + const auto diff = static_cast(rank - default_order.size()); + for (size_t i = 0; i < default_order.size(); ++i) { + order[diff + i] = default_order[i] + diff; + } + return order; +} } // namespace +std::vector ov::snippets::pass::TokenizeMHASnippets::get_fusion_transpose_order(size_t rank) { + return get_rank_equivalent_order({1, 0, 2}, rank); +} +std::vector ov::snippets::pass::TokenizeMHASnippets::get_decomposed_transpose_order(size_t rank) { + return get_rank_equivalent_order({1, 2, 0}, rank); +} + bool ov::snippets::pass::TokenizeMHASnippets::is_matmul0_supported(const std::shared_ptr& matmul) { if (!matmul || matmul->get_output_target_inputs(0).size() != 1 || matmul->get_transpose_a() || !is_supported_tensor(matmul->get_input_tensor(0)) || !is_supported_tensor(matmul->get_input_tensor(1))) @@ -257,6 +274,8 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken ordered_ops.push_back(matmul0); + const auto pattern_rank = matmul0->get_output_partial_shape(0).size(); + auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); // Add supported operations which are between MatMul0 and Softmax to ordered_ops if (!update_intermediate_supported_ops(interm_op, ordered_ops, hidden_virtual_ports_count, potential_body_params_count)) @@ -368,12 +387,12 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } auto tokenize_transpose = [&](const std::shared_ptr& transpose, - bool is_input_transposed, std::vector order, + bool is_input_transposed, std::vector order, const ov::NodeVector::const_iterator& pos) { // If the Transpose has a valid order for the Transpose fusing (ExplicitTransposeMatMulInputs pass call), tokenize it. // Otherwise, skip the Transpose.
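// Illustration (not part of the patch): with pattern_rank == 4 the helpers above resolve to
// the previously hardcoded orders, get_fusion_transpose_order(4) == {0, 2, 1, 3} and
// get_decomposed_transpose_order(4) == {0, 2, 3, 1}; with pattern_rank == 3 they yield
// the base orders {1, 0, 2} and {1, 2, 0}.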
if (!is_input_transposed) { - if (is_valid_transpose(transpose, order)) { + if (is_valid_transpose(transpose, config.mha_supported_transpose_ranks, order)) { ordered_ops.insert(pos, transpose); } return; @@ -383,7 +402,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken if (rank < 2) return; std::swap(transposed_order[rank - 1], transposed_order[rank - 2]); - if (is_valid_transpose(transpose, transposed_order)) { + if (is_valid_transpose(transpose, config.mha_supported_transpose_ranks, transposed_order)) { ordered_ops.insert(pos, transpose); } }; @@ -391,9 +410,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken const auto transpose1 = ov::as_type_ptr(parent); const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose1, is_transposed_b_0, {0, 2, 3, 1}, ordered_ops.begin()); - tokenize_transpose(transpose0, matmul0->get_transpose_a(), {0, 2, 1, 3}, ordered_ops.begin()); - tokenize_transpose(transpose2, matmul1->get_transpose_b(), {0, 2, 1, 3}, ordered_ops.end()); + tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); ordered_ops.push_back(matmul1); bool are_ops_after_matmul1 = false; @@ -427,7 +446,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // Transpose3 if (!are_ops_after_matmul1) { auto transpose3 = config.mha_token_enable_transpose_on_output ? 
ov::as_type_ptr(child) : nullptr; - if (is_valid_transpose(transpose3, {0, 2, 1, 3}) && + if (is_valid_transpose(transpose3, config.mha_supported_transpose_ranks, get_fusion_transpose_order(pattern_rank)) && transpose3->get_input_element_type(0) == matmul1_out_type) { // To avoid Convert between MatMul1 and Transpose3 ordered_ops.push_back(transpose3); } diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp new file mode 100644 index 00000000000000..671a12bffa34d2 --- /dev/null +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -0,0 +1,275 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/split_dimension_m.hpp" + +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + +namespace { +size_t get_dim_M(const ov::Shape& shape) { + return *(shape.rbegin() + 1); +} +bool is_prime_number(size_t value) { + if (ov::snippets::utils::one_of(value, 2lu, 3lu)) return true; + if (value == 1 || value % 2 == 0 || value % 3 == 0) return false; + const auto root = std::sqrt(value) + 1; + for (size_t divisor = 5; divisor < root; divisor += 6) { + if ((value % divisor == 0) || (value % (divisor + 2) == 0)) + return false; + } + return true; +} +} // namespace + +bool ov::snippets::pass::SplitDimensionM::is_supported_matmul(const std::shared_ptr& node) { + const auto matmul = ov::as_type_ptr(node); + return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic(); +} + +std::pair ov::snippets::pass::SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, + size_t optimal_parallelism_work_amount) { + std::pair splited = { 1, m_dim }; + + const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; + if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { + splited.first = lower_bound; + splited.second = m_dim / lower_bound; + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; + } + + const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); + for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { + size_t divisor_1 = m_dim / divisor_0; + if (divisor_1 * divisor_0 == m_dim) { + splited.first = divisor_0; + splited.second = divisor_1; + break; + } + } + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; +} + +bool ov::snippets::pass::SplitDimensionM::can_be_optimized(const std::shared_ptr& node, size_t concurrency) { + if (!is_supported_matmul(node)) + return false; + size_t batch_m_dim, new_m_dim; + return split(node->get_shape(), concurrency, batch_m_dim, new_m_dim); +} + +std::shared_ptr ov::snippets::pass::SplitDimensionM::get_matmul(const std::shared_ptr& subgraph) { + const auto& body = subgraph->body_ptr(); + const auto& parameters = body->get_parameters(); + // [107806]: If count of Parameters isn't equal to Subgraph inputs (a possible case, in general), + // we cannot guarantee correct extraction since we don't have correct connections between body I/O and Subgraph I/O.
+ OPENVINO_ASSERT(parameters.size() == subgraph->input_values().size(), + "Failed to get MatMul0: the count of Parameters isn't equal to Subgraph inputs"); + + // Need to find MatMul0 and check output shape + const auto& ops = body->get_ordered_ops(); + const auto mm_it = std::find_if(ops.cbegin(), ops.cend(), + [](const std::shared_ptr& node){ return ov::is_type(node); }); + if (mm_it == ops.end()) + return nullptr; + + const auto matmul0 = *mm_it; + return is_supported_matmul(matmul0) ? ov::as_type_ptr(matmul0) : nullptr; +} + +bool ov::snippets::pass::SplitDimensionM::split(const ov::Shape& shape, size_t optimal_parallelism_work_amount, size_t& batch_m_dim, size_t& new_m_dim) { + const auto batch_dim = + std::accumulate(shape.rbegin() + 2, shape.rend(), size_t(1), std::multiplies()); // B (batch) + const auto m_dim = get_dim_M(shape); // M + if (is_prime_number(m_dim)) + return false; + + auto is_optimized = [&](size_t batch_dim) { + return batch_dim >= optimal_parallelism_work_amount; + }; + + // We skip optimization if the current batch is optimal for concurrency + if (is_optimized(batch_dim)) + return false; + + std::tie(batch_m_dim, new_m_dim) = get_splited_dimensions(batch_dim, m_dim, optimal_parallelism_work_amount); + return is_optimized(batch_dim * batch_m_dim); +} + +void ov::snippets::pass::SplitDimensionM::reshape_subgraph(const std::shared_ptr& subgraph, + const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim) { + const auto& body = subgraph->body_ptr(); + const auto& parameters = body->get_parameters(); + const auto& results = body->get_results(); + const auto ops = body->get_ordered_ops(); + const auto m_dim = get_dim_M(shape); + + // There are two Parameter variants: + // - Parameters on branches for the second input of a MatMul - the shape should only be unsqueezed (just insert a 1) + // - Other Parameters (on the first input of MatMuls and between them) - the shape should be split on the M dimension + + std::set> reshaped_params; + + auto insert_reshape = [&](const std::shared_ptr& param, const ov::Shape& new_shape) { + const auto index = std::distance(parameters.begin(), std::find(parameters.begin(), parameters.end(), param)); + const auto shape_const = std::make_shared(ov::element::i32, ov::Shape{new_shape.size()}, new_shape); + const auto reshape = std::make_shared(subgraph->input_value(index), shape_const, false); + subgraph->input(index).replace_source_output(reshape); + param->set_partial_shape(new_shape); + reshaped_params.insert(param); + }; + + auto get_updated_shape = [&](const ov::Shape& shape, size_t m_index, bool split_m_dim) { + const auto current_m_dim = shape[m_index]; + OPENVINO_ASSERT(!split_m_dim || current_m_dim == 1 || current_m_dim == m_dim, "Incorrect shape for splitting!"); + ov::Shape new_shape = shape; + if ((split_m_dim && current_m_dim == 1) || !split_m_dim) { + new_shape.insert(new_shape.begin() + m_index, 1); + } else { + new_shape[m_index] = new_m_dim; + new_shape.insert(new_shape.begin() + m_index, batch_m_dim); + } + OPENVINO_ASSERT(ov::shape_size(new_shape) == ov::shape_size(shape), "Incorrect shape splitting!"); + return new_shape; + }; + + auto get_updated_order = [](const std::vector& order, int m_index) { + std::vector new_order(order.size() + 1, 0); + size_t shift_idx = 0; + for (size_t i = 0; i < order.size(); ++i) { + if (order[i] < m_index) { + new_order[i + shift_idx] = order[i]; + } else if (order[i] == m_index) { + new_order[i + shift_idx++] = order[i]; + new_order[i + shift_idx] = order[i] + 1; + } else { + new_order[i + 
shift_idx] = order[i] + 1; + } + } + return new_order; + }; + + auto reshape_transpose = [&](const std::shared_ptr& transpose, bool is_input) -> size_t { + const auto order_constant = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + OPENVINO_ASSERT(order_constant != nullptr, "Transpose must have Constant order"); + const auto order = order_constant->cast_vector(); + const auto m_index = is_input ? order[order.size() - 2] : order.size() - 2; // Index of M dimension in the previous order + const auto new_order = get_updated_order(order, static_cast(m_index)); + transpose->set_argument(1, std::make_shared(order_constant->get_element_type(), ov::Shape{new_order.size()}, new_order)); + return m_index; + }; + + auto reshape_parameter = [&](const std::shared_ptr& node, bool split_m_dim = true) { + const auto param = ov::as_type_ptr(node); + if (!param || reshaped_params.count(param) > 0) + return; + + const auto shape = param->get_partial_shape().get_shape(); + const auto consumers = param->get_output_target_inputs(0); + const auto shared_consumer = consumers.begin()->get_node()->shared_from_this(); + auto m_index = shape.size() - 2; + if (ov::is_type(shared_consumer)) { + m_index = reshape_transpose(shared_consumer, true); + } + insert_reshape(param, get_updated_shape(shape, m_index, split_m_dim)); + }; + + auto update_matmul_second_branch = [&](const std::shared_ptr& node) { + auto parent = node->get_input_node_shared_ptr(1); + while (!ov::is_type(parent)) { + if (parent->get_input_size() > 1) { + for (const auto& input_source : parent->input_values()) { + reshape_parameter(input_source.get_node_shared_ptr(), false); + } + } + + // [107731]: It's covered by MHA tokenization + parent = parent->get_input_node_shared_ptr(0); + } + reshape_parameter(parent, false); + }; + + // First, unsqueeze Parameters on the second input branches of MatMuls + for (const auto& op : ops) { + if (const auto matmul = ov::as_type_ptr(op)) { + update_matmul_second_branch(matmul); + } else if (const auto softmax_v8 = ov::as_type_ptr(op)) { + softmax_v8->set_axis(-1); + } else if (const auto softmax_v1 = ov::as_type_ptr(op)) { + softmax_v1->set_axis(softmax_v1->get_output_partial_shape(0).size()); // since new_shape.size() = old_shape.size() + 1 + } else if (const auto broadcast = ov::as_type_ptr(op)) { + // Broadcast is tokenized only between MatMuls -> Split M dimension + const auto shape_const = ov::as_type_ptr(broadcast->input_value(1).get_node_shared_ptr()); + OPENVINO_ASSERT(shape_const, "SplitDimensionM expects Broadcast with Constant output shape"); + const auto new_shape = get_updated_shape(shape_const->cast_vector(), broadcast->get_output_shape(0).size() - 2, true); + broadcast->set_argument(1, std::make_shared(shape_const->get_element_type(), ov::Shape{new_shape.size()}, new_shape)); + } + } + + // Second, update the M dimensions of the remaining Parameters + for (const auto& param : parameters) { + if (reshaped_params.count(param) == 0) + reshape_parameter(param, true); + } + + // Update Transpose order on Result + for (const auto& res : results) { + const auto parent = res->get_input_node_shared_ptr(0); + if (ov::is_type(parent)) { + reshape_transpose(parent, false); + } + } + + // Restore the previous shape on outputs + for (size_t i = 0; i < subgraph->get_output_size(); ++i) { + const auto output_shape = subgraph->get_output_shape(i); + if (is_scalar(output_shape)) + continue; + + const auto& target_inputs = subgraph->get_output_target_inputs(i); + const auto shape_const = 
std::make_shared(ov::element::i32, ov::Shape{output_shape.size()}, output_shape); + const auto reshape = std::make_shared(subgraph->output(i), shape_const, false); + // Save output name + const auto original_output = body->get_results()[i]->get_input_node_shared_ptr(0); + const auto original_name = original_output->get_friendly_name(); + reshape->set_friendly_name(original_name); + original_output->set_friendly_name(original_name + "_original"); + + for (const auto& input : target_inputs) { + input.replace_source_output(reshape); + // Result input tensor name was changed, the name has to be restored + if (ov::is_type(input.get_node())) { + input.get_tensor_ptr()->add_names(subgraph->output(i).get_tensor_ptr()->get_names()); + } + } + subgraph->output(i).get_tensor_ptr()->set_names({}); + } + subgraph->set_friendly_name(subgraph->get_friendly_name() + "_original"); + // Need to update inner Shapes and Softmax Axis + subgraph->validate_and_infer_types(); +} + +bool ov::snippets::pass::SplitDimensionM::run_on_subgraph(const std::shared_ptr& subgraph) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SplitDimensionM"); + // To increase the amount of parallel work in the MHA pattern, + // we split the 1st dimension (counting from 0) into 2 new dimensions to get 4D shapes where + // - the 0th and 1st dimensions are used in parallel scheduling, + // - the 2nd and 3rd dimensions are used in the kernel + + // For now it's needed only for MHA patterns; support for common patterns still needs to be added + if (!subgraph->has_domain_sensitive_ops()) + return false; + + if (const auto matmul0 = get_matmul(subgraph)) { + const auto mm_shape = matmul0->get_shape(); + size_t batch_m_dim, new_m_dim; + if (!split(mm_shape, m_concurrency, batch_m_dim, new_m_dim)) + return false; + + reshape_subgraph(subgraph, mm_shape, batch_m_dim, new_m_dim); + return true; + } + return false; +} diff --git a/src/common/snippets/src/pass/subgraph_manager.cpp b/src/common/snippets/src/pass/subgraph_manager.cpp new file mode 100644 index 00000000000000..860a2b15c359fd --- /dev/null +++ b/src/common/snippets/src/pass/subgraph_manager.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/subgraph_manager.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +bool CommonOptimizations::SubgraphManager::run_passes(std::shared_ptr subgraph) { + bool updated = false; + for (const auto& pass : m_pass_list) { + updated = pass->run_on_subgraph(subgraph) || updated; + } + return updated; +} + +} // namespace pass +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index bb581105a7523a..fe7cf7a702b09f 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -14,7 +14,22 @@ namespace snippets { namespace pass { using namespace lowered; -const std::set> TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; +bool TransposeDecomposition::is_supported_transpose(const Output& transpose_out) { + const auto transpose = ov::as_type_ptr(transpose_out.get_node_shared_ptr()); + if (!transpose) + return false; + const auto order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); + if (!order) + return false; + return is_supported_transpose_order(order->cast_vector()); +} + +bool TransposeDecomposition::is_supported_transpose_order(const std::vector& order) { + const auto 
size = order.size(); + if (size > 0) + return order.back() != static_cast(size - 1); + return true; +} TransposeDecomposition::TransposeDecomposition() { MATCHER_SCOPE(TransposeDecomposition); @@ -37,7 +52,7 @@ TransposeDecomposition::TransposeDecomposition() { return false; auto order_value = order->cast_vector(); - if (supported_cases.count(order_value) == 0) + if (!is_supported_transpose_order(order_value)) return false; // number of elements that can be processed on every iteration. For 0,1,2,3 -> 0,2,3,1 we can guarantee only scalar access diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 242391b908dc03..2bd5423babb805 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -12,6 +12,37 @@ namespace ov { namespace snippets { namespace utils { +namespace { +template +void ordered_shape(const Shape& shape, const std::vector& layout, bool is_forward, Shape& reordered_shape) { + for (size_t i = 0; i < layout.size(); i++) { + OPENVINO_ASSERT(layout[i] < shape.size(), "layout index is greater than the shape size"); + const auto src_idx = is_forward ? layout[i] : i; + const auto dst_idx = is_forward ? i : layout[i]; + reordered_shape[dst_idx] = shape[src_idx]; + } +} + +// Note: +// - If `is_forward` is true, the result is `shape` reordered by `layout` (the planar form) +// - If `is_forward` is false, the result is the original shape to which the `layout` was applied (the preordered form) +ov::PartialShape get_pshape(const ov::PartialShape& shape, const std::vector& layout, bool is_forward) { + if (layout.empty()) + return shape; + ov::PartialShape reordered_shape(std::vector(layout.size())); + if (shape.rank().is_dynamic()) + OPENVINO_THROW("get_pshape can't be called for outputs with dynamic rank"); + const size_t rank = shape.rank().get_length(); + if (layout.size() > rank) + OPENVINO_THROW("Layout rank can't be larger than tensor rank"); + // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes + if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) + OPENVINO_THROW("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); + ordered_shape(shape, layout, is_forward, reordered_shape); + return reordered_shape; +} +} // namespace + auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { std::vector cl, ch, isc, ish, osc, osh; const bool status = ov::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh); @@ -70,23 +101,46 @@ auto get_non_scalar_constant_count_for_fq(const std::shared_ptr -ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout) { - if (layout.empty()) - return shape; - std::vector reordered_shape(layout.size()); - if (shape.rank().is_dynamic()) - OPENVINO_THROW("get_reordered_planar_shape can't be called for outputs with dynamic rank"); - const size_t rank = shape.rank().get_length(); - if (layout.size() > rank) - OPENVINO_THROW("Layout rank can't be larger than tensor rank"); - // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes - if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) - OPENVINO_THROW("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); - for (size_t i = 0; i < layout.size(); i++) - reordered_shape[i] = shape[layout[i]]; +ov::PartialShape get_planar_pshape(const ov::PartialShape& shape, const std::vector& order) { + return get_pshape(shape, order, true); +} 
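A small usage sketch of the two directions (illustrative values; `get_preordered_pshape` is the `is_forward = false` wrapper defined next):

// Sketch: layout {0, 2, 3, 1} as a permutation of a 4D shape.
//   get_planar_pshape({1, 64, 16, 8}, {0, 2, 3, 1})     -> {1, 16, 8, 64}  (shape as read through the layout)
//   get_preordered_pshape({1, 16, 8, 64}, {0, 2, 3, 1}) -> {1, 64, 16, 8}  (original shape the layout was applied to)
// i.e. for permutation layouts the backward direction inverts the forward one.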
+ov::PartialShape get_preordered_pshape(const ov::PartialShape& shape, const std::vector& order) { + return get_pshape(shape, order, false); +} + +ov::PartialShape get_planar_pshape(const Input& in) { + const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(in); + return get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); +} +ov::PartialShape get_preordered_pshape(const Output& out) { + const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(out); + return get_preordered_pshape(ov::Shape{port->get_shape()}, port->get_layout()); +} + +VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& order) { + VectorDims reordered_shape(order.size()); + ordered_shape(shape, order, true, reordered_shape); + return reordered_shape; +} +VectorDims get_preordered_vdims(const VectorDims& shape, const std::vector& order) { + VectorDims reordered_shape(order.size()); + ordered_shape(shape, order, false, reordered_shape); return reordered_shape; } +VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port) { + OPENVINO_ASSERT(expr_port.get_type() == snippets::lowered::ExpressionPort::Type::Input, "get_planar_vdims expects Expression Input port"); + return get_planar_vdims(expr_port.get_descriptor_ptr()->get_shape(), expr_port.get_descriptor_ptr()->get_layout()); +} +VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port) { + OPENVINO_ASSERT(expr_port.get_type() == snippets::lowered::ExpressionPort::Type::Output, "get_preordered_vdims expects Expression Output port"); + return get_preordered_vdims(expr_port.get_descriptor_ptr()->get_shape(), expr_port.get_descriptor_ptr()->get_layout()); +} + +bool is_dynamic_vdims(const VectorDims& shape) { + return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return v == IShapeInferSnippets::DYNAMIC_DIMENSION; }); +} + VectorDims pshape_to_vdims(const PartialShape& pshape) { VectorDims result; result.reserve(pshape.size()); @@ -106,37 +160,6 @@ ov::PartialShape vdims_to_pshape(const VectorDims& vdims) { return result; } -ov::PartialShape get_planar_pshape(const Input& in) { - const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(in); - return utils::get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); -} - -ov::PartialShape get_planar_pshape(const Output& out) { - const auto& port = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(out); - return utils::get_planar_pshape(ov::Shape{port->get_shape()}, port->get_layout()); -} - -VectorDims get_planar_vdims(const VectorDims& shape, const std::vector& layout) { - VectorDims reordered_shape(shape.size()); - for (size_t i = 0; i < layout.size(); i++) { - OPENVINO_ASSERT(layout[i] < shape.size(), "get_planar_vdims: layout index is greater than the shape size"); - reordered_shape[i] = shape[layout[i]]; - } - return reordered_shape; -} - -VectorDims get_planar_vdims(const snippets::lowered::PortDescriptorPtr& port_desc) { - return get_planar_vdims(port_desc->get_shape(), port_desc->get_layout()); -} - -VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port) { - return get_planar_vdims(expr_port.get_descriptor_ptr()); -} - -bool is_dynamic_vdims(const VectorDims& shape) { - return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return v == IShapeInferSnippets::DYNAMIC_DIMENSION; }); -} - } // namespace utils } // namespace snippets } // namespace ov diff --git 
a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index f2c872f725b7d6..379a8f16cec4f0 100644 --- a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -6,7 +6,7 @@ #include #include "snippets/op/subgraph.hpp" #include "snippets_helpers.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" #include "snippets/shape_inference/shape_inference.hpp" namespace ov { diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 4fa525ba5d1f0c..49087d4ffcf675 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -31,8 +31,7 @@ void TokenizeMHASnippetsTests::run() { disable_rt_info_check(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_4D) { const auto &f = MHAFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); model = f.getOriginal(); @@ -40,6 +39,14 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA) { run(); } +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_3D) { + const auto &f = MHAFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32})); + model = f.getOriginal(); + model_ref = f.getReference(); + run(); +} + TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_MatMul0_Transpose) { GTEST_SKIP(); const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, @@ -80,10 +87,54 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_fusion) { run(); } -TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM) { +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { + const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + false); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 24; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { + const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + true); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 16; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { + const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + false); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 60; + 
run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { + const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, + std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + true); + model = f.getOriginal(); + model_ref = f.getReference(); + config.concurrency = 60; + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 9, 1024, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.concurrency = 18; @@ -93,7 +144,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM_AlmostAllThreads) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 30, 32}, {5, 32, 30}, {5, 30, 32}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{5, 6, 5, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); + std::vector{{5, 10, 3, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); model = f.getOriginal(); model_ref = f.getReference(); config.concurrency = 32; diff --git a/src/core/include/openvino/op/equal.hpp b/src/core/include/openvino/op/equal.hpp index 8148f62d2ba44b..a66e00d4be96d9 100644 --- a/src/core/include/openvino/op/equal.hpp +++ b/src/core/include/openvino/op/equal.hpp @@ -41,12 +41,9 @@ class OPENVINO_API Equal : public util::BinaryElementwiseComparison { const Output& arg1, const AutoBroadcastSpec& auto_broadcast = AutoBroadcastSpec(AutoBroadcastType::NUMPY)); - bool visit_attributes(AttributeVisitor& visitor) override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const override; - OPENVINO_SUPPRESS_DEPRECATED_END + bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; bool evaluate_upper(TensorVector& outputs) const override; bool evaluate_lower(TensorVector& outputs) const override; bool has_evaluate() const override; diff --git a/src/core/reference/include/openvino/reference/equal.hpp b/src/core/reference/include/openvino/reference/equal.hpp index c81d47c23d18ff..5e75b110eb996c 100644 --- a/src/core/reference/include/openvino/reference/equal.hpp +++ b/src/core/reference/include/openvino/reference/equal.hpp @@ -4,44 +4,68 @@ #pragma once -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wfloat-equal" -#endif +#include +#include -#include - -#include "openvino/core/shape.hpp" -#include "openvino/op/util/attr_types.hpp" #include "openvino/reference/autobroadcast_binop.hpp" +#include "openvino/reference/utils/type_util.hpp" namespace ov { namespace reference { +namespace func { +template +bool equal(const T lhs, const T rhs) { + return lhs == rhs; +} +} // namespace func + template +void equal(const T* arg0, const T* arg1, char* out, size_t count) { + std::transform(arg0, std::next(arg0, count), arg1, out, std::equal_to()); +} + +/** + * @brief 
Reference implementation of binary elementwise Equal operator. + * + * Used for integral types, with a custom `equal` function (to reduce binary size). + * + * @param arg0 Pointer to input 0 data. + * @param arg1 Pointer to input 1 data. + * @param out Pointer to output data. + * @param arg0_shape Input 0 shape. + * @param arg1_shape Input 1 shape. + * @param broadcast_spec Broadcast specification mode. + */ +template ::value>::type* = nullptr> void equal(const T* arg0, const T* arg1, - char* out, - size_t count) // TODO: using char for bool, is this right? -{ - for (size_t i = 0; i < count; i++) { - out[i] = arg0[i] == arg1[i]; - } + U* out, + const Shape& arg0_shape, + const Shape& arg1_shape, + const op::AutoBroadcastSpec& broadcast_spec) { + autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, func::equal); } -template +/** + * @brief Reference implementation of binary elementwise Equal operator. + * + * Used for floating-point types (to avoid the compiler warning about comparing floating-point values with `==`). + * + * @param arg0 Pointer to input 0 data. + * @param arg1 Pointer to input 1 data. + * @param out Pointer to output data. + * @param arg0_shape Input 0 shape. + * @param arg1_shape Input 1 shape. + * @param broadcast_spec Broadcast specification mode. + */ +template ()>::type* = nullptr> void equal(const T* arg0, const T* arg1, U* out, const Shape& arg0_shape, const Shape& arg1_shape, const op::AutoBroadcastSpec& broadcast_spec) { - autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, [](T x, T y) -> U { - return static_cast(x == y); - }); + autobroadcast_binop(arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, std::equal_to()); } } // namespace reference } // namespace ov - -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif diff --git a/src/core/src/op/equal.cpp b/src/core/src/op/equal.cpp index e4adf5d0e4ce53..7f23b8970e204a 100644 --- a/src/core/src/op/equal.cpp +++ b/src/core/src/op/equal.cpp @@ -2,183 +2,160 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ngraph/op/equal.hpp" +#include "openvino/op/equal.hpp" #include "bound_evaluate.hpp" +#include "element_visitor.hpp" #include "itt.hpp" -#include "ngraph/op/constant.hpp" -#include "ngraph/op/less_eq.hpp" -#include "ngraph/runtime/host_tensor.hpp" -#include "openvino/op/ops.hpp" +#include "openvino/op/less_eq.hpp" +#include "openvino/op/logical_and.hpp" +#include "openvino/op/logical_or.hpp" #include "openvino/reference/equal.hpp" +#include "utils.hpp" -using namespace std; -using namespace ngraph; - -OPENVINO_SUPPRESS_DEPRECATED_START +namespace ov { +namespace op { namespace equal { namespace { -template -bool evaluate(const HostTensorPtr& arg0, - const HostTensorPtr& arg1, - const HostTensorPtr& out, - const op::AutoBroadcastSpec& broadcast_spec) { - ov::reference::equal(arg0->get_data_ptr(), - arg1->get_data_ptr(), - out->get_data_ptr(), - arg0->get_shape(), - arg1->get_shape(), - broadcast_spec); - return true; -} - -bool evaluate_equal(const HostTensorPtr& arg0, - const HostTensorPtr& arg1, - const HostTensorPtr& out, - const op::AutoBroadcastSpec& broadcast_spec) { - bool rc = true; - out->set_broadcast(broadcast_spec, arg0, arg1, element::boolean); - switch (arg0->get_element_type()) { - OPENVINO_TYPE_CASE(evaluate_equal, boolean, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i4, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i8, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i16, arg0, arg1, out, 
broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, i64, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u4, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u8, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, u64, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, bf16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f16, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f32, arg0, arg1, out, broadcast_spec); - OPENVINO_TYPE_CASE(evaluate_equal, f64, arg0, arg1, out, broadcast_spec); - default: - rc = false; - break; - } - return rc; -} -ov::Tensor equal_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto equal = op::v1::Equal(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{equal.get_output_element_type(0), equal.get_output_shape(0)}}; - equal.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor less_equal_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto less_eq = v1::LessEqual(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + less_eq.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor less_equal_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto equal = op::v1::LessEqual(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{equal.get_output_element_type(0), equal.get_output_shape(0)}}; - equal.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor and_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto logical_and = v1::LogicalAnd(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + logical_and.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor and_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto logical_and = - ov::op::v1::LogicalAnd(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{logical_and.get_output_element_type(0), logical_and.get_output_shape(0)}}; - logical_and.evaluate(outs, ov::TensorVector{lhs, rhs}); +Tensor or_tensor(const Tensor& lhs, const Tensor& rhs) { + const auto logical_or = v1::LogicalOr(); + auto outs = TensorVector{{element::boolean, Shape{}}}; + logical_or.evaluate(outs, {lhs, rhs}); return outs.front(); } -ov::Tensor or_tensor(const ov::Tensor& lhs, const ov::Tensor& rhs) { - auto logical_or = - ov::op::v1::LogicalOr(std::make_shared(lhs.get_element_type(), lhs.get_shape()), - std::make_shared(rhs.get_element_type(), rhs.get_shape()), - op::AutoBroadcastType::NUMPY); - auto outs = ov::TensorVector{{logical_or.get_output_element_type(0), logical_or.get_output_shape(0)}}; - logical_or.evaluate(outs, ov::TensorVector{lhs, rhs}); - return outs.front(); -} +void all_equal(const TensorVector& tensors, TensorVector& outputs) { + auto& output = outputs[0]; + auto eq_result = TensorVector{{output.get_element_type(), output.get_shape()}}; -void all_equal(const ov::TensorVector tensors, ov::Tensor& output_value) { - 
OPENVINO_ASSERT(tensors.size() >= 2, "Unexpected number of tensors in all_equal helper"); - auto& tensor = tensors[0]; - output_value = equal_tensor(tensor, tensors[1]); - for (size_t i = 2; i < tensors.size(); ++i) { - output_value = and_tensor(output_value, equal_tensor(tensor, tensors[i])); + auto t_iter = tensors.begin() + 2; + auto eq_inputs = TensorVector(tensors.begin(), t_iter); + + const auto eq = v1::Equal(); + eq.evaluate(outputs, eq_inputs); + for (; t_iter != tensors.end(); ++t_iter) { + eq_inputs[1] = *t_iter; + eq.evaluate(eq_result, eq_inputs); + output = and_tensor(output, eq_result[0]); } } -ov::Tensor within_interval(const ov::Tensor& lower, const ov::Tensor& upper, const ov::Tensor& subject_to_check) { - auto lower_check = less_equal_tensor(lower, subject_to_check); - auto upper_check = less_equal_tensor(subject_to_check, upper); +Tensor within_interval(const Tensor& lower, const Tensor& upper, const Tensor& subject_to_check) { + const auto lower_check = less_equal_tensor(lower, subject_to_check); + const auto upper_check = less_equal_tensor(subject_to_check, upper); return and_tensor(lower_check, upper_check); } - } // namespace + +struct Evaluate : public element::NoAction { + using element::NoAction::visit; + + template > + static result_type visit(const Tensor& arg0, + const Tensor& arg1, + Tensor& out, + const Shape& shape0, + const Shape& shape1, + const op::AutoBroadcastSpec& broadcast_spec) { + reference::equal(arg0.data(), + arg1.data(), + out.data>(), + shape0, + shape1, + broadcast_spec); + return true; + } +}; } // namespace equal //------------------------------- v1 ------------------------------------------- -op::v1::Equal::Equal(const Output& arg0, const Output& arg1, const AutoBroadcastSpec& auto_broadcast) +namespace v1 { +Equal::Equal(const Output& arg0, const Output& arg1, const AutoBroadcastSpec& auto_broadcast) : BinaryElementwiseComparison(arg0, arg1, auto_broadcast) { constructor_validate_and_infer_types(); } -shared_ptr op::v1::Equal::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Equal::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v1_Equal_clone_with_new_inputs); check_new_args_count(this, new_args); - return make_shared(new_args.at(0), new_args.at(1), this->get_autob()); + return std::make_shared(new_args.at(0), new_args.at(1), get_autob()); } -bool op::v1::Equal::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const { +bool Equal::evaluate(TensorVector& outputs, const TensorVector& inputs) const { OV_OP_SCOPE(v1_Equal_evaluate); - return equal::evaluate_equal(inputs[0], inputs[1], outputs[0], get_autob()); + + outputs[0].set_shape(ov::op::infer_broadcast_shape(this, inputs)); + using namespace ov::element; + return IfTypeOf::apply( + inputs[0].get_element_type(), + inputs[0], + inputs[1], + outputs[0], + inputs[0].get_shape(), + inputs[1].get_shape(), + get_autob()); } -bool op::v1::Equal::evaluate_lower(ov::TensorVector& output_values) const { +bool Equal::evaluate_lower(TensorVector& output_values) const { if (get_input_tensor(0).has_and_set_bound() && get_input_tensor(1).has_and_set_bound()) return default_upper_bound_evaluator(this, output_values); // ll == lu == rl == ru -> {true} // else -> {false} const auto &lhs = get_input_tensor(0), &rhs = get_input_tensor(1); - auto lhs_lower = lhs.get_lower_value(), lhs_upper = lhs.get_upper_value(); - auto rhs_lower = rhs.get_lower_value(), rhs_upper = rhs.get_upper_value(); - equal::all_equal({lhs_lower, lhs_upper, 
rhs_lower, rhs_upper}, output_values[0]); + const auto &lhs_lower = lhs.get_lower_value(), &lhs_upper = lhs.get_upper_value(); + const auto &rhs_lower = rhs.get_lower_value(), &rhs_upper = rhs.get_upper_value(); + equal::all_equal({lhs_lower, lhs_upper, rhs_lower, rhs_upper}, output_values); return true; } -bool op::v1::Equal::evaluate_upper(ov::TensorVector& output_values) const { +bool Equal::evaluate_upper(TensorVector& output_values) const { const auto &lhs = get_input_tensor(0), &rhs = get_input_tensor(1); - auto lhs_lower = lhs.get_lower_value(), lhs_upper = lhs.get_upper_value(); - auto rhs_lower = rhs.get_lower_value(), rhs_upper = rhs.get_upper_value(); + const auto &lhs_lower = lhs.get_lower_value(), &lhs_upper = lhs.get_upper_value(); + const auto &rhs_lower = rhs.get_lower_value(), &rhs_upper = rhs.get_upper_value(); // check for intersection: // ll <= rl <= lu or ll <= ru <= lu - auto rl_check = equal::within_interval(lhs_lower, lhs_upper, rhs_lower); - auto ru_check = equal::within_interval(lhs_lower, lhs_upper, rhs_upper); + const auto rl_check = equal::within_interval(lhs_lower, lhs_upper, rhs_lower); + const auto ru_check = equal::within_interval(lhs_lower, lhs_upper, rhs_upper); output_values[0] = equal::or_tensor(rl_check, ru_check); return true; } -bool op::v1::Equal::has_evaluate() const { +bool Equal::has_evaluate() const { OV_OP_SCOPE(v1_Equal_has_evaluate); switch (get_input_element_type(0)) { - case ngraph::element::boolean: - case ngraph::element::i8: - case ngraph::element::u8: - case ngraph::element::i32: - case ngraph::element::i64: - case ngraph::element::u32: - case ngraph::element::u64: - case ngraph::element::f16: - case ngraph::element::f32: + case element::boolean: + case element::bf16: + case element::f16: + case element::f32: + case element::f64: + case element::i8: + case element::i16: + case element::i32: + case element::i64: + case element::u8: + case element::u16: + case element::u32: + case element::u64: return true; default: - break; + return false; } - return false; -} - -bool op::v1::Equal::visit_attributes(AttributeVisitor& visitor) { - OV_OP_SCOPE(v1_Equal_visit_attributes); - BinaryElementwiseComparison::visit_attributes(visitor); - return true; } +} // namespace v1 +} // namespace op +} // namespace ov diff --git a/src/core/src/runtime/itensor.cpp b/src/core/src/runtime/itensor.cpp index 6d966566c65610..2b3a6d49b84947 100644 --- a/src/core/src/runtime/itensor.cpp +++ b/src/core/src/runtime/itensor.cpp @@ -25,9 +25,10 @@ size_t ITensor::get_byte_size() const { } bool ITensor::is_continuous() const { - if (get_element_type().bitwidth() < 8) + if ((get_element_type().bitwidth() < 8) || get_size() == 0) { // OpenVINO doesn't support strides for lp types return true; + } const auto& shape = get_shape(); const auto& type = get_element_type(); std::vector strides(shape.size()); diff --git a/src/core/tests/tensor.cpp b/src/core/tests/tensor.cpp index 3fb0c259fc0c0d..361e45e8a570ce 100644 --- a/src/core/tests/tensor.cpp +++ b/src/core/tests/tensor.cpp @@ -52,3 +52,13 @@ TEST(tensor, wrap_tensor_with_unspecified_type_from_host_tensor) { // !tensor means that the tensor is not initialized EXPECT_EQ(!tensor, true); } + +TEST(tensor, create_tensor_with_zero_dims_check_stride) { + ov::Shape shape = {0, 0, 0, 0}; + auto tensor = ov::Tensor(element::f32, shape); + EXPECT_EQ(!!tensor, true); + auto stride = tensor.get_strides(); + EXPECT_EQ(stride.size(), shape.size()); + EXPECT_EQ(stride.back(), 0); + EXPECT_EQ(tensor.is_continuous(), true); +} diff --git 
a/src/core/tests/type_prop/broadcast.cpp b/src/core/tests/type_prop/broadcast.cpp index 023f8fa9505f0a..04f93d08b798e6 100644 --- a/src/core/tests/type_prop/broadcast.cpp +++ b/src/core/tests/type_prop/broadcast.cpp @@ -9,7 +9,6 @@ #include "common_test_utils/test_assertions.hpp" #include "common_test_utils/type_prop.hpp" #include "openvino/core/dimension_tracker.hpp" -#include "openvino/core/validation_util.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/equal.hpp" @@ -18,6 +17,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/attr_types.hpp" +#include "validation_util.hpp" using namespace std; using namespace testing; @@ -1303,24 +1303,22 @@ TEST(type_prop, broadcast_v3_bidirectional_tricky_partial_value_case_and_equal_p auto broadcast_a = make_shared(a, select, "BIDIRECTIONAL"); const auto out_shape = broadcast_a->get_output_partial_shape(0); - OPENVINO_SUPPRESS_DEPRECATED_START EXPECT_EQ(out_shape, expected_shape); { - auto constant = ov::get_constant_from_source(equal->output(0)); - EXPECT_TRUE(constant != nullptr); + auto constant = ov::util::get_constant_from_source(equal->output(0)); + ASSERT_TRUE(constant != nullptr); std::vector expected{false, false, false}, calculated = constant->get_vector(); EXPECT_EQ(calculated, expected); } { equal = make_shared(concat, ov::op::v0::Constant::create(ov::element::i64, {3}, {5, 1, 4})); - EXPECT_TRUE(ov::get_constant_from_source(equal->output(0)) == nullptr); + EXPECT_TRUE(ov::util::get_constant_from_source(equal->output(0)) == nullptr); } { equal = make_shared(concat, ov::op::v0::Constant::create(ov::element::i64, {3}, {11, 1, 4})); - auto constant = ov::get_constant_from_source(equal->output(0)); - EXPECT_TRUE(constant != nullptr); + auto constant = ov::util::get_constant_from_source(equal->output(0)); + ASSERT_TRUE(constant != nullptr); std::vector expected{false, true, true}, calculated = constant->get_vector(); EXPECT_EQ(calculated, expected); } - OPENVINO_SUPPRESS_DEPRECATED_END } diff --git a/src/frontends/onnx/frontend/src/op/group_normalization.cpp b/src/frontends/onnx/frontend/src/op/group_normalization.cpp new file mode 100644 index 00000000000000..d0f32a89ae4515 --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/group_normalization.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "op/group_normalization.hpp" + +#include "default_opset.hpp" + +OPENVINO_SUPPRESS_DEPRECATED_START +namespace ngraph { +namespace onnx_import { +namespace op { +namespace set_1 { +OutputVector group_normalization(const Node& node) { + const auto data = node.get_ng_inputs().at(0); // Shape [N, C, ...] 
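// Illustration (not part of the patch): with C = 6 channels and num_groups = 3, the steps
// below take scale/bias from shape [3] to [3, 1] (unsqueeze), then to [3, C/G] = [3, 2]
// (bidirectional broadcast), and finally to [C] = [6] (reshape) - the per-channel form
// that the OpenVINO GroupNormalization op consumed here expects.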
+ auto scale = node.get_ng_inputs().at(1); // Shape [num_groups] + auto bias = node.get_ng_inputs().at(2); // Shape [num_groups] + + auto eps = node.get_attribute_value("epsilon", 1e-05f); + auto num_groups = node.get_attribute_value("num_groups"); + + auto zero = default_opset::Constant::create(element::i64, Shape{1}, {0}); + auto one = default_opset::Constant::create(element::i64, Shape{1}, {1}); + auto c_dim = std::make_shared(std::make_shared(data), one, zero); + auto g_dim = default_opset::Constant::create(element::i64, Shape{1}, {num_groups}); + + auto c_g_div = std::make_shared(c_dim, g_dim); + + // Adjust scale and bias shape, [G] -> [G, C/G] -> [C] + scale = std::make_shared(scale, one); + auto broadcast_scale = + std::make_shared(scale, c_g_div, ov::op::BroadcastType::BIDIRECTIONAL); + auto c_scale = std::make_shared(broadcast_scale, c_dim, false); + + bias = std::make_shared(bias, one); + auto broadcast_bias = + std::make_shared(bias, c_g_div, ov::op::BroadcastType::BIDIRECTIONAL); + auto c_bias = std::make_shared(broadcast_bias, c_dim, false); + + return {std::make_shared(data, c_scale, c_bias, num_groups, eps)}; +} +} // namespace set_1 +} // namespace op +} // namespace onnx_import +} // namespace ngraph +OPENVINO_SUPPRESS_DEPRECATED_END diff --git a/src/frontends/onnx/frontend/src/op/group_normalization.hpp b/src/frontends/onnx/frontend/src/op/group_normalization.hpp new file mode 100644 index 00000000000000..fbd38d3667d4dd --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/group_normalization.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/deprecated.hpp" +OPENVINO_SUPPRESS_DEPRECATED_START + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph { +namespace onnx_import { +namespace op { +namespace set_1 { +OutputVector group_normalization(const Node& node); + +} // namespace set_1 +} // namespace op +} // namespace onnx_import +} // namespace ngraph +OPENVINO_SUPPRESS_DEPRECATED_END diff --git a/src/frontends/onnx/frontend/src/ops_bridge.cpp b/src/frontends/onnx/frontend/src/ops_bridge.cpp index c4d9a50c4ca637..31ca0b20836de5 100644 --- a/src/frontends/onnx/frontend/src/ops_bridge.cpp +++ b/src/frontends/onnx/frontend/src/ops_bridge.cpp @@ -75,6 +75,7 @@ #include "op/global_max_pool.hpp" #include "op/greater.hpp" #include "op/grid_sample.hpp" +#include "op/group_normalization.hpp" #include "op/gru.hpp" #include "op/hammingwindow.hpp" #include "op/hannwindow.hpp" @@ -395,6 +396,7 @@ OperatorsBridge::OperatorsBridge() { REGISTER_OPERATOR("GlobalMaxPool", 1, global_max_pool); REGISTER_OPERATOR("Greater", 1, greater); REGISTER_OPERATOR("GridSample", 1, grid_sample); + REGISTER_OPERATOR("GroupNormalization", 1, group_normalization); REGISTER_OPERATOR("GRU", 1, gru); REGISTER_OPERATOR("HannWindow", 1, hannwindow); REGISTER_OPERATOR("HammingWindow", 1, hammingwindow); diff --git a/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt b/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt new file mode 100644 index 00000000000000..978ab918a2c521 --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_2grp.prototxt @@ -0,0 +1,91 @@ +ir_version: 8 +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "num_groups" + i: 2 + type: INT + } + } + name: "test-model-group-normalization" + input { 
+ name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + } + } + } + } +} +opset_import { + domain: "" + version: 18 +} diff --git a/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt b/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt new file mode 100644 index 00000000000000..1711e41bd5d48f --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_3grp.prototxt @@ -0,0 +1,91 @@ +ir_version: 9 +opset_import { + domain: "" + version: 18 +} +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "num_groups" + type: INT + i: 3 + } + } + name: "test-model-group-normalization" + input { + name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} diff --git a/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt b/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt new file mode 100644 index 00000000000000..083b5d8ecf5d0e --- /dev/null +++ b/src/frontends/onnx/tests/models/group_normalization_custom_eps.prototxt @@ -0,0 +1,96 @@ +ir_version: 9 +opset_import { + domain: "" + version: 18 +} +producer_name: "onnx-frontend-test" +graph { + node { + input: "x" + input: "scale" + input: "bias" + output: "Y" + op_type: "GroupNormalization" + attribute { + name: "epsilon" + type: FLOAT + f: 0.0001 + } + attribute { + name: "num_groups" + type: INT + i: 3 + } + } + name: "test-model-group-normalization" + input { + name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "scale" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + input { + name: "bias" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 6 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} diff --git a/src/frontends/onnx/tests/onnx_import.in.cpp b/src/frontends/onnx/tests/onnx_import.in.cpp index 361805e45cf0d4..2ac1dc6d464567 100644 --- a/src/frontends/onnx/tests/onnx_import.in.cpp +++ b/src/frontends/onnx/tests/onnx_import.in.cpp @@ -6884,3 +6884,81 @@ OPENVINO_TEST(${BACKEND_NAME}, 
onnx_model_hannwindow_symmetric) { test_case.run_with_tolerance_as_fp(0.01f); } } + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_3grp_default_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_3grp.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input( + {-0.2261407f, -1.8793484f, -0.37692875f, 0.8860143f, 0.05993791f, -0.7634332f, 0.61080337f, 0.09776749f, + 0.5835062f, -0.32338685f, -0.23485906f, -0.04752525f, 2.4905143f, -0.11199934f, -0.20539412f, -2.4455426f, + -0.5437323f, 0.51794696f, -0.44127423f, 0.09666952f, -0.09539367f, -1.962784f, 0.25065672f, 1.5909688f, + 0.927671f, -0.46812922f, 0.2925484f, -1.1766007f, 0.7675745f, -0.94145614f, 1.1552521f, 1.6375796f, + 0.0198675f, -0.45938072f, 0.43037328f, 0.37999842f, -0.45021877f, -0.84925014f, 1.6790043f, -1.0172538f, + 0.0493111f, -0.53391f, -0.08101435f, 0.14738432f, -0.58910686f, 0.51673824f, -1.7001126f, -1.888597f}); + test_case.add_input({2.4556813f, 0.12371606f, 1.5681714f}); + test_case.add_input({0.79260737f, -0.74518913f, 1.370796f}); + + test_case.add_expected_output( + Shape{2, 6, 2, 2}, + {0.70938545f, -4.3442307f, 0.24844825f, 4.109082f, 1.5838864f, -0.93303996f, 3.267802f, 1.6995258f, + -0.6843487f, -0.7732928f, -0.76461035f, -0.7462375f, -0.49731785f, -0.75256085f, -0.7617206f, -0.9814244f, + 0.5922366f, 2.3495553f, 0.76182777f, 1.652246f, 1.3343381f, -1.7566144f, 1.9071295f, 4.1256485f, + 2.4563973f, -1.0979934f, 0.8390641f, -2.9021082f, 2.0487132f, -2.3033152f, 3.03593f, 4.2641716f, + -0.73710674f, -0.80988204f, -0.6747702f, -0.6824198f, -0.8084908f, -0.86908495f, -0.48516175f, -0.8945968f, + 2.4475086f, 1.3245938f, 2.1965842f, 2.6363354f, 1.2183195f, 3.3474774f, -0.92077446f, -1.2836761f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_3grp_custom_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_custom_eps.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input( + {1.8079232f, -0.2892469f, 2.0915377f, -1.8837914f, 0.25869793f, 0.80542284f, 2.9528935f, 0.16081251f, + 0.10507602f, -1.7271832f, -1.0217364f, -1.1528395f, -0.69146425f, -2.4292548f, -0.4232518f, 0.33357796f, + -1.4946569f, -0.08947915f, -0.7962127f, 1.3765403f, -0.1947846f, 0.30173305f, 0.08576944f, 0.8134404f, + 0.62960416f, -1.0745901f, -0.27037576f, -0.3607608f, 0.14347585f, 1.4590056f, -1.1309915f, 0.88850766f, + 0.5367185f, -0.7771955f, 0.81048864f, 0.45839247f, 1.0398412f, -0.21019235f, -1.037122f, -0.36852306f, + 2.7608335f, 0.3126114f, 0.336343f, 0.76919895f, 0.58595645f, 0.71894723f, -1.2922621f, -0.542859f}); + test_case.add_input({-0.05215209f, -0.5643389f, -0.6959881f}); + test_case.add_input({1.4327786f, 0.01641126f, -1.471873f}); + + test_case.add_expected_output( + Shape{2, 6, 2, 2}, + {1.3937842f, 1.4702199f, 1.3834473f, 1.5283363f, 1.4502488f, 1.4303224f, 1.3520534f, 1.4538165f, + -0.628196f, 0.5758153f, 0.11225323f, 0.19840352f, -0.10477467f, 1.0371594f, -0.281022f, -0.77834874f, + -0.22489226f, -1.3969909f, -0.8074844f, -2.6198394f, -1.3091526f, -1.7233121f, -1.5431708f, -2.1501417f, + 1.3968898f, 1.4998344f, 1.4512546f, 1.4567144f, 1.4262552f, 1.3467885f, 1.5032414f, 1.3812504f, + -0.36344206f, 0.6759755f, -0.58001745f, -0.30147952f, -0.7614548f, 
0.22742787f, 0.8815994f, 0.35268092f, + -2.9372354f, -1.3806448f, -1.3957335f, -1.6709452f, -1.5544388f, -1.6389949f, -0.36025894f, -0.83673286f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} + +OPENVINO_TEST(${BACKEND_NAME}, onnx_group_normalization_2grp_custom_eps) { + auto function = onnx_import::import_onnx_model(file_util::path_join(ov::test::utils::getExecutableDirectory(), + SERIALIZED_ZOO, + "onnx/group_normalization_2grp.onnx")); + + auto test_case = ov::test::TestCase(function, s_device); + test_case.add_input({-0.424049f, 1.7215315f, 1.429421f, 0.52730036f, 2.0628972f, -0.15856522f, + 2.274094f, -1.9989003f, -1.7827071f, -0.87104136f, -1.2995626f, 0.16800839f, + 0.5934625f, 1.553442f, -0.5482905f, 0.6079124f, 0.3598974f, -0.15221423f, + 1.1135519f, -1.2533926f, -1.019778f, -1.9142767f, -1.2984604f, 0.5587884f}); + test_case.add_input({-1.4678609f, -1.8223071f}); + test_case.add_input({1.1155374f, -0.6101201f}); + + test_case.add_expected_output( + Shape{1, 4, 2, 3}, + {1.694167f, -0.51719165f, -0.21612573f, 0.71365166f, -0.86902285f, 1.4205441f, -1.0866947f, 3.3172996f, + 3.0944781f, 2.154863f, 2.5965219f, 1.0839586f, -1.8562672f, -3.540983f, 0.14745194f, -1.8816261f, + -1.4463723f, -0.547642f, -2.768998f, 1.3848708f, 0.97488886f, 2.5446892f, 1.4639623f, -1.7954159f}); + + test_case.run_with_tolerance_as_fp(0.000001f); +} diff --git a/src/frontends/onnx/tests/tests_python/test_backend.py b/src/frontends/onnx/tests/tests_python/test_backend.py index 779444658d1e28..a027f703ba29ce 100644 --- a/src/frontends/onnx/tests/tests_python/test_backend.py +++ b/src/frontends/onnx/tests/tests_python/test_backend.py @@ -437,9 +437,7 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None ), ( xfail_issue_99955, - "OnnxBackendNodeModelTest.test_group_normalization_epsilon_cpu", "OnnxBackendNodeModelTest.test_group_normalization_epsilon_expanded_cpu", - "OnnxBackendNodeModelTest.test_group_normalization_example_cpu", "OnnxBackendNodeModelTest.test_group_normalization_example_expanded_cpu", ), ( diff --git a/src/inference/src/dev/make_tensor.cpp b/src/inference/src/dev/make_tensor.cpp index 1d23c62f86d957..2c0f33b352bcf6 100644 --- a/src/inference/src/dev/make_tensor.cpp +++ b/src/inference/src/dev/make_tensor.cpp @@ -77,7 +77,7 @@ class ViewTensor : public ITensor { auto& shape = get_shape(); if (m_strides.empty() && !shape.empty()) { m_strides.resize(shape.size()); - m_strides.back() = m_element_type.size(); + m_strides.back() = shape.back() == 0 ? 
0 : m_element_type.size(); std::transform(shape.crbegin(), shape.crend() - 1, m_strides.rbegin(),
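For intuition, the stride rule above can be modeled in isolation. The following sketch is illustrative only (the helper name `byte_strides` and the test values are not part of the patch); it mirrors the `ViewTensor` logic, assuming row-major dense packing, so that a zero-sized innermost dimension now yields an all-zero stride vector instead of a stale element size in the last slot. The same guard is applied to the template plugin's `VectorTensorImpl` further down in this patch.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // Row-major byte strides, computed back-to-front as in ViewTensor::get_strides().
    static std::vector<size_t> byte_strides(const std::vector<size_t>& shape, size_t element_size) {
        std::vector<size_t> strides;
        if (shape.empty())
            return strides;
        strides.resize(shape.size());
        // The fix: an empty innermost dimension propagates a zero stride.
        strides.back() = shape.back() == 0 ? 0 : element_size;
        // strides[i] = strides[i + 1] * shape[i + 1], walking from the last dimension.
        std::transform(shape.crbegin(), shape.crend() - 1, strides.rbegin(), strides.rbegin() + 1,
                       std::multiplies<size_t>());
        return strides;
    }

    int main() {
        assert((byte_strides({2, 3, 4}, 4) == std::vector<size_t>{48, 16, 4}));
        assert((byte_strides({2, 3, 0}, 4) == std::vector<size_t>{0, 0, 0}));  // empty tensor
        return 0;
    }
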
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index 072c3f7edcf60b..6e75dc8794ec6b 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -7,6 +7,7 @@ #include #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" #include "snippets/lowered/expression.hpp" #include "snippets/lowered/port_connector.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" @@ -229,7 +230,7 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter const size_t offset_rank = master_shape.size() - 1; std::vector> data_offsets(num_params, std::vector{}); - auto offset_calculation = [=](const std::vector& shape, const std::vector& layout, const size_t data_size) { + auto offset_calculation = [=](const std::vector& shape, const std::vector& layout, const size_t data_size, bool is_input) { // Strides represent distance between consecutive elements of corresponding dimension. // If a dim size == 1, then the next dim starts immediately and the stride is 0 // case 1: @@ -248,8 +249,11 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb // Note: this is an extra copy, but let's keep it for clarity if (!layout.empty()) { std::vector reordered_strides(strides.size()); - for (size_t i = 0; i < layout.size(); i++) - reordered_strides[i] = strides[layout[i]]; + for (size_t i = 0; i < layout.size(); i++) { + const auto& src_idx = is_input ? layout[i] : i; + const auto& dst_idx = is_input ? i : layout[i]; + reordered_strides[dst_idx] = strides[src_idx]; + } strides = std::move(reordered_strides); } // the last stride is ignored, since the entire last dim is processed by kernel @@ -261,7 +265,7 @@ void KernelEmitter::init_data_pointers(const Xbyak::Reg64& reg_indexes, const Xb return strides; }; for (size_t i = 0; i < num_params; i++) { - data_offsets[i] = offset_calculation(io_shapes[i], io_data_layouts[i], io_data_sizes[i]); + data_offsets[i] = offset_calculation(io_shapes[i], io_data_layouts[i], io_data_sizes[i], i < num_inputs); } // master_shape size must be valid in both static and dynamic cases std::function&, Reg64)> init_ptr_with_offset; @@ -718,6 +722,33 @@ size_t BrgemmEmitter::getBrgIdx(size_t kIdx, size_t nIdx) { return kIdx * BRGEMM_N_KERNEL_NUM + nIdx; } +size_t BrgemmEmitter::get_in_leading_dim(const VectorDims& shape, const std::vector& layout) { + // The input shape is in the original (pre-transpose) order, so the data must be read according to the layout + // Example: + // Original shape (shape) = [1, 49, 2, 23] + // Layout (transpose order) = [2, 0, 1, 3] + // Transposed shape = [2, 1, 49, 23] + // The leading dimension is the stride of shape[layout[2]] = shape[1], i.e. 2 x 23 + OPENVINO_ASSERT(layout.back() == layout.size() - 1 && layout.size() == shape.size(), + "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"); + const auto idx = layout[layout.size() - 2]; // `1` in example + return std::accumulate(shape.cbegin() + idx + 1, shape.end(), 1, std::multiplies()); +}
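The input/output asymmetry here is the same one introduced in offset_calculation above: input strides are gathered through the layout (reordered[i] = strides[layout[i]]), while output strides are scattered (reordered[layout[i]] = strides[i]). A standalone sketch, with hypothetical free functions `in_leading_dim`/`out_leading_dim` reproducing the arithmetic of get_in_leading_dim and of the get_out_leading_dim variant that follows, using the values from the comments:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <iterator>
    #include <numeric>
    #include <vector>

    // Input: shape is pre-transpose; the row dimension of the transposed view is
    // layout[rank - 2], and its stride is the product of the original dims to its right.
    static size_t in_leading_dim(const std::vector<size_t>& shape, const std::vector<size_t>& layout) {
        const size_t idx = layout[layout.size() - 2];
        return std::accumulate(shape.cbegin() + idx + 1, shape.cend(), size_t(1), std::multiplies<size_t>());
    }

    // Output: shape is already transposed; locate the value rank - 2 within the
    // layout and take the product of the transposed dims to the right of that position.
    static size_t out_leading_dim(const std::vector<size_t>& shape, const std::vector<size_t>& layout) {
        const size_t idx = layout.size() - 2;
        const auto dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), idx));
        return std::accumulate(shape.cbegin() + dim + 1, shape.cend(), size_t(1), std::multiplies<size_t>());
    }

    int main() {
        // Examples from the comments: layout {2, 0, 1, 3} in both cases.
        assert(in_leading_dim({1, 49, 2, 23}, {2, 0, 1, 3}) == 2 * 23);
        assert(out_leading_dim({49, 2, 7, 39}, {2, 0, 1, 3}) == 2 * 7 * 39);
        return 0;
    }
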
+size_t BrgemmEmitter::get_out_leading_dim(const VectorDims& shape, const std::vector& layout) { + // The output shape is already transposed, so the data must be written back in the original order using the layout + // Example: + // Transposed output shape (shape) = [49, 2, 7, 39] + // Layout (transpose order) = [2, 0, 1, 3] + // In a planar layout, the dimension just before the leading (innermost) one with index 3 would be the one with index 2. + // Since the layout is non-planar, we look up where index 2 ended up in the transposed order: + // it is the first element of the layout, so the leading dimension equals the stride of shape[0] + OPENVINO_ASSERT(layout.back() == layout.size() - 1 && layout.size() == shape.size(), + "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"); + const auto idx = layout.size() - 2; // 2 in the example + const auto dim = std::distance(layout.cbegin(), std::find(layout.cbegin(), layout.cend(), idx)); // 0 in the example: shape[0] = 49 + return std::accumulate(shape.cbegin() + dim + 1, shape.cend(), 1, std::multiplies()); // shape[1] x shape[2] x shape[3] = 2 x 7 x 39 +} + BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { m_brgCtxs.fill(brgemmCtx()); std::generate(m_brgKernels.begin(), m_brgKernels.end(), [](){ return nullptr; }); @@ -730,38 +761,33 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt std::vector leading_dimensions; std::vector> io_layouts; - auto init_scheduling_params = [&](const std::vector& layout, const ov::Shape& io_shape) { - if (layout.empty()) { - // empty value indicates a planar layout - leading_dimensions.push_back(io_shape.back()); - std::vector default_layout(io_shape.size()); - std::iota(default_layout.begin(), default_layout.end(), 0); - io_layouts.push_back(default_layout); - } else { - // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right - // This implies that "3" is the last layout value, otherwise this layout is not supported. - // counting from the end since shape could be prepended with ones - const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; - if (layout.back() != layout.size() - 1 || num_last_dims < 1) - IE_THROW() << "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"; - leading_dimensions.emplace_back( - std::accumulate(io_shape.end() - num_last_dims, io_shape.end(), 1, std::multiplies())); - io_layouts.push_back(layout); - } + auto get_layout = [](const std::vector& layout, const snippets::VectorDims& io_shape) { + if (!layout.empty()) return layout; + std::vector default_layout(io_shape.size()); + std::iota(default_layout.begin(), default_layout.end(), 0); + return default_layout; };
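As a side note, the empty-layout convention handled by this lambda can be sketched on its own (`layout_or_planar` is an illustrative name, not part of the patch):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // An empty layout denotes a planar (identity) order; materialize it as 0, 1, ..., rank - 1.
    static std::vector<size_t> layout_or_planar(std::vector<size_t> layout, size_t rank) {
        if (layout.empty()) {
            layout.resize(rank);
            std::iota(layout.begin(), layout.end(), 0);
        }
        return layout;
    }

Note that when the second input is repacked by BrgemmCopyB, the code below deliberately records an empty layout and a zero leading dimension for it; LDB is then fixed up with rnd_up(m_N, brgemm_copy->get_n_block_size()).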
- std::vector> brgemm_inputs = {brgemm_node->input(0), - brgemm_copy ? brgemm_copy->input(0) : brgemm_node->input(1)}; - for (const auto& input : brgemm_inputs) { - init_scheduling_params(snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input)->get_layout(), - input.get_shape()); + auto init_in_scheduling_params = [&](const snippets::lowered::PortDescriptorPtr& input) { + io_layouts.push_back(get_layout(input->get_layout(), input->get_shape())); + leading_dimensions.push_back(get_in_leading_dim(input->get_shape(), io_layouts.back())); + }; + auto init_out_scheduling_params = [&](const snippets::lowered::PortDescriptorPtr& output) { + io_layouts.push_back(get_layout(output->get_layout(), output->get_shape())); + leading_dimensions.push_back(get_out_leading_dim(output->get_shape(), io_layouts.back())); + }; + init_in_scheduling_params(expr->get_input_port_descriptor(0)); + if (brgemm_node->is_with_data_repacking()) { + io_layouts.push_back(std::vector{}); + leading_dimensions.push_back(0); + } else { + init_in_scheduling_params(expr->get_input_port_descriptor(1)); } - init_scheduling_params(snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(brgemm_node->output(0))->get_layout(), - brgemm_node->output(0).get_shape()); + init_out_scheduling_params(expr->get_output_port_descriptor(0)); - const auto& A_shape = brgemm_node->get_input_shape(0); + const auto& A_shape = expr->get_input_port_descriptor(0)->get_shape(); const auto& A_layout = io_layouts[0]; - const auto& C_shape = brgemm_node->get_output_shape(0); + const auto& C_shape = expr->get_output_port_descriptor(0)->get_shape(); const auto& C_layout = io_layouts[2]; // We need to find the original M, N, K using the layouts and ordered shapes @@ -777,6 +803,9 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt m_M = brgemm_node->get_input_count(0); m_N = C_shape[get_ordered_idx(C_layout, C_layout.size() - 1)]; + if (brgemm_node->is_with_data_repacking()) + leading_dimensions[1] = rnd_up(m_N, brgemm_copy->get_n_block_size()); + auto brg0Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(0)); auto brg1Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(1)); m_brg0VnniFactor = 4 / brg0Prc.size(); @@ -827,7 +856,7 @@ BrgemmEmitter::BrgemmEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPt brgemmCtx.N = N(n); brgemmCtx.K = K(k); brgemmCtx.LDA = leading_dimensions[0]; - brgemmCtx.LDB = brgemm_node->is_with_data_repacking() ?
rnd_up(m_N, brgemm_copy->get_n_block_size()) : leading_dimensions[1]; + brgemmCtx.LDB = leading_dimensions[1]; brgemmCtx.LDC = leading_dimensions[2]; brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg0Prc)); brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg1Prc)); @@ -1219,23 +1248,14 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(jit_generator* h, cpu_isa_t isa, const Ex if (m_with_comp) m_comp_offset = brgemm_repack->get_offset_compensations(); - const auto& layout = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(brgemm_repack->input(0))->get_layout(); - const auto& original_shape = brgemm_repack->get_input_shape(0); + const auto& in_desc = expr->get_input_port_descriptor(0); + const auto& layout = in_desc->get_layout(); + const auto& original_shape = in_desc->get_shape(); auto transposed_shape = original_shape; size_t leading_dimension = *(original_shape.rbegin()); if (!layout.empty()) { - transposed_shape.resize(layout.size(), 1); - for (size_t i = 0; i < layout.size(); ++i) { - transposed_shape[i] = original_shape[layout[i]]; - } - // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right - // This implies that "3" is the last layout value, otherwise this layout is not supported. - // counting from the end since shape could be prepended with ones - const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; - if (layout.back() != layout.size() - 1 || num_last_dims < 1) - IE_THROW() << "BrgemmRepackEmitter detected invalid layout values: " << - "check that this shape + layout combination is schedulable"; - leading_dimension = std::accumulate(original_shape.end() - num_last_dims, original_shape.end(), 1, std::multiplies()); + transposed_shape = snippets::utils::get_planar_vdims(original_shape, layout); + leading_dimension = BrgemmEmitter::get_in_leading_dim(original_shape, layout); } m_N = *(transposed_shape.rbegin()); diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp index 7019fb14c6ec29..40437eb9898099 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp @@ -367,6 +367,9 @@ class BrgemmEmitter : public jit_emitter { static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); size_t aux_gprs_count() const override; + static size_t get_in_leading_dim(const VectorDims& shape, const std::vector& layout); + static size_t get_out_leading_dim(const VectorDims& shape, const std::vector& layout); + private: void validate_arguments(const std::vector &in, const std::vector &out) const override; void emit_impl(const std::vector& in, diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 96be8734ec0dce..d2dd2b0eda08ce 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -495,6 +495,8 @@ static Config::SnippetsMode getSnippetsMode(const std::mapinput(1)); const auto& brgemm_out_desc = PortDescriptorUtils::get_port_descriptor_ptr(brgemm->output(0)); - const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input_value(0)).get_shape(); - const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input_value(1)).get_shape(); + const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)).get_shape(); + const 
auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)).get_shape(); const auto K = *dimsMatMulIn0.rbegin(); const auto N = *dimsMatMulIn1.rbegin(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp index df88ffa7edcd82..939998c08bd79e 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/set_brgemm_cpu_blocking_params.cpp @@ -35,8 +35,8 @@ pass::SetBrgemmCPUBlockingParams::SetBrgemmCPUBlockingParams() { return false; } - const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input_value(0)).get_shape(); - const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input_value(1)).get_shape(); + const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)).get_shape(); + const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)).get_shape(); const auto K = *dimsMatMulIn0.rbegin(); const auto N = *dimsMatMulIn1.rbegin(); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index d67c5047b992e0..e979270fee3318 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -112,6 +112,7 @@ #include "snippets/pass/mha_tokenization.hpp" #include "snippets/pass/collapse_subgraph.hpp" #include "snippets/pass/common_optimizations.hpp" +#include "snippets/pass/split_dimension_m.hpp" #include "snippets/pass/extract_reshapes_from_mha.hpp" // Misc @@ -612,10 +613,14 @@ void Transformations::MainSnippets(void) { // To avoid situations when Transpose is not the only node between MatMul and Result, // the plugin disables Transpose tokenization on output tokenization_config.mha_token_enable_transpose_on_output = (inferencePrecision == ov::element::f32); - tokenization_config.concurrency = parallel_get_num_threads(); + tokenization_config.concurrency = config.streamExecutorConfig._threadsPerStream; + if (tokenization_config.concurrency == 0) + tokenization_config.concurrency = parallel_get_max_threads(); // The optimization "SplitDimensionM" depends on target machine (thread count).
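A hypothetical standalone illustration of this concurrency resolution (`resolve_concurrency` and its arguments are stand-ins for config.streamExecutorConfig._threadsPerStream and parallel_get_max_threads(); not part of the patch):

    #include <cstddef>

    // Prefer the per-stream thread count from the stream executor config; fall back
    // to the machine-wide maximum when streams are not configured (value 0).
    static size_t resolve_concurrency(int threads_per_stream, int max_threads) {
        return threads_per_stream > 0 ? static_cast<size_t>(threads_per_stream)
                                      : static_cast<size_t>(max_threads);
    }

As the rewritten is_unsupported_parallel_work_amount heuristic below shows, this value then gates MHA tokenization: a subgraph whose parallel work amount (the product of all dimensions except the two innermost) falls below it is rejected unless SplitDimensionM::can_be_optimized reports that the M dimension can be split to raise the work amount.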
// To avoid uncontrolled behavior in tests, we disabled the optimization when there is Config::SnippetsMode::IgnoreCallback tokenization_config.split_m_dimension = snippetsMode != Config::SnippetsMode::IgnoreCallback; + // [122706] Some 3D MHA Patterns have perf regressions when Transpose op is tokenized + tokenization_config.mha_supported_transpose_ranks = { 4 }; ngraph::pass::Manager snippetsManager; snippetsManager.set_per_pass_validation(false); @@ -671,15 +676,10 @@ void Transformations::MainSnippets(void) { return true; }; auto is_unsupported_parallel_work_amount = [&](const std::shared_ptr& n, const ov::Shape& shape) { - const auto parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); - // Heuristic values: - // parallelism work amount - not enough work amount for parallelism - // TODO: The heuristic will be removed after parallelism support on JIT level - const auto needed_num_of_threads = 12lu; + const size_t parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); const auto is_unsupported_parallel_work_amount = - parallel_get_num_threads() / 2 > parallel_work_amount && - static_cast(parallel_work_amount) < needed_num_of_threads && - !ov::snippets::pass::CommonOptimizations::CanOptimizeParallelWA(n, tokenization_config.concurrency); + parallel_work_amount < tokenization_config.concurrency && + !ov::snippets::pass::SplitDimensionM::can_be_optimized(n, tokenization_config.concurrency); return is_unsupported_parallel_work_amount; }; #endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 8193709b479741..b05bf845538859 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -5,6 +5,7 @@ #include "snippets/mha.hpp" #include "common_test_utils/test_constants.hpp" #include "test_utils/cpu_test_utils.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_plugin_config.hpp" #include "ie_system_conf.h" @@ -15,7 +16,7 @@ namespace snippets { namespace { -const std::vector> inputShapes = { +const std::vector> inputShapes_4D = { {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, @@ -23,6 +24,11 @@ const std::vector> inputShapes = { {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}, }; +const std::vector> inputShapes_3D = { + {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, { 68, 6, 92}}, +}; + static inline bool is_bf16_supported() { return InferenceEngine::with_cpu_x86_bfloat16() || InferenceEngine::with_cpu_x86_avx512_core_amx_bf16(); } @@ -40,24 +46,74 @@ static inline std::vector> precision_bf16(size_t coun return prc; } -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA, +static std::map enable_callback() { + return std::map{ + { + InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::ENABLE + }, + }; +} + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_f32(4)), 
::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16, MHA, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, MHA, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_3D), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // [122706]: Subgraph + 4 Transpose + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_SplitDimensionM, MHA, + ::testing::Combine( + ::testing::Values(std::vector{{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}}), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D_SplitDimensionM, MHA, + ::testing::Combine( + ::testing::Values(std::vector{{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}}), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes + ::testing::Values(1), // MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_bf16(4)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), // MHA + 5 Converts + 1 Transpose on output ::testing::Values(6), // MHA + 5 Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -66,10 +122,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16, MHA, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, MHA, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), ::testing::Values(7), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -83,6 +140,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAMulAdd, MHAMulAdd, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false}), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -104,6 +162,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHASelect, ::testing::ValuesIn(precision_f32(6)), ::testing::Values(ov::element::f32), ::testing::Values(false), // Need to 
support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(2), // Less + MHA ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -125,6 +184,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeOnInputs_4D, MHAWOTranspos ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -137,6 +197,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTranspose_4D, MHAWOTranspose, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -149,6 +210,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTranspose_3D, MHAWOTranspose, ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -161,6 +223,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeBF16_4D, MHAWOTranspose, ::testing::ValuesIn(precision_bf16(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -173,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeBF16_3D, MHAWOTranspose, ::testing::ValuesIn(precision_bf16(3)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -185,6 +249,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_4D, MHAWOTrans ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -197,6 +262,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_3D, MHAWOTrans ::testing::ValuesIn(precision_f32(3)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(5), // MHA + 4 extra Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -205,10 +271,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeEnforceBF16_3D, MHAWOTrans INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAINT8MatMul, MHAINT8MatMul, ::testing::Combine( - 
::testing::ValuesIn(std::vector>(inputShapes.begin(), inputShapes.begin() + 2)), + ::testing::ValuesIn(std::vector>(inputShapes_4D.begin(), inputShapes_4D.begin() + 2)), ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -221,18 +288,20 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAQuantMatMul0, MHAQuantMatMul0, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(8), // FQ on input + MHA + Transpose on output + 4 Reshapes + Deq Mul ::testing::Values(3), // FQ on input + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpuEmptyPluginConfig)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul, MHAFQAfterMatMul, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul_4D, MHAFQAfterMatMul, ::testing::Combine( - ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(inputShapes_4D), ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(3), // MHA + Transpose on output + Deq Mul ::testing::Values(2), // MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -245,6 +314,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQ, MHAFQ, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), ::testing::Values(7), // Transposex2 + Subgraphsx5 ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -261,6 +331,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHATransposedB, MHATransposedB, ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(2), ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), @@ -282,6 +353,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWithExtractedReshape, MHAWithExtracte ::testing::Values(std::vector{}), ::testing::Values(ov::element::f32), ::testing::ValuesIn({true}), // False is not supported for graph builder in tests + ::testing::Values(MHA::default_thread_count), ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA ::testing::Values(2), // Extracted Add + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp index 4862bdabf03419..4212102e5698e0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp @@ -11,11 +11,24 @@ namespace snippets { namespace { -std::vector input_shapes{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose, Transpose, 
+std::vector input_shapes_4D{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; +std::vector input_shapes_3D{{3, 5, 13}, {3, 2, 4}, {7, 1, 4}}; + +std::vector> orders_4D{{0, 2, 3, 1}}; +std::vector> orders_3D{{1, 2, 0}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose_3D, Transpose, + ::testing::Combine( + ::testing::ValuesIn(input_shapes_3D), + ::testing::ValuesIn(orders_3D), + ::testing::Values(1), // Transpose + ::testing::Values(1), // Tokenized Transpose + ::testing::Values(ov::test::utils::DEVICE_CPU)), + Transpose::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose_4D, Transpose, ::testing::Combine( - ::testing::ValuesIn(input_shapes), - ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::ValuesIn(input_shapes_4D), + ::testing::ValuesIn(orders_4D), ::testing::Values(1), // Transpose ::testing::Values(1), // Tokenized Transpose ::testing::Values(ov::test::utils::DEVICE_CPU)), @@ -25,7 +38,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMul, TransposeMul, ::testing::Combine( ::testing::Values(ov::PartialShape {2, 31, 3, 5}), ::testing::ValuesIn(std::vector{{2, 3, 5, 31}}), - ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::Values(std::vector {0, 2, 3, 1}), ::testing::Values(1), // Transpose ::testing::Values(1), // Tokenized Transpose ::testing::Values(ov::test::utils::DEVICE_CPU)), diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp index 3a760050d0159a..ced190761843de 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp @@ -10,7 +10,7 @@ #include "snippets/op/scalar.hpp" #include "lowering_utils.hpp" #include "common_test_utils/common_utils.hpp" -#include "snippets/pass_manager.hpp" +#include "snippets/pass/manager.hpp" namespace ov { namespace test { diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 1a196ea49e8e95..a67bbf1570ff13 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -131,6 +131,7 @@ struct program { topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); @@ -251,6 +252,14 @@ struct program { bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); + static ptr build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal = false, + bool no_optimizations = false, + bool is_body_program = false); static ptr build_program(engine& engine, const std::set>& nodes, const ExecutionConfig& config, @@ -266,9 +275,11 @@ struct program { ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } ICompilationContext& get_compilation_context() const { return *_compilation_context; } + std::shared_ptr get_compilation_context_ptr() const { return _compilation_context; } void cancel_compilation_context(); static std::shared_ptr make_task_executor(const ExecutionConfig& config); + static std::shared_ptr make_compilation_context(const ExecutionConfig& config); private: uint32_t prog_id = 0; @@ -286,8 +297,7 @@ struct 
program { bool is_body_program; std::unique_ptr _impls_cache; const size_t _impls_cache_capacity = 10000; - const int _num_async_build_threads = 1; - std::unique_ptr _compilation_context; + std::shared_ptr _compilation_context; std::map> nodes_map; std::list optimized_out; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index 22864106fb39f5..422451d096729b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -10,6 +10,7 @@ #include "intel_gpu/plugin/custom_layer.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/execution_config.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/topology.hpp" #include "intel_gpu/graph/program.hpp" @@ -75,7 +76,9 @@ class ProgramBuilder final { public: ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool createTopologyOnly = false, bool partialBuild = false, - std::shared_ptr task_executor = nullptr, bool innerProgram = false); + std::shared_ptr task_executor = nullptr, + std::shared_ptr compilation_context = nullptr, + bool innerProgram = false); ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config); static const cldnn::primitive_id m_preProcessTag; @@ -136,6 +139,7 @@ class ProgramBuilder final { bool requires_new_shape_infer(const ov::Node& op) const; std::shared_ptr get_task_executor() const { return m_task_executor; } + std::shared_ptr get_compilation_context() const { return m_compilation_context; } private: static factories_map_t factories_map; @@ -153,6 +157,7 @@ class ProgramBuilder final { bool queryMode; std::shared_ptr m_task_executor; + std::shared_ptr m_compilation_context; void EnableQueryMode() { queryMode = true; } void DisableQueryMode() { queryMode = false; } diff --git a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp similarity index 83% rename from src/plugins/intel_gpu/src/graph/include/compilation_context.hpp rename to src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp index be8d65c6aa5ecc..f664e728680b62 100644 --- a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp @@ -4,10 +4,10 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" #include #include #include "intel_gpu/graph/kernel_impl_params.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" namespace cldnn { @@ -21,7 +21,7 @@ class ICompilationContext { virtual void cancel() = 0; virtual void wait_all() = 0; - static std::unique_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); + static std::shared_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp index c1f483200c9a38..df2fad3412286b 100644 --- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp +++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "compilation_context.hpp" #include #include #include #include #include "intel_gpu/runtime/utils.hpp" +#include 
"intel_gpu/runtime/compilation_context.hpp" + +#include "openvino/runtime/threading/cpu_streams_executor.hpp" namespace cldnn { class CompilationContext : public ICompilationContext { @@ -83,7 +85,7 @@ class CompilationContext : public ICompilationContext { std::vector> futures; }; -std::unique_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { +std::shared_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { return cldnn::make_unique(task_executor_config); } diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 240db96d5b4988..c8a081dadbc45f 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -13,6 +13,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -34,7 +35,6 @@ #include "program_helpers.h" #include "to_string_utils.h" #include "kernels_cache.hpp" -#include "compilation_context.hpp" // TODO: Remove once we have an abstraction for kernels_cache #include "kernel_base.h" diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index a81d0bd10ad58d..58ecac8e776b39 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -25,7 +25,6 @@ #include "read_value_inst.h" #include "condition_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" -#include "compilation_context.hpp" #include "implementation_map.hpp" #include "graph_optimizer/prepare_buffer_fusing.h" @@ -36,6 +35,7 @@ #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/error_handler.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "json_object.h" #include @@ -1502,7 +1502,13 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() { ov::intel_gpu::allow_static_input_reorder(true), ov::intel_gpu::allow_new_shape_infer(true) }; - auto prog = program::build_program(get_network().get_engine(), t, subgraph_config, get_network().get_program()->get_task_executor(), true, false); + auto prog = program::build_program(get_network().get_engine(), + t, + subgraph_config, + get_network().get_program()->get_task_executor(), + get_network().get_program()->get_compilation_context_ptr(), + true, + false); _unfused_subgraph = network::allocate_network(get_network().get_stream_ptr(), prog, true, get_network().is_primary_stream()); } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 59af7125f9e4dc..dde29dc1e32504 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -8,6 +8,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/program.hpp" #include "auto_tuner.h" @@ -17,7 +18,6 @@ #include "program_dump_graph.h" #include "sliding_window_utils.hpp" #include "program_helpers.h" -#include "compilation_context.hpp" #include "matrix_nms_inst.h" #include "roi_pooling_inst.h" @@ -145,10 +145,17 @@ std::shared_ptr 
program::make_task_executor(con return std::make_shared(task_executor_config); } +std::shared_ptr program::make_compilation_context(const ExecutionConfig& config) { + const int _num_async_build_threads = 1; + return ICompilationContext::create(make_task_executor_config(config, + "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); +} + program::program(engine& engine_ref, topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal, bool no_optimizations, bool is_body_program) @@ -158,7 +165,8 @@ program::program(engine& engine_ref, _task_executor(std::move(task_executor)), processing_order(), is_internal(is_internal), - is_body_program(is_body_program) { + is_body_program(is_body_program), + _compilation_context(compilation_context) { _config.apply_user_properties(_engine.get_device_info()); init_primitives(); GPU_DEBUG_INFO << "Program config\n" << config.to_string(); @@ -214,8 +222,8 @@ void program::init_program() { _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, kernel_selector::KernelBase::get_db().get_batch_header_str())); - _compilation_context = ICompilationContext::create(make_task_executor_config(_config, - "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); + if (!_compilation_context) + _compilation_context = program::make_compilation_context(_config); _impls_cache = cldnn::make_unique(_impls_cache_capacity); // Remove items of compilation context's internal queue when some impl is popped in kernels_cache @@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program); +} + +program::ptr program::build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal, + bool no_optimizations, + bool is_body_program) { + return std::make_shared(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, @@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, diff --git a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp index c25726f673a2f8..7d47d1127fe57d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp @@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_ config.set_property(ov::intel_gpu::max_dynamic_batch(1)); config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic())); - ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true); + ProgramBuilder prog(internal_body, 
p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true); branch.inner_program = prog.get_compiled_program(); auto& input_map = branch.input_map; diff --git a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp index 628b0d7c37d9aa..af93885a5d949c 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp @@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr& op) { ProgramBuilder::ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool create_topology_only, bool partial_build, - std::shared_ptr task_executor, bool is_inner_program) + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_inner_program) : m_config(config) , m_engine(engine) , queryMode(false) - , m_task_executor(task_executor) { + , m_task_executor(task_executor) + , m_compilation_context(compilation_context) { if (m_task_executor == nullptr) m_task_executor = cldnn::program::make_task_executor(m_config); + if (m_compilation_context == nullptr) { + m_compilation_context = cldnn::program::make_compilation_context(m_config); + } // locate global custom kernel config // and auto-load kernels from it #ifdef _WIN32 @@ -158,7 +164,14 @@ std::shared_ptr ProgramBuilder::build(const std::vector()); } diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp index 34cc90d791a756..017ad15981cf91 100644 --- a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp @@ -7,9 +7,9 @@ #include #include #include +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" -#include "compilation_context.hpp" #include "program_wrapper.h" diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index dc23440c48af67..2f684a40f7f5ec 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -14,7 +14,7 @@ #include #include -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index a90edc00a2db98..247453944e3a4a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -10,7 +10,7 @@ #include #include "openvino/reference/matmul.hpp" -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "gemm_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp index a13c1d1550882f..ed52f276fa5960 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp @@ -7,7 +7,7 @@ #include #include #include "openvino/reference/group_normalization.hpp" -#include 
"compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" using namespace cldnn; diff --git a/src/plugins/template/src/remote_context.cpp b/src/plugins/template/src/remote_context.cpp index 2003cdf48ed93f..51d4f7a0211d77 100644 --- a/src/plugins/template/src/remote_context.cpp +++ b/src/plugins/template/src/remote_context.cpp @@ -26,7 +26,7 @@ class VectorTensorImpl : public ov::IRemoteTensor { m_strides.clear(); if (!shape.empty()) { m_strides.resize(shape.size()); - m_strides.back() = m_element_type.size(); + m_strides.back() = shape.back() == 0 ? 0 : m_element_type.size(); std::copy(shape.rbegin(), shape.rend() - 1, m_strides.rbegin() + 1); std::partial_sum(m_strides.rbegin(), m_strides.rend(), m_strides.rbegin(), std::multiplies()); } diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp b/src/tests/functional/plugin/shared/include/snippets/mha.hpp index 1a922d215fa058..547fa323cf4b18 100644 --- a/src/tests/functional/plugin/shared/include/snippets/mha.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp @@ -16,6 +16,7 @@ typedef std::tuple< std::vector, // Input Element types ov::element::Type, // Inference precision bool, // With Multiply + size_t, // Thread count size_t, // Expected num nodes size_t, // Expected num subgraphs std::string, // Target Device @@ -27,13 +28,17 @@ class MHA : public testing::WithParamInterface, public: static std::string getTestCaseName(testing::TestParamInfo obj); + constexpr static size_t default_thread_count = 0; + protected: void SetUp() override; + void compile_model() override; void generate_inputs(const std::vector& targetInputStaticShapes) override; virtual std::shared_ptr get_subgraph(); bool m_with_mul = false; + size_t m_thread_count; std::vector m_input_types; }; diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp index 3017fe55a83a44..c21a754b0ad901 100644 --- a/src/tests/functional/plugin/shared/src/snippets/mha.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -18,10 +18,11 @@ std::string MHA::getTestCaseName(testing::TestParamInfo elem_types; ov::element::Type prc; bool withMul; + size_t thread_count; std::string targetDevice; size_t num_nodes, num_subgraphs; std::map additionalConfig; - std::tie(inputShapes, elem_types, prc, withMul, num_nodes, num_subgraphs, targetDevice, additionalConfig) = obj.param; + std::tie(inputShapes, elem_types, prc, withMul, thread_count, num_nodes, num_subgraphs, targetDevice, additionalConfig) = obj.param; std::ostringstream result; for (size_t i = 0; i < inputShapes.size(); ++i) @@ -29,6 +30,7 @@ std::string MHA::getTestCaseName(testing::TestParamInfo inputShapes; ov::element::Type prc; std::map additionalConfig; - std::tie(inputShapes, m_input_types, prc, m_with_mul, ref_num_nodes, ref_num_subgraphs, targetDevice, additionalConfig) = this->GetParam(); + std::tie(inputShapes, m_input_types, prc, m_with_mul, m_thread_count, + ref_num_nodes, ref_num_subgraphs, targetDevice, additionalConfig) = this->GetParam(); init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); const auto subgraph_model = get_subgraph(); @@ -66,6 +69,12 @@ void MHA::SetUp() { rel_threshold = 0.05f; } +void MHA::compile_model() { + if (m_thread_count != default_thread_count) + core->set_property(targetDevice, ov::inference_num_threads(m_thread_count)); + SubgraphBaseTest::compile_model(); +} + void MHA::generate_inputs(const std::vector& targetInputStaticShapes) { 
inputs.clear(); const auto& model_inputs = function->inputs(); diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 0c6521dba84e95..57f7bf30e3c860 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -56,6 +56,19 @@ class MHAFunction : public SnippetsFunctionBase { std::vector precisions; }; +class MHASplitMFunction : public MHAFunction { +public: + explicit MHASplitMFunction(const std::vector& inputShapes, const std::vector& precisions, + const std::vector& reshapes, bool with_mul = true) + : MHAFunction(inputShapes, precisions, with_mul), reshapes(reshapes) { + OPENVINO_ASSERT(reshapes.size() == 5, "Got invalid number of Reshape shapes"); + } +protected: + std::shared_ptr initReference() const override; + + std::vector reshapes; +}; + /* Graph: * Transpose1[0,2,1,3] Constant * \ / diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index fdefcf03d9dd19..661af347dd4574 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -13,6 +13,40 @@ namespace ov { namespace test { namespace snippets { +namespace { +std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) { + OPENVINO_ASSERT(rank > 2, "Incorrect rank for testing"); + auto order = std::vector(rank); + std::iota(order.begin(), order.end(), 0); + const auto diff = rank - default_order.size(); + for (size_t i = 0; i < default_order.size(); ++i) { + order[diff + i] = default_order[i] + diff; + } + return order; +} +std::vector get_fusion_order(size_t rank) { + return get_rank_equivalent_order({1, 0, 2}, rank); +} +std::vector get_decomposed_order(size_t rank) { + return get_rank_equivalent_order({1, 2, 0}, rank); +} +std::vector get_fusion_order_after_split_m(size_t rank, bool is_input) { + if (rank == 4) { + return is_input ? std::vector{2, 0, 1, 3} : std::vector{1, 2, 0, 3}; + } else if (rank == 5) { + return is_input ? 
diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
index 0c6521dba84e95..57f7bf30e3c860 100644
--- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
+++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp
@@ -56,6 +56,19 @@ class MHAFunction : public SnippetsFunctionBase {
     std::vector precisions;
 };
 
+class MHASplitMFunction : public MHAFunction {
+public:
+    explicit MHASplitMFunction(const std::vector& inputShapes, const std::vector& precisions,
+                               const std::vector& reshapes, bool with_mul = true)
+        : MHAFunction(inputShapes, precisions, with_mul), reshapes(reshapes) {
+        OPENVINO_ASSERT(reshapes.size() == 5, "Got invalid number of Reshape shapes");
+    }
+protected:
+    std::shared_ptr initReference() const override;
+
+    std::vector reshapes;
+};
+
 /* Graph:
  *       Transpose1[0,2,1,3]  Constant
  *                    \       /
diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
index fdefcf03d9dd19..661af347dd4574 100644
--- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
+++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
@@ -13,6 +13,40 @@ namespace ov {
 namespace test {
 namespace snippets {
 
+namespace {
+std::vector get_rank_equivalent_order(std::vector default_order, size_t rank) {
+    OPENVINO_ASSERT(rank > 2, "Incorrect rank for testing");
+    auto order = std::vector(rank);
+    std::iota(order.begin(), order.end(), 0);
+    const auto diff = rank - default_order.size();
+    for (size_t i = 0; i < default_order.size(); ++i) {
+        order[diff + i] = default_order[i] + diff;
+    }
+    return order;
+}
+std::vector get_fusion_order(size_t rank) {
+    return get_rank_equivalent_order({1, 0, 2}, rank);
+}
+std::vector get_decomposed_order(size_t rank) {
+    return get_rank_equivalent_order({1, 2, 0}, rank);
+}
+std::vector get_fusion_order_after_split_m(size_t rank, bool is_input) {
+    if (rank == 4) {
+        return is_input ? std::vector{2, 0, 1, 3} : std::vector{1, 2, 0, 3};
+    } else if (rank == 5) {
+        return is_input ? std::vector{0, 3, 1, 2, 4} : std::vector{0, 2, 3, 1, 4};
+    }
+    OPENVINO_THROW("Incorrect rank for testing");
+}
+std::vector get_decomposed_order_after_split_m(size_t rank) {
+    if (rank == 4) {
+        return std::vector{1, 2, 3, 0};
+    } else if (rank == 5) {
+        return std::vector{0, 2, 3, 4, 1};
+    }
+    OPENVINO_THROW("Incorrect rank for testing");
+}
+}  // namespace
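The helpers above generalize the familiar 4D MHA transpose orders to any rank by embedding a 3-element permutation into an identity permutation of the target length. A standalone re-implementation for illustration (explicit types added here; the real helpers live in the anonymous namespace above):

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Embed a permutation of the last default_order.size() axes into an
// identity permutation of length rank, shifting the pattern right.
std::vector<size_t> rank_equivalent_order(std::vector<size_t> default_order, size_t rank) {
    std::vector<size_t> order(rank);
    std::iota(order.begin(), order.end(), 0);           // identity: 0, 1, ..., rank-1
    const size_t diff = rank - default_order.size();
    for (size_t i = 0; i < default_order.size(); ++i)
        order[diff + i] = default_order[i] + diff;      // re-based pattern
    return order;
}

int main() {
    // get_fusion_order uses {1, 0, 2}: rank 4 -> 0 2 1 3 (the classic MHA
    // transpose), rank 5 -> 0 1 3 2 4.
    for (size_t v : rank_equivalent_order({1, 0, 2}, 4)) std::cout << v << ' ';
    std::cout << '\n';
    // get_decomposed_order uses {1, 2, 0}: rank 4 -> 0 2 3 1.
    for (size_t v : rank_equivalent_order({1, 2, 0}, 4)) std::cout << v << ' ';
    std::cout << '\n';
}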
 
 std::shared_ptr MHAFunction::initOriginal() const {
     auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
@@ -21,48 +55,40 @@ std::shared_ptr MHAFunction::initOriginal() const {
     auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
     ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param};
 
-    std::vector constantShapes;
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1}));
-    constantShapes.push_back(ov::Shape({2}));
-    constantShapes.push_back(ov::Shape({4}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-
-    auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3});
-    auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1});
-    auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3});
-    auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3});
-
-    std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] *
-                                                 input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]),
-                                     -1};
-    auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData);
+    const auto rank = input_shapes[0].size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
 
-    std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]),
-                                     static_cast(input_shapes[0].get_shape()[2]),
-                                     static_cast(input_shapes[0].get_shape()[1]),
-                                     static_cast(input_shapes[0].get_shape()[1])};
-    auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, decomposed_order);
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
 
-    float transA = false;
-    float transB = false;
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
     const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
     std::shared_ptr matmul_parent1 = transpose1;
     if (with_mul) {
-        std::vector mulConstData(ngraph::shape_size(constantShapes[2]));
-        auto mulConst = ngraph::builder::makeConstant(precisions[1], constantShapes[2], mulConstData, true);
+        ov::Shape shape(rank, 1);
+        shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3];
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
         matmul_parent1 = std::make_shared(transpose1, mulConst);
     }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB);
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
     const auto add = std::make_shared(matMul0, addParam);
+
+    const auto interm_shape = add->get_output_shape(0);
+    const auto batch = std::accumulate(interm_shape.cbegin(), interm_shape.cbegin() + rank - 1, 1, std::multiplies());
+    const auto reshape0ConstData = std::vector{ batch, -1 };
+    const auto reshape1ConstData = interm_shape;
+    const auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{reshape0ConstData.size()}, reshape0ConstData);
+    const auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{reshape1ConstData.size()}, reshape1ConstData);
+
     const auto reshape0 = std::make_shared(add, reshape0Const, true);
     const auto softMax = std::make_shared(reshape0, 1);
     const auto reshape1 = std::make_shared(softMax, reshape1Const, true);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
-    const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB);
+    const auto matMul1 = std::make_shared(reshape1, transpose2);
     const auto transpose3 = std::make_shared(matMul1, transpose3Const);
 
     ngraph::ResultVector results{std::make_shared(transpose3)};
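initOriginal() now derives the Reshape constants from the Add output shape instead of hard-coding the 4D case: every dimension except the last collapses into one batch axis so Softmax can run on axis 1, and the second Reshape restores the original shape. The arithmetic with a concrete, hypothetical attention-score shape:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Rank-agnostic version of the reshape math above.
    const std::vector<int64_t> interm_shape = {1, 12, 128, 128};  // hypothetical MHA scores
    const size_t rank = interm_shape.size();

    // Product of all leading dimensions: 1 * 12 * 128 = 1536.
    const int64_t batch = std::accumulate(interm_shape.cbegin(),
                                          interm_shape.cbegin() + rank - 1,
                                          int64_t{1},
                                          std::multiplies<int64_t>());
    const std::vector<int64_t> reshape0 = {batch, -1};   // flatten to {1536, -1}
    const std::vector<int64_t> reshape1 = interm_shape;  // restore {1, 12, 128, 128}

    std::cout << "flatten to {" << reshape0[0] << ", " << reshape0[1] << "}\n";
}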
@@ -81,53 +107,36 @@ std::shared_ptr MHAFunction::initReference() const {
     auto addParam = std::make_shared(precisions[2], input_shapes[2]);
     auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
 
-    std::vector constantShapes;
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1}));
-    constantShapes.push_back(ov::Shape({2}));
-    constantShapes.push_back(ov::Shape({4}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-    constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()}));
-
-    auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3});
-    auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1});
-    auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3});
-    auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3});
-
     ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
 
-    std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] *
-                                                 input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]),
-                                     -1};
-    auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData);
+    const auto rank = input_shapes[0].size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
 
-    std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]),
-                                     static_cast(input_shapes[0].get_shape()[2]),
-                                     static_cast(input_shapes[0].get_shape()[1]),
-                                     static_cast(input_shapes[0].get_shape()[1])};
-    auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData);
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, decomposed_order);
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, fusion_order);
 
-    float transA = false;
-    float transB = false;
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
     const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
     std::shared_ptr matmul_parent1 = transpose1;
     if (with_mul) {
-        std::vector mulConstData(ngraph::shape_size(constantShapes[2]));
-        auto mulConst = ngraph::builder::makeConstant(precisions[1], constantShapes[2], mulConstData, true);
-        auto mulParam = std::make_shared(precisions[1], mulConst->get_shape());
+        ov::Shape shape(rank, 1);
+        shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3];
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
+        const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape());
         matmul_parent1 = std::make_shared(transpose1, mulParam);
         subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
         subgraph_inputs = {data0, data1, mulConst, data2, data3};
     }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB);
+
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
     const auto add = std::make_shared(matMul0, addParam);
-    const auto reshape0 = std::make_shared(add, reshape0Const, true);
-    const auto softMax = std::make_shared(reshape0, 1);
-    const auto reshape1 = std::make_shared(softMax, reshape1Const, true);
+    const auto softMax = std::make_shared(add, rank - 1);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
-    const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB);
+    const auto matMul1 = std::make_shared(softMax, transpose2);
     const auto transpose3 = std::make_shared(matMul1, transpose3Const);
 
     auto subgraph = std::make_shared(subgraph_inputs,
@@ -135,6 +144,70 @@ std::shared_ptr MHAFunction::initReference() const {
     return std::make_shared(NodeVector{subgraph}, ngraphParams);
 }
 
+std::shared_ptr MHASplitMFunction::initReference() const {
+    auto data0 = std::make_shared(precisions[0], input_shapes[0]);
+    auto data1 = std::make_shared(precisions[1], input_shapes[1]);
+    auto data2 = std::make_shared(precisions[2], input_shapes[2]);
+    auto data3 = std::make_shared(precisions[3], input_shapes[3]);
+    ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3};
+
+    auto make_reshape = [](const std::shared_ptr& node, const ov::Shape& new_shape) {
+        auto shape_const = ngraph::builder::makeConstant(ngraph::element::i32, {new_shape.size()}, new_shape);
+        return std::make_shared(node, shape_const, true);
+    };
+
+    auto reshape0 = make_reshape(data0, reshapes[0]);
+    auto reshape1 = make_reshape(data1, reshapes[1]);
+    auto reshape2 = make_reshape(data2, reshapes[2]);
+    auto reshape3 = make_reshape(data3, reshapes[3]);
+    NodeVector subgraph_inputs = {reshape0, reshape1, reshape2, reshape3};
+
+    auto transpose0Param = std::make_shared(precisions[0], reshape0->get_shape());
+    auto transpose1Param = std::make_shared(precisions[1], reshape1->get_shape());
+    auto addParam = std::make_shared(precisions[2], reshape2->get_shape());
+    auto transpose2Param = std::make_shared(precisions[3], reshape3->get_shape());
+    ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
+
+    const auto rank = input_shapes[0].size() + 1;
+
+    const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
+    const auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_decomposed_order_after_split_m(rank));
+    const auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
+    const auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, false));
+
+    const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
+    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
+
+    std::shared_ptr matmul_parent1 = transpose1;
+    if (with_mul) {
+        ov::Shape shape(rank - 1, 1);
+        shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4];
+        ov::Shape reshape_shape = shape;
+        reshape_shape.insert(reshape_shape.cbegin() + rank - 3, 1);
+        std::vector mulConstData(ngraph::shape_size(shape));
+        const auto mulConst = ngraph::builder::makeConstant(precisions[1], shape, mulConstData, true);
+        const auto reshape_mul = make_reshape(mulConst, reshape_shape);
+        const auto mulParam = std::make_shared(precisions[1], reshape_mul->get_shape());
+        matmul_parent1 = std::make_shared(transpose1, mulParam);
+        subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
+        subgraph_inputs = {reshape0, reshape1, reshape_mul, reshape2, reshape3};
+    }
+
+    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
+    const auto add = std::make_shared(matMul0, addParam);
+    const auto softMax = std::make_shared(add, rank - 1);
+    const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
+    const auto matMul1 = std::make_shared(softMax, transpose2);
+    const auto transpose3 = std::make_shared(matMul1, transpose3Const);
+
+    const auto subgraph = std::make_shared(subgraph_inputs,
+                                           std::make_shared(ov::OutputVector{transpose3},
+                                                            subgraph_params));
+
+    auto reshape4 = make_reshape(subgraph, reshapes[4]);
+    ngraph::ResultVector results{std::make_shared(reshape4)};
+    return std::make_shared(results, ngraphParams, "mha");
+}
 
 std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const {
     auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
diff --git a/tests/layer_tests/pytorch_tests/test_convnd.py b/tests/layer_tests/pytorch_tests/test_convnd.py
index 8b46b2992d2c07..8ea83a5aae5dc1 100644
--- a/tests/layer_tests/pytorch_tests/test_convnd.py
+++ b/tests/layer_tests/pytorch_tests/test_convnd.py
@@ -216,6 +216,7 @@ def forward(self, x, y):
 
     @pytest.mark.nightly
     @pytest.mark.precommit
+    @pytest.mark.xfail(reason="ticket 123727")
     def test_conv2d(self, ie_device, precision, ir_version):
         self._test(*self.create_model(), ie_device, precision, ir_version, freeze_model=True, dynamic_shapes=False)
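MHASplitMFunction::initReference() above is the reference pattern for the SplitDimensionM optimization: the extra Reshapes around the Subgraph split the M dimension so the batch-like portion of the MatMul grows and more parallel work becomes available. A shape-level sketch of the idea (the split factor is hypothetical and must divide M):

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// [B, M, K] -> [B, M/split, split, K]: the first two axes together now
// carry B * (M/split) independent slices instead of B, which is what
// raises the parallel work amount for the MatMuls inside the Subgraph.
std::vector<size_t> split_m(const std::vector<size_t>& shape, size_t split) {
    assert(shape.size() == 3 && shape[1] % split == 0 && "illustration only");
    return {shape[0], shape[1] / split, split, shape[2]};
}

int main() {
    const auto s = split_m({1, 128, 64}, 32);  // -> {1, 4, 32, 64}
    for (auto d : s) std::cout << d << ' ';    // 1*4 parallel blocks instead of 1
    std::cout << '\n';
}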