diff --git a/src/common/snippets/include/snippets/lowered/expression.hpp b/src/common/snippets/include/snippets/lowered/expression.hpp index d3367c2abc6475..3be336599bfdcd 100644 --- a/src/common/snippets/include/snippets/lowered/expression.hpp +++ b/src/common/snippets/include/snippets/lowered/expression.hpp @@ -4,14 +4,13 @@ #pragma once -#include - #include #include -#include "snippets/tensor_descriptor.hpp" #include "snippets/emitter.hpp" #include "snippets/target_machine.hpp" +#include "snippets/lowered/tensor.hpp" +#include "snippets/lowered/expression_port.hpp" namespace ngraph { @@ -19,43 +18,15 @@ namespace snippets { namespace lowered { class LinearIR; -class Expression; -using ExpressionPtr = std::shared_ptr; - -class ExpressionPort { - friend class Expression; - -public: - enum Type { - Input, - Output - }; - - ExpressionPort() = default; - - Type get_type() const { return m_type; } - - ExpressionPtr expr = nullptr; - size_t port = 0; - -private: - ExpressionPort(const ExpressionPtr& expr, size_t port, Type type); - - Type m_type = Type::Input; -}; class Expression : public std::enable_shared_from_this { friend class LinearIR; + friend class ExpressionPort; public: static size_t LOOP_NULL_ID; Expression() = default; - explicit Expression(const std::shared_ptr& n); - // The ctor fills outputs automatically from rt_info and/or tensor shapes - explicit Expression(const std::shared_ptr& n, std::vector inputs); - explicit Expression(const std::shared_ptr& n, std::vector inputs, std::vector outputs); - virtual ~Expression() = default; std::shared_ptr get_node() const; @@ -64,53 +35,66 @@ class Expression : public std::enable_shared_from_this { RegInfo get_reg_info() const { return m_reg_info; } void set_reg_info(RegInfo rinfo) { m_reg_info = std::move(rinfo); } - const std::vector& get_inputs() { return m_inputs; } - const std::vector& get_outputs() { return m_outputs; } + const TensorPtr& get_input_tensor(size_t i) const; + const TensorPtr& 
get_output_tensor(size_t i) const; + std::vector get_input_tensors() const { return m_input_tensors; } + std::vector get_output_tensors() const { return m_output_tensors; } + + const PortDescriptorPtr& get_input_port_descriptor(size_t i) const; + const PortDescriptorPtr& get_output_port_descriptor(size_t i) const; + std::vector get_input_port_descriptors() const { return m_input_port_descriptors; } + std::vector get_output_port_descriptors() const { return m_output_port_descriptors; } + + size_t get_input_count() const { return m_input_tensors.size(); } + size_t get_output_count() const { return m_output_tensors.size(); } std::vector get_loop_ids() const { return m_loop_ids; } void set_loop_ids(const std::vector& loops) { m_loop_ids = loops; } void set_loop_id(size_t id, size_t idx); void remove_loop_id(size_t id); - bool is_outside_loop() const { return m_is_outside_loop; } + void validate() const; void init_emitter(const std::shared_ptr& target); - ExpressionPort input_port(size_t i); - ExpressionPort output_port(size_t i); + ExpressionPort get_input_port(size_t i); + ExpressionPort get_output_port(size_t i); protected: - void replace_input(size_t port, TensorDescriptorPtr to); - void replace_output(size_t port, TensorDescriptorPtr to); + // Note: The constructor and tensor initialization are private since an expression can be created only by Linear IR. + // These methods must be used only by Linear IR builder of expressions! 
+ explicit Expression(const std::shared_ptr& n); + + void replace_input(size_t port, TensorPtr to); std::shared_ptr m_source_node{nullptr}; std::shared_ptr m_emitter{nullptr}; - std::vector m_inputs; - std::vector m_outputs; + std::vector m_input_tensors{}; + std::vector m_output_tensors{}; + std::vector m_input_port_descriptors{}; + std::vector m_output_port_descriptors{}; RegInfo m_reg_info{{}, {}}; // The order Loops identifies: Outer ---> Inner std::vector m_loop_ids; - bool m_is_outside_loop = false; }; +using ExpressionPtr = std::shared_ptr; class IOExpression : public Expression { + friend class LinearIR; + public: enum class io_type {INPUT, OUTPUT, UNDEFINED}; - IOExpression(const std::shared_ptr& n, int64_t index); - IOExpression(const std::shared_ptr& n, int64_t index, std::vector inputs); - int64_t get_index() const { return m_index; } io_type get_type() const { return m_type; } private: + explicit IOExpression(const std::shared_ptr& n, int64_t index); + explicit IOExpression(const std::shared_ptr& n, int64_t index); + int64_t m_index = -1; io_type m_type = io_type::UNDEFINED; }; -bool operator==(const ExpressionPort& lhs, const ExpressionPort& rhs); -bool operator!=(const ExpressionPort& lhs, const ExpressionPort& rhs); -bool operator<(const ExpressionPort& lhs, const ExpressionPort& rhs); - } // namespace lowered } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/include/snippets/lowered/expression_factory.hpp b/src/common/snippets/include/snippets/lowered/expression_factory.hpp new file mode 100644 index 00000000000000..af6a1b74e6c021 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/expression_factory.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "linear_ir.hpp" + +#include "snippets/snippets_isa.hpp" + +namespace ngraph { +namespace snippets { +namespace lowered { + +class LinearIR::ExpressionFactory { +public: + 
template + static ExpressionPtr build(const std::shared_ptr& n, Args&&... params) { + if (const auto par = ov::as_type_ptr(n)) { + return create(par, params...); + } else if (const auto res = ov::as_type_ptr(n)) { + return create(res, params...); + } else if (const auto loop_begin = ov::as_type_ptr(n)) { + return create(loop_begin, params...); + } else if (const auto loop_end = ov::as_type_ptr(n)) { + return create(loop_end, params...); + } + return create(n, params...); + } + +private: + /* -- Default Builders - initialize input tensors from parents and create new output tensors themselves */ + static ExpressionPtr create(const std::shared_ptr& par, const LinearIR& linear_ir, + const std::shared_ptr& model); + static ExpressionPtr create(const std::shared_ptr& res, const LinearIR& linear_ir, + const std::shared_ptr& model); + static ExpressionPtr create(const std::shared_ptr& n, const LinearIR& linear_ir, + const std::shared_ptr& model); + + /* -- Input Builders - get input tensors from method parameters and create new output tensors themselves */ + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs); + + // Creates inputs for expression using parent output tensors + static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr); + // Creates new output tensors + static void create_expression_outputs(const ExpressionPtr& expr); + // The method verifies of input tensors to availability of the expression as consumer and add it if missed + static void init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs); +}; + +} // namespace lowered +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/lowered/expression_port.hpp b/src/common/snippets/include/snippets/lowered/expression_port.hpp 
new file mode 100644 index 00000000000000..bb4ce7366a9a03 --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/expression_port.hpp @@ -0,0 +1,51 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "port_descriptor.hpp" + + +namespace ngraph { +namespace snippets { +namespace lowered { + +class Tensor; +class Expression; +class ExpressionPort { +public: + enum Type { + Input, + Output + }; + + ExpressionPort() = default; + explicit ExpressionPort(const std::shared_ptr& expr, Type type, size_t port); + + const std::shared_ptr& get_expr() const { return m_expr; } + Type get_type() const { return m_type; } + size_t get_index() const { return m_port_index; } + + const PortDescriptorPtr& get_descriptor_ptr() const; + const std::shared_ptr& get_tensor_ptr() const; + // Returns connected ports to the current: + // - Input port returns one source (parent) port + // - Output port returns all consumer ports (children) + std::set get_connected_ports() const; + + friend bool operator==(const ExpressionPort& lhs, const ExpressionPort& rhs); + friend bool operator!=(const ExpressionPort& lhs, const ExpressionPort& rhs); + friend bool operator<(const ExpressionPort& lhs, const ExpressionPort& rhs); + +private: + std::shared_ptr m_expr; + Type m_type = Type::Output; + size_t m_port_index = 0; +}; +} // namespace lowered +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 3b789e40b1ca79..e230d99d98d239 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -18,12 +18,12 @@ class Config { bool m_save_lowered_code = false; // True if we should check runtime info for nodes to call specific needed transformations bool m_need_fill_tail_register = false; - bool 
m_explicit_loop_insertion = false; ov::PartialShape m_master_shape{}; size_t m_loop_depth = 1; }; class LinearIR { + class ExpressionFactory; public: using container = std::list; using io_container = std::list>; @@ -33,21 +33,18 @@ class LinearIR { LinearIR() = default; explicit LinearIR(const std::shared_ptr& m, Config config = {}); - LinearIR deep_copy() const; + ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& inputs); + static LinearIR::container deep_copy_range(LinearIR::container::const_iterator begin, LinearIR::container::const_iterator end); const container& get_ops() const {return m_lowered_ops; } const io_container& get_IO_ops() const {return m_io_lowered_ops; } Config get_config() {return m_config; } - ExpressionPtr get_expr_by_node(const std::shared_ptr& n) const; - ExpressionPort get_expr_by_output(const TensorDescriptorPtr& n) const; - const std::set& get_exprs_by_input(const TensorDescriptorPtr& n) const; + const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const; - void replace_input(const ExpressionPort& expr_port, const TensorDescriptorPtr& to); - void replace_input(const ExpressionPtr& expr, size_t port, const TensorDescriptorPtr& to); - void replace_output(const ExpressionPort& expr_port, const TensorDescriptorPtr& to); - void replace_output(const ExpressionPtr& expr, size_t port, const TensorDescriptorPtr& to); + void replace_input(const std::set& consumers, const TensorPtr& to); + void replace_input(const ExpressionPort& expr_port, const TensorPtr& to); /** * @brief Move an expression from the position "from" to the position immediately before "to". 
@@ -88,26 +85,21 @@ class LinearIR { void init_emitters(const std::shared_ptr& target); void serialize(const std::string& xml, const std::string& bin); - static ov::NodeVector get_ordered_ops(const std::shared_ptr& model); - class LoopManager; using LoopManagerPtr = std::shared_ptr; const LoopManagerPtr& get_loop_manager() const { return m_loop_manager; } private: - void register_expression(const ExpressionPtr& expr); - // Like register_expression, but doesn't allow Parameter or Result registration. You can do it only through ctor - void register_regular_expression(const ExpressionPtr& expr); + static ov::NodeVector get_ordered_ops(const std::shared_ptr& model); + // Default ctor - can be called only from Linear IR initialization as default way + ExpressionPtr create_expression(const std::shared_ptr& n, const std::shared_ptr& model = nullptr); + + void register_expression(const ExpressionPtr& expr, bool io_allowed = false); void unregister_expression(const ExpressionPtr& expr); container m_lowered_ops{}; std::unordered_map, std::shared_ptr> m_node2expression_map; - // Expression must be uniquely identified by an output, so there can't be expressions that have the same output - std::unordered_map m_output2expression_map; - // At the same time, several expressions can have the same input if they are connected to the same parent - // E.g. 
LoopEnd will always have the same input as a Load inside the loop (since it has to increment the same reg) - std::unordered_map> m_input2expression_map; io_container m_io_lowered_ops; Config m_config{}; LoopManagerPtr m_loop_manager = nullptr; diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 4c3f171995a200..ed31e73c7c0688 100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -9,7 +9,7 @@ #include #include -#include "snippets/tensor_descriptor.hpp" +#include "port_descriptor.hpp" namespace ngraph { namespace snippets { @@ -43,15 +43,10 @@ class LinearIR::LoopManager { size_t get_loop_count() const { return m_map.size(); } const std::map& get_map() const; - static void skipped_mark(LinearIR::constExprIt loop_begin_pos, - LinearIR::constExprIt loop_end_pos, - size_t loop_depth); - void mark_loop(LinearIR& linear_ir, - LinearIR::constExprIt loop_begin_pos, + void mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t loop_depth, size_t vector_size); - void mark_loop(LinearIR& linear_ir, - LinearIR::constExprIt loop_begin_pos, + void mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t idx, size_t work_amount, @@ -74,8 +69,7 @@ class LinearIR::LoopManager { static void exprs_marking(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t loop_id, size_t idx); - static void get_io_loop_ports(LinearIR& linear_ir, - LinearIR::constExprIt loop_begin_pos, + static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, std::vector& entries, std::vector& exits); diff --git a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp index 1f355fbe9dfbb6..0f66b4ce55c3a6 100644 --- 
a/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/fuse_loops.hpp @@ -27,15 +27,13 @@ class FuseLoops : public Transformation { private: static bool can_be_fused(const LinearIR::LoopManager::LoopInfoPtr& loop_current, const LinearIR::LoopManager::LoopInfoPtr& loop_target); - static bool fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, - const ExpressionPort& current_entry_point, const ExpressionPort& target_exit_point, + static bool fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPort& current_entry_point, size_t current_loop_id, size_t target_loop_id, size_t dim_idx, LinearIR::constExprIt& current_loop_begin_pos, LinearIR::constExprIt& current_loop_end_pos); - static bool fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, - const ExpressionPort& current_entry_point, const ExpressionPort& target_exit_point, + static bool fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPort& current_entry_point, size_t current_loop_id, size_t target_loop_id, size_t dim_idx, LinearIR::constExprIt& current_loop_begin_pos, LinearIR::constExprIt& current_loop_end_pos); - static void fuse_points(LinearIR& linear_ir, std::vector& exit_points, std::vector& entry_points, + static void fuse_points(std::vector& exit_points, std::vector& entry_points, LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos); }; diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp index 552ca10ab94863..9abded985e60c7 100644 --- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp @@ -5,7 +5,6 @@ #pragma once #include "transformation.hpp" 
-#include "snippets/tensor_descriptor.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/include/snippets/lowered/port_descriptor.hpp b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp new file mode 100644 index 00000000000000..516512b8e655cb --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/port_descriptor.hpp @@ -0,0 +1,93 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/core/attribute_visitor.hpp" + + +namespace ngraph { +namespace snippets { +namespace lowered { + +class PortDescriptor; +using PortDescriptorPtr = std::shared_ptr; +class PortDescriptor { +public: + // The structure with service values for scheduling parameters + struct ServiceDimensions { + // The value for the subtensor that means that scheduling should be by full dimension + static size_t FULL_DIM; + }; + + explicit PortDescriptor(const ov::Input& node, + std::vector subtensor_shape = {}, + std::vector layout = {}); + explicit PortDescriptor(const ov::Input& node, + std::vector subtensor_shape = {}, + std::vector layout = {}); + explicit PortDescriptor(const ov::Output& node, + std::vector subtensor_shape = {}, + std::vector layout = {}); + explicit PortDescriptor(const ov::Output& node, + std::vector subtensor_shape = {}, + std::vector layout = {}); + PortDescriptor(std::vector shape, std::vector subtensor_shape, std::vector layout = {}); + PortDescriptor() = default; + + std::vector get_shape() const {return m_tensor_shape;} + std::vector get_subtensor() const {return m_subtensor_shape;} + std::vector get_layout() const {return m_layout;} + + void set_shape(const std::vector& tensor) { m_tensor_shape = tensor; } + void set_layout(const std::vector& layout) { m_layout = layout; } + void set_subtensor(const std::vector& subtensor) { m_subtensor_shape = subtensor; } + + std::string serialize() const; + bool empty() const { 
return m_layout.empty() && m_subtensor_shape.empty();} + PortDescriptorPtr clone() const; + + friend bool operator==(const PortDescriptor& lhs, const PortDescriptor& rhs); + friend bool operator!=(const PortDescriptor& lhs, const PortDescriptor& rhs) {return !(lhs == rhs);} + +private: + void validate_arguments(); + /// \brief Original tensor shape + std::vector m_tensor_shape{}; + /// \brief Order of dimensions: NCHW == {0, 1, 2, 3}, NHWC == {0, 2, 3, 1}, NCHW16c == {0, 1, 2, 3, 1} + std::vector m_layout{}; + /// \brief Minimal tensor size that could be processed in one call + std::vector m_subtensor_shape{}; +}; + +class PortManager { +public: + static void set_port_descriptor_ptr(const ov::Input& n, const PortDescriptorPtr& desc); + static void set_port_descriptor_ptr(const ov::Output& n, const PortDescriptorPtr& desc); + + static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input& in); + static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input& out); + static PortDescriptorPtr get_port_descriptor_ptr(const ov::Output& in); + static PortDescriptorPtr get_port_descriptor_ptr(const ov::Output& out); + +private: + static void init_default(std::vector& in_descs, std::vector& out_descs, const std::shared_ptr& node); +}; + +class PortDescriptorVectorAttribute : public ov::RuntimeAttribute { +public: + OPENVINO_RTTI("PortDescriptorVectorAttribute", "", ov::RuntimeAttribute); + + PortDescriptorVectorAttribute() = default; + explicit PortDescriptorVectorAttribute(std::vector in_descs = {}, std::vector out_descs = {}) + : inputs(std::move(in_descs)), outputs(std::move(out_descs)) {} + + std::vector inputs{}; + std::vector outputs{}; +}; + +} // namespace lowered +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/lowered/tensor.hpp b/src/common/snippets/include/snippets/lowered/tensor.hpp new file mode 100644 index 00000000000000..97a091c6258d41 --- /dev/null +++ 
b/src/common/snippets/include/snippets/lowered/tensor.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "port_descriptor.hpp" + +#include "expression_port.hpp" + + +namespace ngraph { +namespace snippets { +namespace lowered { + +class Expression; + +class Tensor { +public: + Tensor() = default; + explicit Tensor(ExpressionPort source_descriptor, const std::set& consumer_descriptors = {}); + + const ExpressionPort& get_source() const { return m_source_port; } + std::set get_consumers() const { return m_consumer_ports; } + + void add_consumer(const ExpressionPort& consumer); + void remove_consumer(const ExpressionPort& consumer); + bool found_consumer(const ExpressionPort& consumer) const; + std::set::const_iterator find_consumer(const ExpressionPort& consumer) const; + std::set::iterator find_consumer(const ExpressionPort& consumer); + +private: + ExpressionPort m_source_port; + std::set m_consumer_ports; +}; +using TensorPtr = std::shared_ptr; + + +} // namespace lowered +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp index 6d7e08a9d05ffb..7ddcdb6975332a 100644 --- a/src/common/snippets/include/snippets/op/brgemm.hpp +++ b/src/common/snippets/include/snippets/op/brgemm.hpp @@ -20,7 +20,8 @@ class Brgemm : public MemoryAccess { public: OPENVINO_OP("Brgemm", "SnippetsOpset", MemoryAccess); Brgemm(const Output& A, const Output& B, - const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu); + const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu, + std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); Brgemm() = default; size_t get_offset_a() const { return get_input_offset(0); } @@ -34,9 +35,13 @@ class Brgemm : public MemoryAccess { protected: 
ov::element::Type get_output_type() const; - std::vector get_planar_input_shapes(const std::vector>& inputs) const; + std::vector get_planar_input_shapes(const std::vector>& inputs) const; ov::PartialShape get_output_partial_shape(const std::vector& input_shapes) const; ov::PartialShape get_planar_output_shape(const ov::PartialShape& output_shape) const; + +private: + void custom_constructor_validate_and_infer_types(std::vector layout_a, std::vector layout_b, std::vector layout_c); + void validate_inputs() const; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index f7bae5aaeac815..05ef134c28eb6d 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -172,9 +172,6 @@ class Subgraph : public ov::op::util::SubGraphOp { // True if body has operations that don't support plugin-side domain optimizations // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) bool m_has_domain_sensitive_ops = false; - // True if we should go through whole body to check for where loops should be explicitly inserted. 
- // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops - bool m_explicit_loop_insertion = false; } config; }; @@ -198,7 +195,7 @@ static inline auto build_subgraph(const std::shared_ptr& node, con return subgraph; }; -// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); +// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); // If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name auto inline update_out_tensor_name(const std::shared_ptr& subgraph) -> void { bool not_set = true; diff --git a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp index 15929f908c774b..f87b8d03c665d5 100644 --- a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp +++ b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp @@ -7,6 +7,10 @@ #include "ngraph/pass/graph_rewrite.hpp" #include "ngraph/pattern/matcher.hpp" +#include "openvino/op/transpose.hpp" + +#include "snippets/lowered/port_descriptor.hpp" + namespace ngraph { namespace snippets { namespace pass { @@ -23,6 +27,9 @@ class FuseTransposeBrgemm: public ngraph::pass::MatcherPass { OPENVINO_RTTI("FuseTransposeBrgemm", "0"); FuseTransposeBrgemm(); static const std::set> supported_cases; + +private: + static bool is_supported_transpose(const Output& transpose_port); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp index 4cfbd1fa394edb..dbe7d3446d398c 100644 --- a/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp +++ b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp @@ -7,6 +7,8 @@ #include "ngraph/pass/graph_rewrite.hpp" #include 
"ngraph/pattern/matcher.hpp" +#include "snippets/op/brgemm.hpp" + namespace ngraph { namespace snippets { namespace pass { @@ -20,6 +22,9 @@ class MatMulToBrgemm: public ngraph::pass::MatcherPass { public: OPENVINO_RTTI("MatMulToBrgemm", "0"); MatMulToBrgemm(); + +private: + void init_ports(const std::shared_ptr& brgemm) const; }; diff --git a/src/common/snippets/include/snippets/pass/set_softmax_ports.hpp b/src/common/snippets/include/snippets/pass/set_softmax_ports.hpp new file mode 100644 index 00000000000000..22e7f0b8af7a7e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/set_softmax_ports.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SetSoftmaxPorts + * @brief The pass updates port descriptors in accordance with the Softmax reduction axis + * @ingroup snippets + */ +class SetSoftmaxPorts: public ngraph::pass::MatcherPass { +public: + SetSoftmaxPorts(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/tensor_descriptor.hpp b/src/common/snippets/include/snippets/tensor_descriptor.hpp deleted file mode 100644 index bd676222d33ab6..00000000000000 --- a/src/common/snippets/include/snippets/tensor_descriptor.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/core/node.hpp" -#include "openvino/core/attribute_visitor.hpp" - - -namespace ngraph { -namespace snippets { -class TensorDescriptorAttribute; -class TensorDescriptor { - friend class TensorDescriptorAttribute; -public: -explicit TensorDescriptor(const Output& node, - std::vector subtensor_shape = {}, - std::vector layout = {}); -explicit TensorDescriptor(const Output& node, - std::vector subtensor_shape = {}, - std::vector layout = 
{}); - TensorDescriptor(std::vector tensor_shape, - std::vector subtensor_shape, - std::vector layout = {}); - TensorDescriptor() = default; - static TensorDescriptor deserialize(const std::string& serialized_info); - std::string serialize() const; - std::vector get_tensor() const {return m_tensor_shape;} - std::vector get_subtensor() const {return m_subtensor_shape;} - std::vector get_layout() const {return m_layout;} - bool empty() const { return m_tensor_shape.empty() && m_layout.empty() && m_subtensor_shape.empty();} - friend bool operator==(const TensorDescriptor& lhs, const TensorDescriptor& rhs); - friend bool operator!=(const TensorDescriptor& lhs, const TensorDescriptor& rhs) {return !(lhs == rhs);} - -private: - void validate_arguments(); - /// \brief Original tensor shape - std::vector m_tensor_shape{}; - /// \brief Order of dimensions: NCHW == {0, 1, 2, 3}, NHWC == {0, 2, 3, 1}, NCHW16c == {0, 1, 2, 3, 1} - std::vector m_layout{}; - /// \brief Minimal tensor size that could be processed in one call - std::vector m_subtensor_shape{}; -}; - -std::ostream& operator << (std::ostream&, const TensorDescriptor& td); -using TensorDescriptorPtr = std::shared_ptr; -class TensorDescriptorPtrVectorAttribute : public ov::RuntimeAttribute { -public: - OPENVINO_RTTI("TensorDescriptorVectorAttribute", "0"); - - TensorDescriptorPtrVectorAttribute() = default; - explicit TensorDescriptorPtrVectorAttribute(std::vector descriptor) : m_value(std::move(descriptor)) {} - std::vector m_value{}; -}; - -void set_tensor_descriptor_ptr(const Output& n, const TensorDescriptorPtr& desc); -TensorDescriptorPtr get_tensor_descriptor_ptr(const Output& out); -TensorDescriptorPtr get_tensor_descriptor_ptr(const Output& out); - -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index ec719971923101..63547a226df2f9 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ 
b/src/common/snippets/include/snippets/utils.hpp @@ -24,18 +24,9 @@ inline auto is_scalar_constant(const std::shared_ptr& source_outpu return ngraph::is_type(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1; } +ov::PartialShape get_port_planar_shape(const Input& out); ov::PartialShape get_port_planar_shape(const Output& out); ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout); -std::vector get_node_output_layout(const std::shared_ptr& node); -std::vector get_node_output_layout(const Node* node); -void set_transpose_output_layout(const ov::Output& port, const std::shared_ptr& node); -void set_output_layout(const ov::Output& port, const std::vector& layout); - -bool get_outside_loop_value(const std::shared_ptr& node); -void set_outside_loop_value(const std::shared_ptr& node, bool is_outside = true); - -inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } -inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { return allocation_rank < 0 ? 
allocation_rank + static_cast(shape_rank) + 1 : allocation_rank; diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index b8968a97d28126..5f166619b1c7f7 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -24,7 +24,6 @@ #include "snippets/lowered/pass/indentify_buffers.hpp" #include "snippets/op/kernel.hpp" -#include "snippets/tensor_descriptor.hpp" #include diff --git a/src/common/snippets/src/lowered/expression.cpp b/src/common/snippets/src/lowered/expression.cpp index e543e211d57b7f..dffc8e03c74355 100644 --- a/src/common/snippets/src/lowered/expression.cpp +++ b/src/common/snippets/src/lowered/expression.cpp @@ -16,31 +16,35 @@ namespace lowered { size_t Expression::LOOP_NULL_ID = SIZE_MAX; -ExpressionPort::ExpressionPort(const ExpressionPtr& expr, size_t port, Type type) : expr(expr), port(port), m_type(type) { - if (type == Type::Input) { - OPENVINO_ASSERT(port < expr->get_inputs().size(), "The input port must be less than input count"); - } else if (type == Type::Output) { - OPENVINO_ASSERT(port < expr->get_outputs().size(), "The output port must be less than output count"); +Expression::Expression(const std::shared_ptr& n) + : m_source_node{n}, m_emitter{nullptr}, m_input_tensors{}, m_output_tensors{}, m_reg_info{{}, {}} { + m_input_port_descriptors.reserve(n->get_input_size()); + m_output_port_descriptors.reserve(n->get_output_size()); + for (const auto& input : n->inputs()) { + m_input_port_descriptors.push_back(PortManager::get_port_descriptor_ptr(input)); + } + for (const auto& output : n->outputs()) { + m_output_port_descriptors.push_back(PortManager::get_port_descriptor_ptr(output)); } } -Expression::Expression(const std::shared_ptr& n) - : m_source_node{n}, m_emitter{nullptr}, m_reg_info{{}, {}}, m_is_outside_loop(utils::get_outside_loop_value(n)) { - for (const auto& in : n->inputs()) - m_inputs.emplace_back(get_tensor_descriptor_ptr(in.get_source_output())); - for 
(const auto& out : n->outputs()) - m_outputs.emplace_back(get_tensor_descriptor_ptr(out)); +const TensorPtr& Expression::get_input_tensor(size_t i) const { + OPENVINO_ASSERT(i < m_input_tensors.size(), "Failed to get input tensor: target input port must be less than input count!"); + return m_input_tensors[i]; } - -Expression::Expression(const std::shared_ptr& n, std::vector inputs) - : m_source_node{n}, m_emitter{nullptr}, m_inputs(std::move(inputs)), m_reg_info{{}, {}}, m_is_outside_loop(utils::get_outside_loop_value(n)) { - for (const auto& out : n->outputs()) - m_outputs.emplace_back(get_tensor_descriptor_ptr(out)); +const TensorPtr& Expression::get_output_tensor(size_t i) const { + OPENVINO_ASSERT(i < m_output_tensors.size(), "Failed to get output: target output port must be less than output count!"); + return m_output_tensors[i]; } -Expression::Expression(const std::shared_ptr& n, std::vector inputs, std::vector outputs) - : m_source_node{n}, m_emitter{nullptr}, m_inputs(std::move(inputs)), m_outputs(std::move(outputs)), - m_reg_info{{}, {}}, m_is_outside_loop(utils::get_outside_loop_value(n)) {} +const PortDescriptorPtr& Expression::get_input_port_descriptor(size_t i) const { + OPENVINO_ASSERT(i < m_input_port_descriptors.size(), "Failed to get input port descriptor: target input port must be less than input count!"); + return m_input_port_descriptors[i]; +} +const PortDescriptorPtr& Expression::get_output_port_descriptor(size_t i) const { + OPENVINO_ASSERT(i < m_output_port_descriptors.size(), "Failed to get output port descriptor: target output port must be less than output count!"); + return m_output_port_descriptors[i]; +} std::shared_ptr Expression::get_node() const { if (!m_source_node) @@ -49,26 +53,29 @@ std::shared_ptr Expression::get_node() const { } std::shared_ptr Expression::get_emitter() const { - return m_emitter; + return m_emitter; } void Expression::init_emitter(const std::shared_ptr& target) { m_emitter = 
target->get(m_source_node->get_type_info())(m_source_node); } -void Expression::replace_input(size_t port, TensorDescriptorPtr to) { - OPENVINO_ASSERT(port < m_inputs.size(), "Failed to replace: target input port must be less than input count!"); - m_inputs[port] = std::move(to); +void Expression::validate() const { + OPENVINO_ASSERT(m_input_port_descriptors.size() == m_input_tensors.size(), "The count of input ports and input tensors must be equal"); + OPENVINO_ASSERT(m_output_port_descriptors.size() == m_output_tensors.size(), "The count of output ports and output tensors must be equal"); + OPENVINO_ASSERT(m_source_node != nullptr, "The expression has null source node"); } -void Expression::replace_output(size_t port, TensorDescriptorPtr to) { - OPENVINO_ASSERT(port < m_outputs.size(), "Failed to replace: target output port must be less than output count!"); - m_outputs[port] = std::move(to); +void Expression::replace_input(size_t port, TensorPtr to) { + OPENVINO_ASSERT(port < m_input_tensors.size(), "Failed to replace: target input port must be less than input count!"); + m_input_tensors[port] = std::move(to); } void Expression::set_loop_id(size_t id, size_t idx) { - OPENVINO_ASSERT((std::find(m_loop_ids.begin(), m_loop_ids.end(), id) == m_loop_ids.end()), - "Expression cannot have several the same Loops"); + if (id != LOOP_NULL_ID) { + OPENVINO_ASSERT((std::find(m_loop_ids.begin(), m_loop_ids.end(), id) == m_loop_ids.end()), + "Expression cannot have several the same Loops"); + } if (m_loop_ids.size() <= idx) { m_loop_ids.resize(idx + 1, LOOP_NULL_ID); } @@ -81,40 +88,19 @@ void Expression::remove_loop_id(size_t id) { *it = Expression::LOOP_NULL_ID; } -ExpressionPort Expression::input_port(size_t i) { - OPENVINO_ASSERT(i < m_inputs.size(), "Failed to get input port: target input port must be less than input count!"); - return ExpressionPort(this->shared_from_this(), i, ExpressionPort::Type::Input); +ExpressionPort Expression::get_input_port(size_t i) { + return 
ExpressionPort(this->shared_from_this(), ExpressionPort::Type::Input, i); } -ExpressionPort Expression::output_port(size_t i) { - OPENVINO_ASSERT(i < m_outputs.size(), "Failed to get output port: target output port must be less than output count!"); - return ExpressionPort(this->shared_from_this(), i, ExpressionPort::Type::Output); +ExpressionPort Expression::get_output_port(size_t i) { + return ExpressionPort(this->shared_from_this(), ExpressionPort::Type::Output, i); } IOExpression::IOExpression(const std::shared_ptr& par, int64_t index) - : Expression(par), m_index(index), m_type{io_type::INPUT} { -} + : Expression(par), m_index(index), m_type{io_type::INPUT} {} +IOExpression::IOExpression(const std::shared_ptr& res, int64_t index) + : Expression(res), m_index(index), m_type{io_type::OUTPUT} {} -IOExpression::IOExpression(const std::shared_ptr& res, int64_t index, std::vector inputs) - : Expression(res, inputs, {}), m_index(index), m_type{io_type::OUTPUT} { -} - -bool operator==(const ExpressionPort& lhs, const ExpressionPort& rhs) { - if (&lhs == &rhs) - return true; - OPENVINO_ASSERT(lhs.get_type() == rhs.get_type(), "Incorrect comparison: Ports are from different types!"); - return lhs.expr == rhs.expr && lhs.port == rhs.port; -} - -bool operator!=(const ExpressionPort& lhs, const ExpressionPort& rhs) { - return !(lhs == rhs); -} - -bool operator<(const ExpressionPort& lhs, const ExpressionPort& rhs) { - OPENVINO_ASSERT(lhs.get_type() == rhs.get_type(), "Incorrect comparison: Ports are from different types!"); - // Firstly ports - return (lhs.port < rhs.port) || (lhs.port == rhs.port && lhs.expr < rhs.expr); -} }// namespace lowered }// namespace snippets }// namespace ngraph diff --git a/src/common/snippets/src/lowered/expression_factory.cpp b/src/common/snippets/src/lowered/expression_factory.cpp new file mode 100644 index 00000000000000..2bf63bb3a631e9 --- /dev/null +++ b/src/common/snippets/src/lowered/expression_factory.cpp @@ -0,0 +1,126 @@ +// 
Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/expression_factory.hpp" + +#include "snippets/snippets_isa.hpp" + +namespace ngraph { +namespace snippets { +namespace lowered { + +void LinearIR::ExpressionFactory::create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr) { + OPENVINO_ASSERT(expr != nullptr, "Failed expression inputs creation: expression is null"); + const auto& node = expr->get_node(); + + expr->m_input_tensors.resize(node->get_input_size(), nullptr); + for (const auto& input : node->inputs()) { + const auto input_source = input.get_source_output(); + const auto in_index = input.get_index(); + const auto& parent_expr = linear_ir.get_expr_by_node(input_source.get_node_shared_ptr()); + const auto& tensor = parent_expr->get_output_tensor(input_source.get_index()); + tensor->add_consumer(expr->get_input_port(in_index)); + expr->m_input_tensors[in_index] = tensor; + } +} + +void LinearIR::ExpressionFactory::create_expression_outputs(const ExpressionPtr& expr) { + OPENVINO_ASSERT(expr != nullptr, "Failed expression outputs creation: expression is null"); + const auto& node = expr->get_node(); + + expr->m_output_tensors.resize(node->get_output_size(), nullptr); + for (const auto& output : node->outputs()) { + const auto out_index = output.get_index(); + const auto source = expr->get_output_port(out_index); + expr->m_output_tensors[out_index] = std::make_shared(source); + } +} + +// The method checks whether the expression is already registered as a consumer of each input tensor and adds it if missing +void LinearIR::ExpressionFactory::init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs) { + for (size_t i = 0; i < inputs.size(); ++i) { + const auto& input = inputs[i]; + const auto consumers = input->get_consumers(); + const auto found = std::find_if(consumers.begin(), consumers.end(), + [&](const ExpressionPort& desc) { + return desc.get_index() == i && 
desc.get_expr() == expr; + }); + if (found == consumers.end()) { + input->add_consumer(expr->get_input_port(i)); + } + } + expr->m_input_tensors = inputs; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& par, + const LinearIR& linear_ir, const std::shared_ptr& model) { + // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) + OPENVINO_ASSERT(model != nullptr, "To create IOExpression from Parameter there must be inited model!"); + auto expr = std::make_shared(IOExpression(par, model->get_parameter_index(par))); + create_expression_outputs(expr); + expr->validate(); + return expr; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& res, + const LinearIR& linear_ir, const std::shared_ptr& model) { + // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) + OPENVINO_ASSERT(model != nullptr, "To create IOExpression from Result there must be inited model!"); + auto expr = std::make_shared(IOExpression(res, model->get_result_index(res))); + create_expression_inputs(linear_ir, expr); + // The Result node doesn't need an output port (due to the semantics of the node), but each node in ngraph must have at least one output. + // The port descriptors are automatically created in the constructor, so we manually clear the output ports. 
+ expr->m_output_port_descriptors.clear(); + expr->validate(); + return expr; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const LinearIR& linear_ir, + const std::shared_ptr& model) { + OPENVINO_ASSERT(!ov::is_type(n), "Default expression builder doesn't support LoopBegin and LoopEnd"); + // Note: ctor of shared_ptr isn't friend class for Expression + auto expr = std::make_shared(Expression(n)); + create_expression_inputs(linear_ir, expr); + create_expression_outputs(expr); + expr->validate(); + return expr; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs) { + OPENVINO_ASSERT(inputs.empty(), "LoopBegin cannot have inputs"); + auto expr = std::make_shared(Expression(n)); + init_expression_inputs(expr, inputs); + create_expression_outputs(expr); + expr->validate(); + return expr; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs) { + auto expr = std::make_shared(Expression(n)); + // LoopEnd doesn't have port descriptors on inputs (except input from LoopBegin) + expr->m_input_port_descriptors.resize(inputs.size(), nullptr); + const auto& last_input = inputs.back()->get_source(); + OPENVINO_ASSERT(ov::is_type(last_input.get_expr()->get_node()), "LoopEnd expression expects LoopBegin on last input"); + expr->m_input_port_descriptors[inputs.size() - 1] = last_input.get_descriptor_ptr()->clone(); + init_expression_inputs(expr, inputs); + // The LoopEnd node doesn't need an output port (due to the semantics of the node), but each node in ngraph must have at least one output. + // The port descriptors are automatically created in the constructor, so we manually clear the output ports. 
+ expr->m_output_port_descriptors.clear(); + expr->validate(); + return expr; +} + +ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs) { + OPENVINO_ASSERT(!ov::is_type(n) && + !ov::is_type(n), + "Expression builder with inputs doesn't support Result and Parameter"); + auto expr = std::make_shared(Expression(n)); + init_expression_inputs(expr, inputs); + create_expression_outputs(expr); + expr->validate(); + return expr; +} +}// namespace lowered +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/lowered/expression_port.cpp b/src/common/snippets/src/lowered/expression_port.cpp new file mode 100644 index 00000000000000..d16a12e0da6287 --- /dev/null +++ b/src/common/snippets/src/lowered/expression_port.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/expression_port.hpp" + +#include "snippets/utils.hpp" + + +namespace ngraph { +namespace snippets { +namespace lowered { + +ExpressionPort::ExpressionPort(const std::shared_ptr& expr, Type type, size_t port) + : m_expr(expr), m_type(type), m_port_index(port) {} + +const PortDescriptorPtr& ExpressionPort::get_descriptor_ptr() const { + const auto& descs = m_type == Type::Input ? m_expr->m_input_port_descriptors + : m_expr->m_output_port_descriptors; + OPENVINO_ASSERT(m_port_index < descs.size(), "Incorrect index of port"); + return descs[m_port_index]; +} + +const std::shared_ptr& ExpressionPort::get_tensor_ptr() const { + const auto& tensors = m_type == Type::Input ? 
m_expr->m_input_tensors + : m_expr->m_output_tensors; + OPENVINO_ASSERT(m_port_index < tensors.size(), "Incorrect index of port"); + return tensors[m_port_index]; +} + +std::set ExpressionPort::get_connected_ports() const { + if (ExpressionPort::m_type == Type::Input) { + return { m_expr->m_input_tensors[m_port_index]->get_source() }; + } + if (ExpressionPort::m_type == Type::Output) { + return m_expr->m_output_tensors[m_port_index]->get_consumers(); + } + OPENVINO_THROW("ExpressionPort supports only Input and Output types"); +} + +bool operator==(const ExpressionPort& lhs, const ExpressionPort& rhs) { + if (&lhs == &rhs) + return true; + OPENVINO_ASSERT(lhs.get_type() == rhs.get_type(), "Incorrect ExpressionPort comparison"); + return lhs.get_index() == rhs.get_index() && lhs.get_expr() == rhs.get_expr(); +} +bool operator!=(const ExpressionPort& lhs, const ExpressionPort& rhs) { + return !(lhs == rhs); +} +bool operator<(const ExpressionPort& lhs, const ExpressionPort& rhs) { + OPENVINO_ASSERT(lhs.get_type() == rhs.get_type(), "Incorrect ExpressionPort comparison"); + return (lhs.get_index() < rhs.get_index()) || (lhs.get_index() == rhs.get_index() && lhs.get_expr() < rhs.get_expr()); +} + +}// namespace lowered +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 976efd62e7f639..828462e020c9f6 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -7,8 +7,8 @@ #include #include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/expression_factory.hpp" #include -#include "snippets/tensor_descriptor.hpp" #include "snippets/utils.hpp" #include @@ -20,45 +20,37 @@ namespace lowered { LinearIR::LinearIR(const std::shared_ptr& model, Config config) : m_io_lowered_ops{}, m_config{std::move(config)}, m_loop_manager(std::make_shared()) { - constExprIt scalar_pos = m_lowered_ops.begin(); - 
ExpressionPtr last_param = nullptr; + constExprIt last_param = m_lowered_ops.end(); for (const auto& n : get_ordered_ops(model)) { constExprIt insertion_pos = m_lowered_ops.end(); - std::shared_ptr expr; - std::vector input_tds; - for (const auto& in : n->inputs()) { - const auto& out = in.get_source_output(); - const auto& parent_out_tds = m_node2expression_map[out.get_node_shared_ptr()]->get_outputs(); - input_tds.push_back(parent_out_tds[out.get_index()]); + const auto expr = create_expression(n, model); + + // Scalar should be on the Linear IR beginning after Parameters to have valid expression order after Loop passes. + // After these passes we must call pass MoveScalarToConsumer() to have a correct accuracy. + // For more details, please see the pass description + if (const auto& scalar = as_type_ptr(n)) { + insertion_pos = std::next(last_param); } - if (const auto& par = as_type_ptr(n)) { - auto io_expr = std::make_shared(par, model->get_parameter_index(par)); - m_io_lowered_ops.push_back(io_expr); - expr = io_expr; - last_param = expr; - } else if (const auto& res = as_type_ptr(n)) { - auto io_expr = std::make_shared(res, model->get_result_index(res), input_tds); + + register_expression(expr, true); + const auto& it = m_lowered_ops.insert(insertion_pos, expr); + + if (const auto io_expr = std::dynamic_pointer_cast(expr)) { m_io_lowered_ops.push_back(io_expr); - expr = io_expr; - } else { - if (const auto& scalar = as_type_ptr(n)) { - // Scalar should be on the Linear IR beginning after Parameters to have valid expression order after Loop passes. - // After these passes we must call pass MoveScalarToConsumer() to have a correct accuracy. 
- // For more details, please see the pass description - if (scalar_pos == m_lowered_ops.end()) { - OPENVINO_ASSERT(last_param, "Scalars must be executed after Parameters"); - scalar_pos = std::find(m_lowered_ops.begin(), m_lowered_ops.end(), last_param); - } - insertion_pos = std::next(scalar_pos); - } - // Note that output tds must be empty since they are filled automatically from rt_info and/or tensor shapes - expr = std::make_shared(n, input_tds); + if (ov::is_type(n)) + last_param = it; } - register_expression(expr); - m_lowered_ops.insert(insertion_pos, expr); } } +ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::shared_ptr& model) { + return ExpressionFactory::build(n, *this, model); +} + +ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::vector& inputs) { + return ExpressionFactory::build(n, inputs); +} + ov::NodeVector LinearIR::get_ordered_ops(const std::shared_ptr& m) { if (!m->get_sinks().empty()) OPENVINO_THROW("Linear IR is not supposed to work for model with sinks. 
Check your transformation pipeline."); @@ -106,15 +98,6 @@ LinearIR::container LinearIR::deep_copy_range(LinearIR::container::const_iterato return result; } -LinearIR LinearIR::deep_copy() const { - LinearIR result; - auto& result_ops = result.m_lowered_ops; - for (const auto& expr : deep_copy_range(m_lowered_ops.begin(), m_lowered_ops.end())) - result_ops.emplace_back(expr); - result.m_config = m_config; - return result; -} - void LinearIR::debug_print(bool tds_as_pointers) const { auto print_rinfo = [](const RegInfo& rinfo) { std::cerr << " : {"; @@ -125,7 +108,7 @@ void LinearIR::debug_print(bool tds_as_pointers) const { std::cerr << i << " "; std::cerr << "}"; }; - std::map td2int; + std::map td2int; int td_counter = 0; int counter = 0; for (const auto& expr : m_lowered_ops) { @@ -133,23 +116,23 @@ void LinearIR::debug_print(bool tds_as_pointers) const { std::cerr << counter++ << " : " << node->get_friendly_name() << " : "; if (tds_as_pointers) { - for (const auto& in : expr->get_inputs()) { + for (const auto& in : expr->m_input_tensors) { if (td2int.count(in) == 0) OPENVINO_THROW("Undefined input descriptor for op"); std::cerr << td2int.at(in) << ", "; } std::cerr << "\b\b => "; - for (const auto& out : expr->get_outputs()) { + for (const auto& out : expr->m_output_tensors) { if (td2int.count(out) == 0) td2int.insert({out, td_counter++}); std::cerr << td2int.at(out) << ", "; } } else { - for (const auto& in : expr->get_inputs()) - std::cerr << *in << ", "; + for (const auto& port_desc : expr->m_input_port_descriptors) + std::cerr << port_desc << ", "; std::cerr << "\b\b => "; - for (const auto& out : expr->get_outputs()) - std::cerr << *out << ", "; + for (const auto& port_desc : expr->m_output_port_descriptors) + std::cerr << port_desc << ", "; } std::cerr << "\b\b"; const auto& rinfo = expr->get_reg_info(); @@ -166,125 +149,63 @@ void LinearIR::init_emitters(const std::shared_ptr& target) { } } -ExpressionPtr LinearIR::get_expr_by_node(const std::shared_ptr& 
n) const { +const ExpressionPtr& LinearIR::get_expr_by_node(const std::shared_ptr& n) const { auto found = m_node2expression_map.find(n); - return found == m_node2expression_map.end() ? nullptr : found->second; -} - -ExpressionPort LinearIR::get_expr_by_output(const TensorDescriptorPtr& td) const { - auto found = m_output2expression_map.find(td); - if (found == m_output2expression_map.end()) - OPENVINO_THROW("Failed to find expression by output tensor descriptor"); + OPENVINO_ASSERT(found != m_node2expression_map.end(), "The node " + n->get_friendly_name() + " hasn't been found in Linear IR"); return found->second; } -const std::set& LinearIR::get_exprs_by_input(const TensorDescriptorPtr& td) const { - auto found = m_input2expression_map.find(td); - if (found == m_input2expression_map.end()) - OPENVINO_THROW("Failed to find expression by input tensor descriptor"); - return found->second; +void LinearIR::replace_input(const std::set& consumers, const TensorPtr& to) { + for (const auto& consumer_input : consumers) { + replace_input(consumer_input, to); + } } -void LinearIR::replace_input(const ExpressionPtr& expr, size_t port, const TensorDescriptorPtr& to) { - replace_input(expr->input_port(port), to); -} +void LinearIR::replace_input(const ExpressionPort& expr_port, const TensorPtr& to) { + const auto port = expr_port.get_index(); + const auto& expr = expr_port.get_expr(); -void LinearIR::replace_input(const ExpressionPort& expr_port, const TensorDescriptorPtr& to) { - const auto& expr = expr_port.expr; - const auto port = expr_port.port; OPENVINO_ASSERT(expr_port.get_type() == ExpressionPort::Type::Input, "Failed to replace: target input port must have Input type"); - OPENVINO_ASSERT(port < expr->m_inputs.size(), "Failed to replace: target input port must be less than input count!"); - const auto from = expr->m_inputs[port]; - auto found = m_input2expression_map.find(from); - if (found == m_input2expression_map.end() || found->second.count(expr_port) == 0) - 
OPENVINO_THROW("Invalid expression of input was provided to replace_input"); - found->second.erase(expr_port); - { - const auto& res = m_input2expression_map.insert({to, std::set{expr_port}}); - // If input is already in the map => add ExprPtr to the mapped set - if (!res.second) { - res.first->second.insert(expr_port); - } - } - expr->replace_input(port, std::move(to)); -} - -void LinearIR::replace_output(const ExpressionPtr& expr, size_t port, const TensorDescriptorPtr& to) { - replace_output(expr->output_port(port), to); -} + OPENVINO_ASSERT(expr_port.get_index() < expr->get_input_count(), "Failed to replace: target input port must be less than input count!"); -void LinearIR::replace_output(const ExpressionPort& expr_port, const TensorDescriptorPtr& to) { - const auto& expr = expr_port.expr; - const auto port = expr_port.port; - OPENVINO_ASSERT(expr_port.get_type() == ExpressionPort::Type::Output, "Failed to replace: target output port must have Output type"); - OPENVINO_ASSERT(port < expr->m_outputs.size(), "Failed to replace: target output port must be less than output count!"); - const auto from = expr->m_outputs[port]; - auto found = m_output2expression_map.find(from); - if (found == m_output2expression_map.end() || found->second != expr_port) - OPENVINO_THROW("Invalid expression of output was provided to replace_output"); - m_output2expression_map.erase(found); - m_output2expression_map[to] = expr_port; - expr->replace_output(port, to); -} + const auto& from = expr->get_input_tensor(port); + if (from == to) + return; -void LinearIR::register_regular_expression(const ExpressionPtr& expr) { - if (is_type(expr->get_node()) || is_type(expr->get_node())) - OPENVINO_THROW("LinearIR::insert can't be used to add Parameters or Results to IR"); - register_expression(expr); + if (!to->found_consumer(expr_port)) { + to->add_consumer(expr_port); + } + from->remove_consumer(expr_port); + expr->replace_input(port, to); } -void LinearIR::register_expression(const 
ExpressionPtr& expr) { +void LinearIR::register_expression(const ExpressionPtr& expr, bool io_allowed) { const auto& node = expr->get_node(); + if (!io_allowed && (is_type(node) || is_type(node))) + OPENVINO_THROW("LinearIR::insert can't be used to add Parameters or Results to IR"); { const auto& res = m_node2expression_map.insert({node, expr}); if (!res.second) OPENVINO_THROW("Duplicate node is detected in linear IR: " + std::string(node->get_friendly_name())); } - for (size_t i = 0; i < expr->m_outputs.size(); ++i) { - const auto& out = expr->m_outputs[i]; - m_output2expression_map[out] = expr->output_port(i); - } - - for (size_t i = 0; i < expr->m_inputs.size(); ++i) { - const auto& in = expr->m_inputs[i]; - const auto expr_port = expr->input_port(i); - const auto& res = m_input2expression_map.insert({in, std::set{expr_port}}); - // If input is already in the map => add ExprPtr to the mapped set - if (!res.second) { - res.first->second.insert(expr_port); - } - } } void LinearIR::unregister_expression(const ExpressionPtr& expr) { - for (const auto& out : expr->m_outputs) - m_output2expression_map.erase(out); - - size_t in_port = 0; - for (const auto& in : expr->m_inputs) { - const auto& found = m_input2expression_map.find(in); - if (found != m_input2expression_map.end()) { - // Note: If the input is used by only by this expr => delete the whole entry - // Otherwise delete the expr from the users set - auto& users = found->second; - if (users.size() == 1) - m_input2expression_map.erase(found); - else - users.erase(expr->input_port(in_port)); - } - ++in_port; + for (size_t i = 0; i < expr->get_input_count(); ++i) { + const auto& input = expr->get_input_tensor(i); + input->remove_consumer(expr->get_input_port(i)); } m_node2expression_map.erase(expr->get_node()); } LinearIR::exprIt LinearIR::insert(constExprIt pos, container::value_type&& value) { - register_regular_expression(value); + register_expression(value); return m_lowered_ops.insert(pos, value); } 
LinearIR::exprIt LinearIR::insert(constExprIt pos, const container::value_type& value) { - register_regular_expression(value); + register_expression(value); return m_lowered_ops.insert(pos, value); } @@ -296,22 +217,15 @@ LinearIR::exprIt LinearIR::insert(constExprIt pos, exprIt begin, exprIt end) { LinearIR::exprIt LinearIR::insert(constExprIt pos, constExprIt begin, constExprIt end) { for (auto b = begin; b != end; b++) - register_regular_expression(*b); + register_expression(*b); return m_lowered_ops.insert(pos, begin, end); } LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const NodeVector& nodes) { auto ret = m_lowered_ops.end(); for (const auto& n : nodes) { - std::vector input_tds; - for (const auto& in : n->inputs()) { - const auto& out = in.get_source_output(); - const auto& parent_out_tds = m_node2expression_map[out.get_node_shared_ptr()]->get_outputs(); - input_tds.push_back(parent_out_tds[out.get_index()]); - } - // Note that output tds must be empty since they are filled automatically from rt_info and/or tensor shapes - const auto& expr = std::make_shared(n, input_tds); - register_regular_expression(expr); + const auto& expr = create_expression(n); + register_expression(expr); ret = m_lowered_ops.insert(pos, expr); } // Need to return iterator to the first of the inserted values @@ -319,15 +233,8 @@ LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const NodeVector& n } LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const std::shared_ptr& n) { - std::vector input_tds; - for (const auto& in : n->inputs()) { - const auto& out = in.get_source_output(); - const auto& parent_out_tds = m_node2expression_map[out.get_node_shared_ptr()]->get_outputs(); - input_tds.push_back(parent_out_tds[out.get_index()]); - } - // Note that output tds must be empty since they are filled automatically from rt_info and/or tensor shapes - const auto& expr = std::make_shared(n, input_tds); - register_regular_expression(expr); + const auto& 
expr = create_expression(n); + register_expression(expr); return m_lowered_ops.insert(pos, expr); } diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index cf2caeea807631..2e6d41fbde580f 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,7 +5,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" -#include "snippets/tensor_descriptor.hpp" +#include "snippets/utils.hpp" #include #include @@ -44,8 +44,7 @@ void LinearIR::LoopManager::get_loop_bounds(const LinearIR &linear_ir, LinearIR::constExprIt &loop_begin_pos, LinearIR::constExprIt &loop_end_pos) const { const auto loop_info = get_loop_info(loop_id); - get_loop_bounds(linear_ir, loop_info->entry_exprs, loop_info->exit_exprs, loop_begin_pos, loop_end_pos, - loop_id); + get_loop_bounds(linear_ir, loop_info->entry_exprs, loop_info->exit_exprs, loop_begin_pos, loop_end_pos, loop_id); } void LinearIR::LoopManager::get_loop_bounds(const LinearIR &linear_ir, @@ -56,7 +55,8 @@ void LinearIR::LoopManager::get_loop_bounds(const LinearIR &linear_ir, size_t loop_id) { OPENVINO_ASSERT(!entries.empty(), "Loop must have entry points"); OPENVINO_ASSERT(!exits.empty(), "Loop must have entry points"); - loop_begin_pos = std::find(linear_ir.begin(), linear_ir.end(), entries.front().expr); + const auto& entry_expr = entries.front().get_expr(); + loop_begin_pos = std::find(linear_ir.begin(), linear_ir.end(), entry_expr); OPENVINO_ASSERT(loop_begin_pos != linear_ir.end(), "Loop begin hasn't been found!"); // Some operations in Loop can be before first entry points: Scalars, VectorBuffer. 
@@ -68,12 +68,12 @@ void LinearIR::LoopManager::get_loop_bounds(const LinearIR &linear_ir, } // At the moment all Loops must have exit points - loop_end_pos = std::next(std::find(loop_begin_pos, linear_ir.end(), exits.back().expr)); + const auto& exit_expr = exits.back().get_expr(); + loop_end_pos = std::next(std::find(loop_begin_pos, linear_ir.end(), exit_expr)); OPENVINO_ASSERT(loop_end_pos != linear_ir.end(), "Loop end hasn't been found!"); } -void LinearIR::LoopManager::get_io_loop_ports(LinearIR &linear_ir, - LinearIR::constExprIt loop_begin_pos, +void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, std::vector &entries, std::vector &exits) { @@ -81,24 +81,21 @@ void LinearIR::LoopManager::get_io_loop_ports(LinearIR &linear_ir, exits.clear(); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { const auto& expr = *expr_it; - const auto inputs = expr->get_inputs(); - const auto outputs = expr->get_outputs(); - - for (size_t in_port = 0; in_port < inputs.size(); ++in_port) { - const auto in_td = inputs[in_port]; - const auto parent_expr = linear_ir.get_expr_by_output(in_td).expr; + for (size_t i = 0; i < expr->get_input_count(); ++i) { + const auto in_port = expr->get_input_port(i); + const auto& parent_expr = in_port.get_connected_ports().begin()->get_expr(); if (!ov::is_type(parent_expr->get_node()) && std::find(loop_begin_pos, expr_it, parent_expr) == expr_it) { - entries.push_back(expr->input_port(in_port)); + entries.push_back(in_port); } } - - for (size_t out_port = 0; out_port < outputs.size(); ++out_port) { - const auto out_td = outputs[out_port]; - const auto consumer_exprs = linear_ir.get_exprs_by_input(out_td); - for (const auto& conumer_expr : consumer_exprs) { - if (std::find(expr_it, loop_end_pos, conumer_expr.expr) == loop_end_pos) { - exits.push_back(expr->output_port(out_port)); + for (size_t i = 0; i < expr->get_output_count(); ++i) { + const auto out_port = 
expr->get_output_port(i); + const auto consumer_ports = out_port.get_connected_ports(); + for (const auto& consumer : consumer_ports) { + const auto& consumer_expr = consumer.get_expr(); + if (std::find(expr_it, loop_end_pos, consumer_expr) == loop_end_pos) { + exits.push_back(out_port); break; } } @@ -106,88 +103,84 @@ void LinearIR::LoopManager::get_io_loop_ports(LinearIR &linear_ir, } } -void LinearIR::LoopManager::skipped_mark(LinearIR::constExprIt loop_begin_pos, - LinearIR::constExprIt loop_end_pos, - size_t loop_depth) { - const auto loop_ids = std::vector(loop_depth, Expression::LOOP_NULL_ID); - for (auto& expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { - const auto expr = *expr_it; - expr->set_loop_ids(loop_ids); - } -} - -void LinearIR::LoopManager::mark_loop(LinearIR &linear_ir, - LinearIR::constExprIt loop_begin_pos, +void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t loop_depth, size_t vector_size) { std::vector loop_entry_points, loop_exit_points; - LoopManager::get_io_loop_ports(linear_ir, loop_begin_pos, loop_end_pos, loop_entry_points, - loop_exit_points); + LoopManager::get_io_loop_ports(loop_begin_pos, loop_end_pos, loop_entry_points, loop_exit_points); - auto broadcast = [](std::vector &lhs, const std::vector &rhs) -> void { + auto broadcast = [](std::vector& lhs, const std::vector& rhs, size_t index) -> void { if (rhs == lhs) return; const auto lhs_size = lhs.size(); const auto rhs_size = rhs.size(); const auto size = std::max(lhs_size, rhs_size); - std::vector result(size, 1); lhs.resize(size, 1); - for (size_t i = 0; i < size; ++i) { - const auto lhs_value = i < lhs_size ? *(lhs.crbegin() + i) : 1; - const auto rhs_value = i < rhs_size ? 
*(rhs.crbegin() + i) : 1; - OPENVINO_ASSERT(lhs_value == rhs_value || lhs_value == 1 || rhs_value == 1, - "Output shapes of Loop must be broadcastable!"); - *(lhs.rbegin() + i) = std::max(lhs_value, rhs_value); - } + OPENVINO_ASSERT(index < size, "Incorrect index for broadcasting"); + const auto lhs_value = index < lhs_size ? *(lhs.crbegin() + index) : 1; + const auto rhs_value = index < rhs_size ? *(rhs.crbegin() + index) : 1; + OPENVINO_ASSERT(lhs_value == rhs_value || lhs_value == 1 || rhs_value == 1, + "Output shapes of Loop must be broadcastable!"); + *(lhs.rbegin() + index) = std::max(lhs_value, rhs_value); + }; + + auto is_outside_loop = [](const std::vector& subtensor) { + return std::all_of(subtensor.begin(), subtensor.end(), [](size_t lhs) { return lhs == PortDescriptor::ServiceDimensions::FULL_DIM; }); }; std::vector loop_subtensor; - std::vector loop_layout; - std::vector loop_tensor(1, 1); // Scalar + std::vector loop_tensor(loop_depth, 1); for (const auto& exit_point : loop_exit_points) { - const auto expr = exit_point.expr; - const auto port = exit_point.port; - const auto out_td = expr->get_outputs()[port]; - const auto out_tensor = out_td->get_tensor(); - const auto out_layout = out_td->get_layout(); - broadcast(loop_tensor, out_tensor); - if (loop_layout.empty()) - loop_layout = out_layout; - OPENVINO_ASSERT(loop_layout == out_layout, "Output layouts of Loop must be the same!"); - } + const auto& desc = exit_point.get_descriptor_ptr(); + const auto tensor = utils::get_reordered_planar_shape(ov::PartialShape(desc->get_shape()), desc->get_layout()).get_shape(); + auto subtensor = desc->get_subtensor(); + if (subtensor.empty()) { + subtensor.resize(loop_depth, 1); + subtensor[subtensor.size() - 1] = vector_size; + } - for (const auto& entry_point : loop_entry_points) { - const auto expr = entry_point.expr; - const auto out_td = expr->get_outputs().front(); - const auto out_subtensor = out_td->get_subtensor(); + const size_t resizing_value = 
is_outside_loop(subtensor) ? PortDescriptor::ServiceDimensions::FULL_DIM : 1; + while (subtensor.size() < loop_depth) + subtensor.insert(subtensor.begin(), resizing_value); if (loop_subtensor.empty()) - loop_subtensor = out_subtensor; - OPENVINO_ASSERT(loop_subtensor == out_subtensor, "Subtensors of Loop must be the same!"); + loop_subtensor = subtensor; + + OPENVINO_ASSERT(std::equal(loop_subtensor.crbegin(), loop_subtensor.crbegin() + loop_depth, subtensor.crbegin()), + "Incorrect scheduling parameters for loop"); + + for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) { + if (*(subtensor.rbegin() + dim_idx) != PortDescriptor::ServiceDimensions::FULL_DIM) { + broadcast(loop_tensor, tensor, dim_idx); + } + } } for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) { + if (*(loop_subtensor.rbegin() + dim_idx) == PortDescriptor::ServiceDimensions::FULL_DIM) { + exprs_marking(loop_begin_pos, loop_end_pos, Expression::LOOP_NULL_ID, loop_depth - dim_idx - 1); + continue; + } + OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup"); - const auto dim = loop_layout.size() >= dim_idx ? *(loop_layout.rbegin() + dim_idx) : 0; - const auto work_amount = loop_tensor.size() > dim ? loop_tensor[dim] : 0; + const auto work_amount = + loop_tensor.size() > dim_idx ? *(loop_tensor.rbegin() + dim_idx) + : 0; const auto work_amount_increment = - loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx) : - dim_idx == 0 ? vector_size : 1; - - mark_loop(linear_ir, loop_begin_pos, loop_end_pos, loop_depth - dim_idx - 1, work_amount, + loop_subtensor.size() > dim_idx ? *(loop_subtensor.rbegin() + dim_idx) + : (dim_idx == 0 ? 
vector_size : 1); + mark_loop(loop_begin_pos, loop_end_pos, loop_depth - dim_idx - 1, work_amount, work_amount_increment, loop_entry_points, loop_exit_points); } } -void LinearIR::LoopManager::mark_loop(LinearIR &linear_ir, - LinearIR::constExprIt loop_begin_pos, +void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos, size_t idx, size_t work_amount, size_t work_amount_increment, const std::vector &entries, const std::vector &exits) { - const auto loop_info = std::make_shared( - work_amount, work_amount_increment, entries, exits); + const auto loop_info = std::make_shared(work_amount, work_amount_increment, entries, exits); const auto loop_id = this->add_loop_info(loop_info); exprs_marking(loop_begin_pos, loop_end_pos, loop_id, idx); } diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index 9e17b573aa274e..a22c8e19549634 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -21,10 +21,10 @@ void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const Expressi // Propagate to up: in Store. 
Buffer can have only one Store { if (buffer->is_intermediate_memory()) { - OPENVINO_ASSERT(buffer_expr->get_inputs().size() == 1, "Buffer with intermediate memory must have one parent"); - const auto& parent_output = linear_ir.get_expr_by_output(buffer_expr->get_inputs()[0]); - const auto& parent_expr = parent_output.expr; - const auto port = parent_output.port; + OPENVINO_ASSERT(buffer_expr->get_input_tensors().size() == 1, "Buffer with intermediate memory must have one parent"); + const auto& parent_output = buffer_expr->get_input_tensor(0)->get_source(); + const auto& parent_expr = parent_output.get_expr(); + const auto port = parent_output.get_index(); const auto& parent_node = parent_expr->get_node(); auto memory_access = ov::as_type_ptr(parent_node); if (memory_access && memory_access->is_memory_access_output_port(port)) { @@ -36,10 +36,10 @@ void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const Expressi } } // Propagate to down: in Load. Buffer can have several Load - const auto& buffer_out = buffer_expr->get_outputs()[0]; - for (const auto& child_expr_input : linear_ir.get_exprs_by_input(buffer_out)) { - const auto& child_expr = child_expr_input.expr; - const auto port = child_expr_input.port; + const auto& buffer_out = buffer_expr->get_output_tensor(0); + for (const auto& child_expr_input : buffer_out->get_consumers()) { + const auto& child_expr = child_expr_input.get_expr(); + const auto port = child_expr_input.get_index(); const auto& child_node = child_expr->get_node(); auto memory_access = ov::as_type_ptr(child_node); if (memory_access && memory_access->is_memory_access_input_port(port)) { @@ -61,7 +61,8 @@ bool AllocateBuffers::run(LinearIR& linear_ir) { bool modified = false; size_t offset = 0; for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { - if (auto buffer = as_type_ptr(expr_it->get()->get_node())) { + const auto& expr = *expr_it; + if (auto buffer = as_type_ptr(expr->get_node())) { const auto 
buffer_size = buffer->get_byte_size(); // If it's the first buffer, offsets are zero => nothing to propagate, can continue if (m_buffer_scratchpad_size == 0) { @@ -70,7 +71,7 @@ bool AllocateBuffers::run(LinearIR& linear_ir) { } if (buffer->is_intermediate_memory()) { - const auto& parent_expr = linear_ir.get_expr_by_output(expr_it->get()->get_inputs()[0]).expr; + const auto& parent_expr = expr->get_input_tensor(0)->get_source().get_expr(); const auto& parent_node = parent_expr->get_node(); // Full MemoryAccess ops need new memory. Previous logic is to check for parent isn't Loop // TODO: It should be unified in MemoryManager with memory reuse in the near future diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index 1d770d1b5e6c5e..92633245e1b036 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -19,8 +19,8 @@ namespace pass { bool AssignRegisters::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::AssignRegisters") using Reg = size_t; - using tensor = snippets::TensorDescriptorPtr; - auto& expressions = linear_ir.get_ops(); + using tensor = TensorPtr; + const auto& expressions = linear_ir.get_ops(); std::vector> typed_ops; NodeVector ops; @@ -47,38 +47,38 @@ bool AssignRegisters::run(LinearIR& linear_ir) { auto op = expr->get_node(); if (const auto io_expr = std::dynamic_pointer_cast(expr)) { if (io_expr->get_type() == IOExpression::io_type::INPUT) - manually_assigned_gprs[expr->get_outputs()[0]] = io_expr->get_index(); + manually_assigned_gprs[expr->get_output_tensor(0)] = io_expr->get_index(); else if (io_expr->get_type() == IOExpression::io_type::OUTPUT) - manually_assigned_gprs[expr->get_inputs()[0]] = num_parameters + io_expr->get_index(); + manually_assigned_gprs[expr->get_input_tensor(0)] = num_parameters + io_expr->get_index(); else 
OPENVINO_THROW("Unsupported io_type detected"); } else if (const auto& buffer = ov::as_type_ptr(op)) { const auto buffer_id = buffer->get_id(); // All buffers have one common data pointer if (buffer->is_intermediate_memory()) { - manually_assigned_gprs[expr->get_inputs()[0]] = + manually_assigned_gprs[expr->get_input_tensor(0)] = static_cast(num_results + num_parameters + buffer_id); } - manually_assigned_gprs[expr->get_outputs()[0]] = + manually_assigned_gprs[expr->get_output_tensor(0)] = static_cast(num_results + num_parameters + buffer_id); } else if (ov::is_type(op) || ov::is_type(op)) { // Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer. // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator // TODO [96351]: We should rewrite accumulator pattern using another way - const auto input_td = expr->get_inputs()[0]; - const auto& input_expr = linear_ir.get_expr_by_output(input_td).expr; - const auto& input_expr_input_tds = input_expr->get_inputs(); - for (const auto& td : input_expr_input_tds) { - if (ov::is_type(linear_ir.get_expr_by_output(td).expr->get_node())) { - manually_assigned_vecs[td] = static_cast(accumulator_reg); + const auto& input_tensor = expr->get_input_tensor(0); + const auto& input_expr = input_tensor->get_source().get_expr(); + const auto& input_expr_input_tensors = input_expr->get_input_tensors(); + for (const auto& tensor : input_expr_input_tensors) { + if (ov::is_type(tensor->get_source().get_expr()->get_node())) { + manually_assigned_vecs[tensor] = static_cast(accumulator_reg); } } - const auto output_td = expr->get_outputs()[0]; - manually_assigned_vecs[input_td] = static_cast(accumulator_reg); - manually_assigned_vecs[output_td] = static_cast(accumulator_reg); - for (const auto& child_expr_input : linear_ir.get_exprs_by_input(output_td)) { - if (ov::is_type(child_expr_input.expr->get_node())) { - 
manually_assigned_vecs[child_expr_input.expr->get_outputs()[0]] = + const auto& output_tensor = expr->get_output_tensor(0); + manually_assigned_vecs[input_tensor] = static_cast(accumulator_reg); + manually_assigned_vecs[output_tensor] = static_cast(accumulator_reg); + for (const auto& child_expr_input : output_tensor->get_consumers()) { + if (ov::is_type(child_expr_input.get_expr()->get_node())) { + manually_assigned_vecs[child_expr_input.get_expr()->get_output_tensor(0)] = static_cast(accumulator_reg); } } @@ -86,11 +86,11 @@ bool AssignRegisters::run(LinearIR& linear_ir) { // TODO: Fix via common pipeline using LoopEnd: // All operations `outside loop` after Horizon ops should have the same register to avoid using it in the next Loop const auto current_loops_ids = expr->get_loop_ids(); - auto next_expr = linear_ir.get_exprs_by_input(output_td).begin()->expr; + auto next_expr = output_tensor->get_consumers().begin()->get_expr(); while (next_expr->get_loop_ids() == current_loops_ids) { - manually_assigned_vecs[next_expr->get_outputs()[0]] = + manually_assigned_vecs[next_expr->get_output_tensor(0)] = static_cast(accumulator_reg); - next_expr = linear_ir.get_exprs_by_input(next_expr->get_outputs()[0]).begin()->expr; + next_expr = next_expr->get_output_tensor(0)->get_consumers().begin()->get_expr(); } accumulator_reg++; @@ -103,11 +103,11 @@ bool AssignRegisters::run(LinearIR& linear_ir) { decltype(regs_vec)& reg_map, const std::map& manually_assigned_regs, size_t& counter) { - for (const auto& out_td : expr->get_outputs()) { + for (const auto& out_tensor : expr->get_output_tensors()) { // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) // so we have to check that the tensor has not been enumerated already - if (reg_map.count(out_td) == 0) { - reg_map[out_td] = manually_assigned_regs.count(out_td) == 0 ? 
counter++ : IS_MANUALLY_ALLOCATED_REG; + if (reg_map.count(out_tensor) == 0) { + reg_map[out_tensor] = manually_assigned_regs.count(out_tensor) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG; } } }; @@ -143,9 +143,9 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (size_t i = 0; i < typed_ops.size(); i++) { const auto& t_op = typed_ops[i]; std::vector used_tensors, defined_tensors; - for (const auto& in : t_op.second->get_inputs()) + for (const auto& in : t_op.second->get_input_tensors()) used_tensors.push_back(in); - for (const auto& out : t_op.second->get_outputs()) + for (const auto& out : t_op.second->get_output_tensors()) defined_tensors.push_back(out); switch (t_op.first) { case Generator::opRegType::vec2vec: @@ -191,9 +191,9 @@ bool AssignRegisters::run(LinearIR& linear_ir) { const auto& expr = typed_ops[n].second; if (is_type(expr->get_node()) || is_type(expr->get_node())) continue; - for (const auto& out : expr->get_outputs()) { - for (const auto& child_expr_input : linear_ir.get_exprs_by_input(out)) { - const auto& child_expr = child_expr_input.expr; + for (const auto& out : expr->get_output_tensors()) { + for (const auto& child_expr_input : out->get_consumers()) { + const auto& child_expr = child_expr_input.get_expr(); auto child_it = linear_ir.begin(); std::advance(child_it, n); size_t k = n; @@ -304,8 +304,7 @@ bool AssignRegisters::run(LinearIR& linear_ir) { std::map assigned_regs(std::move(manually_assigned_gprs)); assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); - auto register_assigned_regs = [=, &assigned_regs](const std::map& unique_regs, - const std::map& unique2reused) { + auto register_assigned_regs = [=, &assigned_regs](const std::map& unique_regs, const std::map& unique2reused) { for (const auto& reg : unique_regs) { if (reg.second == IS_MANUALLY_ALLOCATED_REG) continue; @@ -320,10 +319,10 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (auto& t_op : typed_ops) { RegInfo rinfo; const auto& expr 
= t_op.second; - for (const auto& in : expr->get_inputs()) { + for (const auto& in : expr->get_input_tensors()) { rinfo.first.push_back(assigned_regs[in]); } - for (const auto& out : expr->get_outputs()) { + for (const auto& out : expr->get_output_tensors()) { rinfo.second.push_back(assigned_regs[out]); } t_op.second->set_reg_info(rinfo); diff --git a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp index b35043e132b39c..0b82c1d866a693 100644 --- a/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp +++ b/src/common/snippets/src/lowered/pass/cleanup_loop_offsets.cpp @@ -35,13 +35,13 @@ bool CleanupLoopOffsets::run(LinearIR& linear_ir) { } if (auto outer_loop_end = as_type_ptr(next_node)) { auto fin_offsets = loop_end->get_finalization_offsets(); - std::unordered_map per_tensor_offset; - const auto& loop_inputs = expr_it->get()->get_inputs(); + std::unordered_map per_tensor_offset; + const auto& loop_inputs = expr_it->get()->get_input_tensors(); for (size_t i = 0; i < fin_offsets.size(); i++) per_tensor_offset[loop_inputs[i]] = i; auto outer_ptr_increments = outer_loop_end->get_ptr_increments(); - const auto& outer_loop_inputs = next_expr_it->get()->get_inputs(); + const auto& outer_loop_inputs = next_expr_it->get()->get_input_tensors(); for (size_t i = 0; i < outer_ptr_increments.size(); i++) { const auto& managed_tensor = outer_loop_inputs[i]; const auto& found = per_tensor_offset.find(managed_tensor); diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 2f49ce4aca13ee..f70e33e68ab23f 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -29,30 +29,25 @@ bool FuseLoops::can_be_fused(const LoopInfoPtr& loop_current, const LoopInfoPtr& return supported_work_amount && supported_increment; } -void FuseLoops::fuse_points(LinearIR& linear_ir, 
std::vector& exit_points, std::vector& entry_points, - LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos) { +void FuseLoops::fuse_points(std::vector& exit_points, std::vector& entry_points, + LinearIR::constExprIt loop_begin_pos, LinearIR::constExprIt loop_end_pos) { std::vector new_exit_points; for (const auto& exit_point : exit_points) { - const auto expr = exit_point.expr; - const auto port = exit_point.port; - const auto output_td = expr->get_outputs()[port]; - const auto consumers_inputs = linear_ir.get_exprs_by_input(output_td); + const auto consumers_inputs = exit_point.get_connected_ports(); - std::vector mapped_entry_points; - std::vector outside_consumers; + std::set mapped_entry_points; + std::set outside_consumers; for (const auto& consumer_input : consumers_inputs) { - const auto consumer = consumer_input.expr; - const auto consumer_port = consumer_input.port; - const auto consumer_point = consumer->input_port(consumer_port); - const auto entry_point_it = std::find(entry_points.begin(), entry_points.end(), consumer_point); + const auto entry_point_it = std::find(entry_points.begin(), entry_points.end(), consumer_input); if (entry_point_it != entry_points.end()) { - mapped_entry_points.push_back(*entry_point_it); + mapped_entry_points.insert(*entry_point_it); continue; } + const auto& consumer = consumer_input.get_expr(); const auto inside_it = std::find(loop_begin_pos, loop_end_pos, consumer); if (inside_it == loop_end_pos) { - outside_consumers.push_back(consumer); + outside_consumers.insert(consumer); } } @@ -72,10 +67,9 @@ void FuseLoops::fuse_points(LinearIR& linear_ir, std::vector& ex exit_points = new_exit_points; } -bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, - const ExpressionPort& current_entry_point, const ExpressionPort& target_exit_point, - size_t current_loop_id, size_t target_loop_id, size_t dim_idx, - LinearIR::constExprIt& current_loop_begin_pos, 
LinearIR::constExprIt& current_loop_end_pos) { +bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPort& current_entry_point, + size_t current_loop_id, size_t target_loop_id, size_t dim_idx, + LinearIR::constExprIt& current_loop_begin_pos, LinearIR::constExprIt& current_loop_end_pos) { const auto& loop_current = loop_manager->get_loop_info(current_loop_id); const auto& loop_target = loop_manager->get_loop_info(target_loop_id); if (!can_be_fused(loop_current, loop_target)) @@ -89,13 +83,10 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo bool is_fusion_allowed = true; for (size_t i = 0; i < loop_target->exit_exprs.size() && is_fusion_allowed; ++i) { const auto target_exit_point = loop_target->exit_exprs[i]; - const auto target_exit_expr = target_exit_point.expr; - const auto port = target_exit_point.port; - const auto output_td = target_exit_expr->get_outputs()[port]; - const auto consumer_inputs = linear_ir.get_exprs_by_input(output_td); + const auto consumer_inputs = target_exit_point.get_connected_ports(); for (const auto& consumer_input : consumer_inputs) { - const auto consumer = consumer_input.expr; - if (ov::is_type(consumer->get_node()) || consumer == current_entry_point.expr) + const auto& consumer = consumer_input.get_expr(); + if (ov::is_type(consumer->get_node()) || consumer == current_entry_point.get_expr()) continue; // The fusing is only valid if target Loop consumer (the Consumer is outside of target Loop) // is after current Loop (after Loop_down). 
@@ -113,7 +104,7 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo auto current_exit_points = loop_current->exit_exprs; auto target_entry_points = loop_target->entry_exprs; auto target_exit_points = loop_target->exit_exprs; - fuse_points(linear_ir, target_exit_points, current_entry_points, target_loop_begin_pos, target_loop_end_pos); + fuse_points(target_exit_points, current_entry_points, target_loop_begin_pos, target_loop_end_pos); const auto insertion_place = current_loop_begin_pos; const auto is_move_needed = target_loop_end_pos != current_loop_begin_pos; @@ -146,10 +137,9 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LinearIR::Loo return true; } -bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, - const ExpressionPort& current_exit_point, const ExpressionPort& target_entry_point, - size_t current_loop_id, size_t target_loop_id, size_t dim_idx, - LinearIR::constExprIt& current_loop_begin_pos, LinearIR::constExprIt& current_loop_end_pos) { +bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, const ExpressionPort& current_exit_point, + size_t current_loop_id, size_t target_loop_id, size_t dim_idx, + LinearIR::constExprIt& current_loop_begin_pos, LinearIR::constExprIt& current_loop_end_pos) { const auto& loop_current = loop_manager->get_loop_info(current_loop_id); const auto& loop_target = loop_manager->get_loop_info(target_loop_id); if (!can_be_fused(loop_current, loop_target)) @@ -160,12 +150,9 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo bool is_fusion_allowed = true; for (size_t i = 0; i < loop_target->entry_exprs.size() && is_fusion_allowed; ++i) { const auto target_entry_point = loop_target->entry_exprs[i]; - const auto target_entry_expr = target_entry_point.expr; - const auto port = target_entry_point.port; - const auto input_td = 
target_entry_expr->get_inputs()[port]; - const auto parent_expr_output = linear_ir.get_expr_by_output(input_td); - const auto parent_expr = parent_expr_output.expr; - if (ov::is_type(parent_expr->get_node()) || parent_expr == current_exit_point.expr) + const auto parent_expr_output = *target_entry_point.get_connected_ports().begin(); + const auto& parent_expr = parent_expr_output.get_expr(); + if (ov::is_type(parent_expr->get_node()) || parent_expr == current_exit_point.get_expr()) continue; is_fusion_allowed = parent_expr->get_loop_ids()[dim_idx] == current_loop_id || // The parent expr is from the same current Loop std::find(linear_ir.cbegin(), current_loop_begin_pos, parent_expr) != current_loop_begin_pos; // The parent is before current Loop @@ -182,7 +169,7 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LinearIR::Loo auto current_exit_points = loop_current->exit_exprs; auto target_entry_points = loop_target->entry_exprs; auto target_exit_points = loop_target->exit_exprs; - fuse_points(linear_ir, current_exit_points, target_entry_points, current_loop_begin_pos, current_loop_end_pos); + fuse_points(current_exit_points, target_entry_points, current_loop_begin_pos, current_loop_end_pos); const auto insertion_place = current_loop_end_pos; const auto is_move_needed = insertion_place != target_loop_begin_pos; @@ -268,12 +255,8 @@ bool FuseLoops::run(LinearIR& linear_ir) { bool was_fusion_up = false; for (size_t in_port = 0; in_port < entry_points.size() && !was_fusion_up; ++in_port) { const auto entry_point = entry_points[in_port]; - const auto entry_expr = entry_point.expr; - const auto port = entry_point.port; - const auto input_td = entry_expr->get_inputs()[port]; - const auto parent_expr_output = linear_ir.get_expr_by_output(input_td); - const auto parent_expr = parent_expr_output.expr; - const auto out_port = parent_expr_output.port; + const auto parent_expr_output = *entry_point.get_connected_ports().begin(); + const auto& parent_expr = 
parent_expr_output.get_expr(); const auto parent = parent_expr->get_node(); if (ov::is_type(parent) || ov::is_type(parent) || @@ -288,10 +271,8 @@ bool FuseLoops::run(LinearIR& linear_ir) { "Loops cannot have parents of entry points with the same identifier"); if (loop_id_target == Expression::LOOP_NULL_ID) continue; - const auto loop_info_target = loop_manager->get_loop_info(loop_id_target); - const auto target_exit_port = parent_expr->output_port(out_port); - if (fuse_upper_into_current(linear_ir, loop_manager, entry_point, target_exit_port, loop_id, loop_id_target, + if (fuse_upper_into_current(linear_ir, loop_manager, entry_point, loop_id, loop_id_target, dim_idx, loop_begin_pos, loop_end_pos)) { was_fusion_up = true; loop_manager->remove_loop_info(loop_id_target); @@ -309,13 +290,9 @@ bool FuseLoops::run(LinearIR& linear_ir) { bool was_fusion_down = false; for (size_t out_port = 0; out_port < exit_points.size() && !was_fusion_down; ++out_port) { const auto exit_point = exit_points[out_port]; - const auto exit_expr = exit_point.expr; - const auto port = exit_point.port; - const auto output_td = exit_expr->get_outputs()[port]; - const auto consumer_exprs_inputs = linear_ir.get_exprs_by_input(output_td); + const auto consumer_exprs_inputs = exit_point.get_connected_ports(); for (const auto& consumer_expr_input : consumer_exprs_inputs) { - const auto consumer_expr = consumer_expr_input.expr; - const auto in_port = consumer_expr_input.port; + const auto& consumer_expr = consumer_expr_input.get_expr(); const auto consumer = consumer_expr->get_node(); if (ov::is_type(consumer) || ov::is_type(consumer)) { @@ -331,9 +308,7 @@ bool FuseLoops::run(LinearIR& linear_ir) { if (loop_id == loop_id_target || loop_id_target == Expression::LOOP_NULL_ID) continue; - const auto loop_info_target = loop_manager->get_loop_info(loop_id_target); - const auto target_entry_port = consumer_expr->input_port(in_port); - if (fuse_lower_into_current(linear_ir, loop_manager, exit_point, 
target_entry_port, loop_id, loop_id_target, + if (fuse_lower_into_current(linear_ir, loop_manager, exit_point, loop_id, loop_id_target, dim_idx, loop_begin_pos, loop_end_pos)) { was_fusion_down = true; loop_manager->remove_loop_info(loop_id_target); diff --git a/src/common/snippets/src/lowered/pass/indentify_buffers.cpp b/src/common/snippets/src/lowered/pass/indentify_buffers.cpp index 769454c36aded2..621ac31be7d101 100644 --- a/src/common/snippets/src/lowered/pass/indentify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/indentify_buffers.cpp @@ -55,28 +55,25 @@ std::vector IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea for (size_t buffer_idx = 0; buffer_idx < buffers.size(); ++buffer_idx) { // Here intermediate Buffer const auto buffer_expr = buffers[buffer_idx]; - const auto buffer_input_tds = buffer_expr->get_inputs(); - OPENVINO_ASSERT(buffer_input_tds.size() == 1, "Intermediate Buffer must have one input"); const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - - const auto& buffer_td = buffer_input_tds.front(); - const auto buffer_siblings = linear_ir.get_exprs_by_input(buffer_td); + const auto& buffer_tensor = buffer_expr->get_input_tensor(0); + const auto buffer_siblings = buffer_tensor->get_consumers(); for (const auto& buffer_sibling : buffer_siblings) { - const auto& sibling_expr = buffer_sibling.expr; + const auto& sibling_expr = buffer_sibling.get_expr(); // Skip myself if (sibling_expr == buffer_expr) { continue; } else if (const auto loop_end = ov::as_type_ptr(sibling_expr->get_node())) { - const auto& loop_tds = sibling_expr->get_inputs(); + const auto& loop_tds = sibling_expr->get_input_tensors(); const auto input_count = loop_end->get_input_num(); const auto output_count = loop_end->get_output_num(); const auto& ptr_increments = loop_end->get_ptr_increments(); const auto& io_data_sizes = loop_end->get_element_type_sizes(); - const auto buffer_loop_port = std::distance(loop_tds.begin(), std::find(loop_tds.begin(), 
loop_tds.end(), buffer_td)); + const auto buffer_loop_port = std::distance(loop_tds.begin(), std::find(loop_tds.begin(), loop_tds.end(), buffer_tensor)); // Verify Buffers on Loop inputs: for (size_t input_idx = 0; input_idx < input_count; ++input_idx) { - const auto loop_in = linear_ir.get_expr_by_output(loop_tds[input_idx]).expr; + const auto& loop_in = loop_tds[input_idx]->get_source().get_expr(); if (const auto& neighbour_buffer = is_intermediate_buffer(loop_in->get_node())) { const auto neighbour_buffer_loop_port = input_idx; update_adj_matrix(buffer, buffer_idx, neighbour_buffer, @@ -88,12 +85,12 @@ std::vector IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea // Verify Buffers on Loop outputs for (size_t output_idx = 0; output_idx < output_count; ++output_idx) { // Skip the current Buffer - if (buffer_td == loop_tds[input_count + output_idx]) + if (buffer_tensor == loop_tds[input_count + output_idx]) continue; - const auto& consumer_inputs = linear_ir.get_exprs_by_input(loop_tds[input_count + output_idx]); + const auto consumer_inputs = loop_tds[input_count + output_idx]->get_consumers(); for (const auto& consumer_input : consumer_inputs) { - const auto& child_node = consumer_input.expr->get_node(); + const auto& child_node = consumer_input.get_expr()->get_node(); if (const auto& neighbour_buffer = is_intermediate_buffer(child_node)) { const auto neighbour_buffer_loop_port = input_count + output_idx; update_adj_matrix(buffer, buffer_idx, neighbour_buffer, diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 460997d547a14e..550a4b7e7b9552 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -24,12 +24,12 @@ void filter_ports(LinearIR& linear_ir, std::set> loop_parents; for (const auto& loop_entry_point : loop_entries) { - const auto& expr = loop_entry_point.expr; - const auto port = 
loop_entry_point.port; + const auto& expr = loop_entry_point.get_expr(); + const auto port = loop_entry_point.get_index(); const auto node = expr->get_node(); const auto ma = ov::as_type_ptr(node); if (ma && ma->is_memory_access_input_port(port)) { - const auto& parent_expr = linear_ir.get_expr_by_output(expr->get_inputs()[port]).expr; + const auto& parent_expr = loop_entry_point.get_connected_ports().begin()->get_expr(); const auto& parent = parent_expr->get_node(); // Todo: Sometimes several Load in one Loop read data from the same Node if (loop_parents.find(parent) == loop_parents.end()) { @@ -40,8 +40,8 @@ void filter_ports(LinearIR& linear_ir, } for (const auto& loop_exit_point : loop_exits) { - const auto& expr = loop_exit_point.expr; - const auto port = loop_exit_point.port; + const auto& expr = loop_exit_point.get_expr(); + const auto port = loop_exit_point.get_index(); const auto ma = ov::as_type_ptr(expr->get_node()); if (ma && ma->is_memory_access_output_port(port)) { new_loop_exits.push_back(loop_exit_point); @@ -68,57 +68,43 @@ InitLoops::InitLoops() : Transformation() {} std::vector InitLoops::init_ptr_increments(const std::vector& loop_inputs, const std::vector& loop_outputs, size_t dim_idx) const { - std::vector ptr_increments; - // Note: All loop inputs must have the same layout by definition. - // If this doesn't hold, then we're trying to inject loops in the wrong place. - const std::vector loop_layout{ - !loop_inputs.empty() ? loop_inputs.front().expr->get_inputs()[0]->get_layout() : - !loop_outputs.empty() ? loop_outputs.front().expr->get_outputs()[0]->get_layout() : - std::vector{}}; + std::vector ptr_increments; // Note: Need to find max relevant dim expr to account for broadcasting, collect relevant_dims as well - // Note: At the moment all loop_inputs and loop_outputs - are Load/Store ops in this method. 
- // So for example, we can call loop_input[i]->get_outputs().front() because Load have one output - size_t max_relevant_dim_size = 0; + size_t max_relevant_dim_size = 1; for (const auto& loop_input : loop_inputs) { - const auto& expr = loop_input.expr; - const auto out_td = expr->get_outputs().front(); - const auto& layout = out_td->get_layout(); - const auto& tensor = out_td->get_tensor(); + const auto& layout = loop_input.get_descriptor_ptr()->get_layout(); + const auto& shape = loop_input.get_descriptor_ptr()->get_shape(); const auto& dim = *(layout.rbegin() + dim_idx); - max_relevant_dim_size = std::max(tensor[dim], max_relevant_dim_size); + max_relevant_dim_size = std::max(shape[dim], max_relevant_dim_size); } for (const auto& loop_output : loop_outputs) { - const auto& expr = loop_output.expr; - const auto in_td = expr->get_inputs().front(); - const auto& layout = in_td->get_layout(); - const auto& tensor = in_td->get_tensor(); + const auto& layout = loop_output.get_descriptor_ptr()->get_layout(); + const auto& shape = loop_output.get_descriptor_ptr()->get_shape(); const auto& dim = *(layout.rbegin() + dim_idx); - max_relevant_dim_size = std::max(tensor[dim], max_relevant_dim_size); + max_relevant_dim_size = std::max(shape[dim], max_relevant_dim_size); } + for (const auto& loop_input : loop_inputs) { - const auto& expr = loop_input.expr; - const auto out_td = expr->get_outputs().front(); - const auto& layout = out_td->get_layout(); - const auto& tensor = out_td->get_tensor(); + // For strides we have to use layout from source since source writes data by special rules + const auto source = *loop_input.get_connected_ports().begin(); + const auto& layout = loop_input.get_descriptor_ptr()->get_layout(); + const auto& shape = loop_input.get_descriptor_ptr()->get_shape(); const auto& dim = *(layout.rbegin() + dim_idx); int64_t ptr_increment = 0; // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout - if (!(tensor[dim] == 1 
&& max_relevant_dim_size != 1)) - ptr_increment = get_dim_stride(dim, loop_layout, tensor); + if (!(shape[dim] == 1 && max_relevant_dim_size != 1)) + ptr_increment = get_dim_stride(dim, source.get_descriptor_ptr()->get_layout(), shape); ptr_increments.push_back(ptr_increment); } - // Note: Le already accounted for loop_input vs inside loops layout mismatch. So we need non-dense output - // ptr_increments only if loop_input_layout doesn't match loop_output_layout + for (const auto& loop_output : loop_outputs) { - const auto& expr = loop_output.expr; - const auto in_td = expr->get_inputs().front(); - const auto& layout = in_td->get_layout(); - const auto& tensor = in_td->get_tensor(); + const auto& layout = loop_output.get_descriptor_ptr()->get_layout(); + const auto& shape = loop_output.get_descriptor_ptr()->get_shape(); const auto& dim = *(layout.rbegin() + dim_idx); int64_t ptr_increment = 0; // If relevant dim is not broadcasted, then ptr_increment is the dim stride in the new layout - if (!(tensor[dim] == 1 && max_relevant_dim_size != 1)) - ptr_increment = get_dim_stride(dim, layout, tensor); + if (!(shape[dim] == 1 && max_relevant_dim_size != 1)) + ptr_increment = get_dim_stride(dim, layout, shape); ptr_increments.push_back(ptr_increment); } @@ -135,14 +121,14 @@ std::vector InitLoops::init_finalization_offsets(const std::vector InitLoops::init_element_type_sizes(const std::vector& loop_inputs, - const std::vector& loop_outputs) { + const std::vector& loop_outputs) { std::vector element_types; element_types.reserve(loop_inputs.size() + loop_outputs.size()); for (const auto& in : loop_inputs) { - element_types.push_back(in.expr->get_node()->get_input_element_type(in.port).size()); + element_types.push_back(in.get_expr()->get_node()->get_input_element_type(in.get_index()).size()); } for (const auto& out : loop_outputs) { - element_types.push_back(out.expr->get_node()->get_output_element_type(out.port).size()); + 
element_types.push_back(out.get_expr()->get_node()->get_output_element_type(out.get_index()).size()); } return element_types; } @@ -164,7 +150,7 @@ bool InitLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManager::Loop const auto io_data_sizes = init_element_type_sizes(loop_entries, loop_exits); const auto& loop_begin = std::make_shared(); - const auto& loop_begin_expr = std::make_shared(loop_begin); + const auto& loop_begin_expr = linear_ir.create_expression(loop_begin, std::vector{}); linear_ir.insert(loop_begin_pos, loop_begin_expr); const auto& loop_end = std::make_shared( @@ -172,14 +158,14 @@ bool InitLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManager::Loop io_data_sizes, loop_entries.size(), loop_exits.size()); loop_end->has_outer_loop = has_outer_loop; - std::vector loop_end_inputs; + std::vector loop_end_inputs; for (const auto& expr_port : loop_entries) - loop_end_inputs.push_back(expr_port.expr->get_inputs()[expr_port.port]); + loop_end_inputs.push_back(expr_port.get_expr()->get_input_tensor(expr_port.get_index())); for (const auto& expr_port : loop_exits) - loop_end_inputs.push_back(expr_port.expr->get_outputs()[expr_port.port]); - loop_end_inputs.push_back(linear_ir.get_expr_by_node(loop_begin)->get_outputs().front()); + loop_end_inputs.push_back(expr_port.get_expr()->get_output_tensor(expr_port.get_index())); + loop_end_inputs.push_back(loop_begin_expr->get_output_tensor(0)); - const auto& loop_end_expr = std::make_shared(loop_end, loop_end_inputs, std::vector{}); + const auto& loop_end_expr = linear_ir.create_expression(loop_end, loop_end_inputs); linear_ir.insert(loop_end_pos, loop_end_expr); return true; } diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index 09efcf3e4b47da..4958a8552d5133 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -19,7 +19,7 @@ 
InsertBuffers::InsertBuffers(int32_t buffer_allocation_rank) : Transformation(), m_buffer_allocation_rank(buffer_allocation_rank) {} LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, - const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { + const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { const auto up_loops = up_expr->get_loop_ids(); const auto down_loops = down_expr->get_loop_ids(); OPENVINO_ASSERT(up_loops.size() == down_loops.size(), "The Loop IDs must be normalized!"); @@ -58,15 +58,15 @@ LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_i } void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr& loop_manager, size_t loop_id, - const std::vector& loop_entries, const std::vector& loop_exits) { + const std::vector& loop_entries, const std::vector& loop_exits) { for (const auto& entry_point : loop_entries) { - const auto expr = entry_point.expr; - const auto port = entry_point.port; + const auto& expr = entry_point.get_expr(); + const auto port = entry_point.get_index(); const auto node = expr->get_node(); - const auto input_td = expr->get_inputs()[port]; - const auto parent_expr_output = linear_ir.get_expr_by_output(input_td); - const auto& parent_expr = parent_expr_output.expr; - const auto parent_port = parent_expr_output.port; + const auto& input_tensor = expr->get_input_tensor(port); + const auto& parent_expr_output = input_tensor->get_source(); + const auto& parent_expr = parent_expr_output.get_expr(); + const auto parent_port = parent_expr_output.get_index(); const auto parent = parent_expr->get_node(); if (ov::is_type(parent) || ov::is_type(parent) || @@ -103,33 +103,30 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPt // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); const 
auto buffer = std::make_shared(parent->output(parent_port), m_buffer_allocation_rank); - - const auto td = std::make_shared(input_td->get_tensor(), - input_td->get_subtensor(), - input_td->get_layout()); - const std::vector buffer_outs = { td }; - const std::vector parent_outs = { input_td }; - linear_ir.insert(pos, std::make_shared(buffer, parent_outs, buffer_outs)); - linear_ir.replace_input(expr, port, td); + PortManager::set_port_descriptor_ptr(buffer->output(0), parent_expr_output.get_descriptor_ptr()->clone()); + // Output tensor is automatically filled from PortDescriptor + const auto buffer_expr = linear_ir.create_expression(buffer, {input_tensor}); + linear_ir.insert(pos, buffer_expr); + linear_ir.replace_input(entry_point, buffer_expr->get_output_tensor(0)); } } for (const auto& exit_point : loop_exits) { - const auto expr = exit_point.expr; - const auto port = exit_point.port; + const auto& expr = exit_point.get_expr(); + const auto port = exit_point.get_index(); const auto node = expr->get_node(); - const auto output_td = expr->get_outputs()[port]; - const auto child_exprs_inputs = linear_ir.get_exprs_by_input(output_td); + const auto output_tensor = exit_point.get_tensor_ptr(); + const auto child_exprs_inputs = output_tensor->get_consumers(); const auto current_loops = expr->get_loop_ids(); const auto current_loop_count = current_loops.size(); - const std::vector node_outs = {output_td}; + const std::vector node_outs = {output_tensor}; std::set potential_consumers; std::set buffers; const auto current_loop_lvl = std::distance(current_loops.begin(), std::find(current_loops.begin(), current_loops.end(), loop_id)); for (const auto& child_expr_input : child_exprs_inputs) { - const auto& child_expr = child_expr_input.expr; - const auto child_port = child_expr_input.port; + const auto& child_expr = child_expr_input.get_expr(); + const auto child_port = child_expr_input.get_index(); const auto& child = child_expr->get_node(); if (ov::is_type(child)) continue; 
@@ -164,13 +161,9 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPt // we should remove them to insert one common Buffer on one common port if (!buffers.empty()) { for (const auto& buffer : buffers) { - const auto buffer_out = buffer->get_outputs().front(); - const auto buffer_consumers_inputs = linear_ir.get_exprs_by_input(buffer_out); - for (const auto& consumer_input : buffer_consumers_inputs) { - const auto consumer = consumer_input.expr; - const auto consumer_port = consumer_input.port; - linear_ir.replace_input(consumer, consumer_port, output_td); - } + const auto& buffer_out = buffer->get_output_tensor(0); + const auto buffer_consumers_inputs = buffer_out->get_consumers(); + linear_ir.replace_input(buffer_consumers_inputs, output_tensor); potential_consumers.insert(buffer_consumers_inputs.begin(), buffer_consumers_inputs.end()); linear_ir.erase(std::find(linear_ir.begin(), linear_ir.end(), buffer)); } @@ -182,12 +175,10 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPt // Need to insert after 2nd Loops // Note: All potential consumers must have the same count of first equal Loop identifies and the same count of different last identifies // TODO: Need to verify that - const auto pos = insertion_position(linear_ir, loop_manager, expr, (*potential_consumers.begin()).expr); + const auto pos = insertion_position(linear_ir, loop_manager, expr, (*potential_consumers.begin()).get_expr()); auto buffer = std::make_shared(node->output(port), m_buffer_allocation_rank); - const auto td = std::make_shared(output_td->get_tensor(), - output_td->get_subtensor(), - output_td->get_layout()); + PortManager::set_port_descriptor_ptr(buffer->output(0), exit_point.get_descriptor_ptr()->clone()); // We cannot insert Node output tensor on Buffer output because not all consumers of Node needs Buffer // Example: // Add @@ -195,13 +186,10 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const 
LinearIR::LoopManagerPt // Result Buffer // | <- It should be new TD // Relu - const std::vector buffer_outs = {td}; - linear_ir.insert(pos, std::make_shared(buffer, node_outs, buffer_outs)); - for (const auto& consumer_input : potential_consumers) { - const auto consumer = consumer_input.expr; - const auto consumer_port = consumer_input.port; - linear_ir.replace_input(consumer, consumer_port, td); - } + // Output tensor is automatically filled from PortDescriptor + const auto buffer_expr = linear_ir.create_expression(buffer, node_outs); + linear_ir.insert(pos, buffer_expr); + linear_ir.replace_input(potential_consumers, buffer_expr->get_output_tensor(0)); } } } @@ -234,10 +222,10 @@ bool InsertBuffers::run(LinearIR& linear_ir) { std::vector loop_entries(input_ports.size()), loop_exits(output_ports.size()); // C++17: for (auto const& [loop_id, loop_info] : loop_data_map) for (const auto& p : input_ports) { - loop_entries[p.first] = expr->input_port(p.first); + loop_entries[p.first] = expr->get_input_port(p.first); } for (const auto& p : output_ports) { - loop_exits[p.first] = expr->output_port(p.first); + loop_exits[p.first] = expr->get_output_port(p.first); } insertion(linear_ir, loop_manager, Expression::LOOP_NULL_ID, loop_entries, loop_exits); diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index f67ff2094382ec..c4931dfc1ad01a 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -33,7 +33,7 @@ using LoopInfoPtr = LoopManager::LoopInfoPtr; InsertLoadStore::InsertLoadStore(size_t vector_size) : m_vector_size(vector_size) {} void InsertLoadStore::update_loops(const LinearIR::LoopManagerPtr& loop_manager, const std::vector& loop_ids, - const ExpressionPort& actual_port, const std::vector& target_ports, bool is_entry) { + const ExpressionPort& actual_port, const std::vector& target_ports, bool 
is_entry) { for (auto loop_id : loop_ids) { if (loop_id != Expression::LOOP_NULL_ID) update_loop(loop_manager->get_loop_info(loop_id), actual_port, target_ports, is_entry); @@ -41,7 +41,7 @@ void InsertLoadStore::update_loops(const LinearIR::LoopManagerPtr& loop_manager, } void InsertLoadStore::update_loop(const LinearIR::LoopManager::LoopInfoPtr& loop_info, - const ExpressionPort& actual_port, const std::vector& target_ports, bool is_entry) { + const ExpressionPort& actual_port, const std::vector& target_ports, bool is_entry) { auto& ports = is_entry ? loop_info->entry_exprs : loop_info->exit_exprs; auto port_it = std::find(ports.begin(), ports.end(), actual_port); if (port_it == ports.end()) @@ -54,13 +54,13 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr const auto& loop_manager = linear_ir.get_loop_manager(); const auto& data_expr = *data_expr_it; const auto& data_node = data_expr->get_node(); - const auto& output_td = data_expr->get_outputs().front(); - const auto consumer_inputs = linear_ir.get_exprs_by_input(output_td); + const auto& output_tensor = data_expr->get_output_tensor(0); + const auto consumer_inputs = output_tensor->get_consumers(); bool was_inserted = false; for (const auto& consumer_input : consumer_inputs) { - const auto& consumer_expr = consumer_input.expr; - const auto port = consumer_input.port; + const auto& consumer_expr = consumer_input.get_expr(); + const auto port = consumer_input.get_index(); const auto& consumer = consumer_expr->get_node(); const auto ma = ov::as_type_ptr(consumer); if (ma && ma->is_memory_access_input_port(port)) @@ -71,21 +71,17 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr const auto inner_loop = get_inner_loop_id(loop_ids); OPENVINO_ASSERT(inner_loop != Expression::LOOP_NULL_ID, "Loop hasn't been found!"); - const auto load_td = std::make_shared(output_td->get_tensor(), - output_td->get_subtensor(), - output_td->get_layout()); const auto 
load = std::make_shared(data_node->output(0), m_vector_size); - const auto load_outs = std::vector{ load_td }; - const auto param_outs = std::vector{ output_td }; - const auto load_expr = std::make_shared(load, param_outs, load_outs); + PortManager::set_port_descriptor_ptr(load->output(0), consumer_input.get_descriptor_ptr()->clone()); + const auto load_expr = linear_ir.create_expression(load, {output_tensor}); linear_ir.insert(std::find(data_expr_it, linear_ir.cend(), consumer_expr), load_expr); - linear_ir.replace_input(consumer_expr, port, load_td); + linear_ir.replace_input(consumer_input, load_expr->get_output_tensor(0)); // Copy Loop identifies load_expr->set_loop_ids(loop_ids); // Need to update all the corresponding Loops with the same Entry Point const auto prev_entry_point = consumer_input; - const auto new_entry_point = load_expr->input_port(0); + const auto new_entry_point = load_expr->get_input_port(0); update_loops(loop_manager, loop_ids, prev_entry_point, {new_entry_point}, true); was_inserted = true; } @@ -96,10 +92,10 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { const auto& loop_manager = linear_ir.get_loop_manager(); const auto& data_expr = *data_expr_it; - const auto& input_td = data_expr->get_inputs().front(); - const auto parent_output = linear_ir.get_expr_by_output(input_td); - const auto& parent_expr = parent_output.expr; - const auto port = parent_output.port; + const auto& input_tensor = data_expr->get_input_tensor(0); + const auto& parent_output = input_tensor->get_source(); + const auto& parent_expr = parent_output.get_expr(); + const auto port = parent_output.get_index(); const auto& parent = parent_expr->get_node(); const auto ma = ov::as_type_ptr(parent); if (ma && ma->is_memory_access_output_port(port)) @@ -110,17 +106,13 @@ bool InsertLoadStore::insert_store(LinearIR& linear_ir, const 
LinearIR::constExp const auto inner_loop = get_inner_loop_id(loop_ids); OPENVINO_ASSERT(inner_loop != Expression::LOOP_NULL_ID, "Loop hasn't been found!"); - const auto store_td = std::make_shared(input_td->get_tensor(), - input_td->get_subtensor(), - input_td->get_layout()); const auto store = std::make_shared(parent->output(port), m_vector_size); - const auto store_outs = std::vector{ store_td }; - const auto param_outs = std::vector{ input_td }; - const auto store_expr = std::make_shared(store, param_outs, store_outs); + PortManager::set_port_descriptor_ptr(store->output(0), parent_output.get_descriptor_ptr()->clone()); + const auto store_expr = linear_ir.create_expression(store, {input_tensor}); const auto& reverse_insertion_pos = std::find(std::reverse_iterator(data_expr_it), linear_ir.crend(), parent_expr); const auto& insertion_pos = reverse_insertion_pos.base(); linear_ir.insert(insertion_pos, store_expr); - linear_ir.replace_input(data_expr, 0, store_td); + linear_ir.replace_input(data_expr->get_input_port(0), store_expr->get_output_tensor(0)); // Copy Loop identifies store_expr->set_loop_ids(loop_ids); @@ -128,13 +120,13 @@ bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExp const auto prev_exit_point = parent_output; // The previous exit point byt one output port can have several consumers that can be potential exit points // So we should verify on the possible future exit points - const auto consumer_inputs = linear_ir.get_exprs_by_input(input_td); + const auto consumer_inputs = input_tensor->get_consumers(); const auto should_be_saved = std::any_of(consumer_inputs.begin(), consumer_inputs.end(), [](const ExpressionPort& input_port) { - const auto& node = input_port.expr->get_node(); + const auto& node = input_port.get_expr()->get_node(); return ov::is_type(node) || ov::is_type(node); }); - const auto new_exit_point = store_expr->output_port(0); + const auto new_exit_point = store_expr->get_output_port(0); const auto 
new_exit_points = should_be_saved ? std::vector{prev_exit_point, new_exit_point} : std::vector{new_exit_point}; update_loops(loop_manager, loop_ids, prev_exit_point, new_exit_points, false); diff --git a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp index d9bed42e347d0f..cfdc9ab8ae66eb 100644 --- a/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp +++ b/src/common/snippets/src/lowered/pass/insert_tail_loop.cpp @@ -41,25 +41,27 @@ void InsertTailLoop::tail_transformations(LinearIR& linear_ir, ov::is_type(op))) { for (size_t i = 0; i < op->inputs().size(); ++i) { if (auto fill = insertFill(op->input(i))) { - std::vector inputs{expr_it->get()->get_inputs()[i]}; + const auto& input = expr_it->get()->get_input_tensor(i); + const auto consumers = input->get_consumers(); // Note: inputs == outputs, since we want to modify vector reg inplace - auto fill_expr = std::make_shared(fill, inputs, inputs); + auto fill_expr = linear_ir.create_expression(fill, {input}); + linear_ir.insert(expr_it, fill_expr); + linear_ir.replace_input(consumers, fill_expr->get_output_tensor(0)); auto reg = expr_it->get()->get_reg_info().first[i]; fill_expr->set_reg_info({{reg}, {reg}}); - linear_ir.insert(expr_it, fill_expr); } } } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { // FIXME: C++17 const auto& [port, desc] : memory_access->get_memory_access_input_ports() for (const auto p : memory_access->get_memory_access_input_ports()) { const auto port = p.first; - if (memory_access->is_memory_access_input_port(port) && memory_access->get_input_count(port) > 1) { + if (memory_access->get_input_count(port) > 1) { memory_access->set_input_count(tail_size, port); } } for (const auto p : memory_access->get_memory_access_output_ports()) { const auto port = p.first; - if (memory_access->is_memory_access_output_port(port) && memory_access->get_output_count(port) > 1) { + if 
(memory_access->get_output_count(port) > 1) { memory_access->set_output_count(tail_size, port); } } @@ -95,25 +97,25 @@ bool InsertTailLoop::run(LinearIR& linear_ir) { } }; auto is_loop_with_buffers = [&linear_ir](const std::shared_ptr& loop_end) { - auto is_buffer_input = [&linear_ir](const TensorDescriptorPtr& input) { - const auto parent_expr = linear_ir.get_expr_by_output(input).expr; + auto is_buffer_input = [&linear_ir](const TensorPtr& input) { + const auto& parent_expr = input->get_source().get_expr(); return ov::is_type(parent_expr->get_node()); }; - auto is_buffer_output = [&linear_ir](const TensorDescriptorPtr& output) { - const auto& child_exprs_inputs = linear_ir.get_exprs_by_input(output); + auto is_buffer_output = [&linear_ir](const TensorPtr& output) { + const auto child_exprs_inputs = output->get_consumers(); return std::any_of(child_exprs_inputs.begin(), child_exprs_inputs.end(), - [](const ExpressionPort& lp) {return ov::is_type(lp.expr->get_node());}); + [](const ExpressionPort& lp) {return ov::is_type(lp.get_expr()->get_node());}); }; - const auto loop_end_expr = linear_ir.get_expr_by_node(loop_end); - const auto inputs = loop_end_expr->get_inputs(); + const auto& loop_end_expr = linear_ir.get_expr_by_node(loop_end); + const auto inputs = loop_end_expr->get_input_tensors(); const auto in_num = loop_end->get_input_num(); const auto out_num = loop_end->get_output_num(); OPENVINO_ASSERT(inputs.size() == (in_num + out_num + 1), std::string("The LoopEnd expression must have the count of inputs is") + std::string("equal to count of input and outputs of Loop plus one for work amount")); - const std::vector loop_ins(inputs.begin(), inputs.begin() + in_num); - const std::vector loop_outs(inputs.begin() + in_num, inputs.begin() + in_num + out_num); + const std::vector loop_ins(inputs.begin(), inputs.begin() + in_num); + const std::vector loop_outs(inputs.begin() + in_num, inputs.begin() + in_num + out_num); return std::any_of(loop_ins.begin(), 
loop_ins.end(), is_buffer_input) || std::any_of(loop_outs.begin(), loop_outs.end(), is_buffer_output); }; diff --git a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp index 8a13cf2328d6c1..b9bcfce87f5394 100644 --- a/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/lowered/pass/load_movebroadcast_to_broadcastload.cpp @@ -19,21 +19,22 @@ bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir) { bool modified = false; for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { - const auto& op = (*expr_it)->get_node(); + const auto& expr = *expr_it; + const auto& op = expr->get_node(); // Match on MoveBroadcast because MoveBroadcast is rare node in bodies if (const auto move_broadcast = ov::as_type_ptr(op)) { - const auto interm_td = (*expr_it)->get_inputs().front(); - const auto parent_expr = linear_ir.get_expr_by_output(interm_td).expr; + const auto& interm_tensor = expr->get_input_tensor(0); + const auto parent_expr = interm_tensor->get_source().get_expr(); const auto load = ov::as_type_ptr(parent_expr->get_node()); if (!load) continue; // Cannot rewrite Broadcast + Load if load has more than 1 user // or more than one input, or if Broadcast has several inputs - const auto load_consumers_inputs = linear_ir.get_exprs_by_input(interm_td); + const auto load_consumers_inputs = interm_tensor->get_consumers(); size_t count = 0; for (const auto& consumer_expr_input : load_consumers_inputs) { - const auto consumer = consumer_expr_input.expr->get_node(); + const auto consumer = consumer_expr_input.get_expr()->get_node(); if (!ov::is_type(consumer)) count++; } @@ -41,15 +42,17 @@ bool LoadMoveBroadcastToBroadcastLoad::run(LinearIR& linear_ir) { if (count > 1) continue; - auto outshape = move_broadcast->get_output_partial_shape(0); - auto broadcastload = 
std::make_shared(load->input_value(0), outshape, load->get_offset()); - const auto in_td = std::vector{ parent_expr->get_inputs().front() }; - const auto out_td = std::vector{ (*expr_it)->get_outputs().front() }; + const auto& outshape = move_broadcast->get_output_partial_shape(0); + const auto broadcastload = std::make_shared(load->input_value(0), outshape, load->get_offset()); + const auto move_consumers = expr->get_output_tensor(0)->get_consumers(); + PortManager::set_port_descriptor_ptr(broadcastload->output(0), expr->get_output_port(0).get_descriptor_ptr()->clone()); + const auto broadcastload_expr = linear_ir.create_expression(broadcastload, { parent_expr->get_input_tensor(0) }); const auto mv_expr_it = expr_it; const auto insertion_pos = std::next(expr_it); + expr_it = linear_ir.insert(insertion_pos, broadcastload_expr); linear_ir.erase(std::find(linear_ir.begin(), mv_expr_it, parent_expr)); linear_ir.erase(mv_expr_it); - expr_it = linear_ir.insert(insertion_pos, std::make_shared(broadcastload, in_td, out_td)); + linear_ir.replace_input(move_consumers, broadcastload_expr->get_output_tensor(0)); modified |= true; } } diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 4380ec9ca41072..1b13dbcdbbd4b3 100644 --- a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -29,8 +29,15 @@ bool MarkLoops::run(LinearIR& linear_ir) { auto is_not_start_point = [](const std::shared_ptr& node) { return ov::is_type(node) || ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node); // Softmax is decomposed operation. 
The marking is in decomposition pass + ov::is_type(node); + }; + + auto are_conflicted = [](const ExpressionPort& lhs, const ExpressionPort& rhs) { + const auto& lhs_desc = lhs.get_descriptor_ptr(); + const auto& rhs_desc = rhs.get_descriptor_ptr(); + return lhs_desc->get_subtensor() != rhs_desc->get_subtensor() || + lhs_desc->get_layout() != rhs_desc->get_layout() || + lhs_desc->get_shape() != rhs_desc->get_shape(); }; for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { @@ -42,14 +49,7 @@ bool MarkLoops::run(LinearIR& linear_ir) { auto loop_begin_pos = expr_it; auto loop_end_pos = loop_begin_pos; - const auto& outputs = expr->get_outputs(); - const auto& loop_inner_layout = outputs.front()->get_layout(); - const auto& loop_inner_subtensor = outputs.front()->get_subtensor(); - const bool loop_is_outside = expr->is_outside_loop(); - const bool loop_is_inside = !loop_is_outside; - - bool current_is_outside = loop_is_outside; - bool current_is_inside = loop_is_inside; + bool collapse = true; do { const auto& prev_expr = *loop_end_pos; loop_end_pos++; @@ -60,29 +60,33 @@ bool MarkLoops::run(LinearIR& linear_ir) { // If iterator is the last, we should finish Loop const auto& current_expr = *loop_end_pos; const auto& current_node = current_expr->get_node(); - if (ov::is_type(current_node) || // Softmax is marked in decomposition - ov::is_type(current_node) || + if (ov::is_type(current_node) || ov::is_type(current_node)) break; - const auto& ins = loop_end_pos->get()->get_inputs(); - current_is_inside = std::all_of(ins.begin(), ins.end(), - [&loop_inner_layout, &loop_inner_subtensor](const TensorDescriptorPtr& td) { - return td->get_layout() == loop_inner_layout && - td->get_subtensor() == loop_inner_subtensor; }); - // If the next expr isn't real customer of prev expr we should finish Loop - auto connected = [&](const TensorDescriptorPtr& td) {return linear_ir.get_expr_by_output(td).expr == prev_expr;}; - if (current_is_inside && 
std::none_of(ins.begin(), ins.end(), connected)) - break; - - current_is_outside = current_expr->is_outside_loop(); - } while (current_is_inside == loop_is_inside && current_is_outside == loop_is_outside); - - if (loop_is_inside) - loop_manager->mark_loop(linear_ir, loop_begin_pos, loop_end_pos, loop_depth, m_vector_size); - else if (loop_is_outside) - loop_manager->skipped_mark(loop_begin_pos, loop_end_pos, loop_depth); - + // We finish Loop if + // - the next expr isn't real consumer + // - the is conflict between the corresponding ports + bool is_connected = false; + bool is_conflicted = false; + for (size_t i = 0; i < prev_expr->get_output_count(); ++i) { + const auto& loop_tensor = prev_expr->get_output_tensor(i); + const auto consumers = loop_tensor->get_consumers(); + const auto found = std::find_if(consumers.begin(), consumers.end(), [&loop_end_pos](const ExpressionPort& consumer) { + return consumer.get_expr() == *loop_end_pos; + }); + if (found != consumers.end()) { + if (are_conflicted(*found, loop_tensor->get_source())) { + is_conflicted = true; + break; + } + is_connected = true; + } + } + collapse = is_connected && !is_conflicted; + } while (collapse); + + loop_manager->mark_loop(loop_begin_pos, loop_end_pos, loop_depth, m_vector_size); expr_it = std::prev(loop_end_pos); } diff --git a/src/common/snippets/src/lowered/pass/move_result_out_from_loop.cpp b/src/common/snippets/src/lowered/pass/move_result_out_from_loop.cpp index 82a73e6328d7cf..c44cb6c6feb03f 100644 --- a/src/common/snippets/src/lowered/pass/move_result_out_from_loop.cpp +++ b/src/common/snippets/src/lowered/pass/move_result_out_from_loop.cpp @@ -31,8 +31,8 @@ bool MoveResultOutOfLoop::run(LinearIR& linear_ir) { continue; } - const auto input_td = expr->get_inputs().front(); - const auto parent_expr = linear_ir.get_expr_by_output(input_td).expr; + const auto& input_tensor = expr->get_input_tensor(0); + const auto& parent_expr = input_tensor->get_source().get_expr(); const auto 
parent_loop_ids = parent_expr->get_loop_ids(); int outer_loop_id = static_cast(parent_loop_ids.size()) - 1; for (; outer_loop_id >= 0; --outer_loop_id) { diff --git a/src/common/snippets/src/lowered/pass/move_scalar_to_consumer.cpp b/src/common/snippets/src/lowered/pass/move_scalar_to_consumer.cpp index 808530982446e3..88961847fe1ce6 100644 --- a/src/common/snippets/src/lowered/pass/move_scalar_to_consumer.cpp +++ b/src/common/snippets/src/lowered/pass/move_scalar_to_consumer.cpp @@ -25,11 +25,10 @@ bool MoveScalarToConsumer::run(LinearIR& linear_ir) { for (auto expr_it = linear_ir.rbegin(); expr_it != linear_ir.rend(); expr_it++) { const auto expr = expr_it->get(); if (ov::is_type(expr->get_node())) { - const auto& output = expr->get_outputs().front(); - const auto& consumers = linear_ir.get_exprs_by_input(output); + const auto consumers = expr->get_output_tensor(0)->get_consumers(); OPENVINO_ASSERT(consumers.size() == 1, "Scalar expression is expected to have a single consumer"); - const auto& consumer_expr = consumers.begin()->expr; + const auto& consumer_expr = consumers.begin()->get_expr(); // Move something only if consumer is not already the next one (previous since the iterator is a reverse one) auto forward_it = std::prev(expr_it.base()); if (consumer_expr != *std::next(forward_it)) { diff --git a/src/common/snippets/src/lowered/pass/propagate_layout.cpp b/src/common/snippets/src/lowered/pass/propagate_layout.cpp index 85c3facb9e7d2a..3a12b59a8e173b 100644 --- a/src/common/snippets/src/lowered/pass/propagate_layout.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_layout.cpp @@ -16,43 +16,44 @@ namespace pass { bool PropagateLayout::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::PropagateLayout") - const auto& io_ops = linear_ir.get_IO_ops(); - auto io_ops_it = io_ops.begin(); + if (linear_ir.empty()) + return false; + for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); 
expr_it++) { - if (*expr_it == *io_ops_it) { - const auto& expr = io_ops_it->get(); - io_ops_it++; - const bool is_input = expr->get_type() == IOExpression::io_type::INPUT; - const auto& tds = is_input ? expr->get_outputs() : expr->get_inputs(); - if (tds.size() != 1) - OPENVINO_THROW("Parameter/Results should have exactly one output/input"); - const auto& target_td = tds[0]; - // If input - we should be looking downstream, if output - upstream - if (is_input) { - const auto& child_exprs_inputs = linear_ir.get_exprs_by_input(target_td); - // Note that here we consider only the first child (which is usually load), - // but often there is another child - LoopEnd - std::vector child_layout{}; - for (const auto& child_input : child_exprs_inputs) { - const auto child = child_input.expr; - const auto& n = child->get_node(); - if (is_type(n) || is_type(n)) { - // Note: this limitation could be relaxed to multiple ops, - // but all of them must have the same shape and layout - if (!child_layout.empty() && child->get_outputs().front()->get_layout() != child_layout) - OPENVINO_THROW("All children of an input expression must have the same layout"); - child_layout = child->get_outputs().front()->get_layout(); - } - } - if (!child_layout.empty()) { - auto new_td = TensorDescriptor(target_td.get()->get_tensor(), target_td.get()->get_subtensor(), - child_layout); - (*target_td) = new_td; + const auto& expr = *expr_it; + const auto io_expr = std::dynamic_pointer_cast(expr); + if (!io_expr) + continue; + + const bool is_input = io_expr->get_type() == IOExpression::io_type::INPUT; + const auto& tds = is_input ? 
expr->get_output_tensors() : expr->get_input_tensors(); + if (tds.size() != 1) + OPENVINO_THROW("Parameter/Results should have exactly one output/input"); + + // If input - we should be looking downstream, if output - upstream + const auto& target_tensor = tds.front(); + if (is_input) { + const auto consumer_inputs = target_tensor->get_consumers(); + // Note that here we consider only the first child (which is usually load), + // but often there is another child - LoopEnd + std::set> child_layouts; + for (const auto& child_input : consumer_inputs) { + const auto& child = child_input.get_expr(); + const auto port = child_input.get_index(); + const auto& n = child->get_node(); + const auto ma = ov::as_type_ptr(n); + if (ma && ma->is_memory_access_input_port(port)) { + child_layouts.insert(child_input.get_descriptor_ptr()->get_layout()); } } + OPENVINO_ASSERT(child_layouts.size() == 1, "All children of an input expression must have the same layout"); + io_expr->get_output_port_descriptor(0)->set_layout(*child_layouts.begin()); + } else { + io_expr->get_input_port_descriptor(0)->set_layout(target_tensor->get_source().get_descriptor_ptr()->get_layout()); } } -return true; + + return true; } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/reset_buffers.cpp b/src/common/snippets/src/lowered/pass/reset_buffers.cpp index 89dad68eb0ed5d..7da95d71b9079d 100644 --- a/src/common/snippets/src/lowered/pass/reset_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/reset_buffers.cpp @@ -18,14 +18,14 @@ bool ResetBuffers::reuse_buffer_increments(const LinearIR& linear_ir, const Expr if (!loop_end) return false; - const auto loop_tds = loop_end_expr->get_inputs(); + const auto loop_tds = loop_end_expr->get_input_tensors(); const auto input_count = loop_end->get_input_num(); const auto output_count = loop_end->get_output_num(); std::set resetting_buffers; std::set buffers_ids; for (size_t i = 0; i < input_count; ++i) { - const auto parent_output = 
linear_ir.get_expr_by_output(loop_tds[i]).expr; + const auto& parent_output = loop_tds[i]->get_source().get_expr(); if (const auto buffer = ov::as_type_ptr(parent_output->get_node())) { // If Buffer is missed in set, Just save - it's first meeting if (buffers_ids.count(buffer->get_id()) == 0) { @@ -37,11 +37,11 @@ bool ResetBuffers::reuse_buffer_increments(const LinearIR& linear_ir, const Expr } } for (size_t i = 0; i < output_count; ++i) { - const auto consumer_inputs = linear_ir.get_exprs_by_input(loop_tds[input_count + i]); + const auto consumer_inputs = loop_tds[input_count + i]->get_consumers(); size_t buffer_count = 0; size_t loop_count = 0; for (const auto& consumer_input : consumer_inputs) { - const auto& child_node = consumer_input.expr->get_node(); + const auto& child_node = consumer_input.get_expr()->get_node(); if (const auto buffer = ov::as_type_ptr(child_node)) { buffer_count++; // If Buffer is missed in set, Just save - it's first meeting diff --git a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp index b491dfe1172fce..576f2915dded4d 100644 --- a/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp +++ b/src/common/snippets/src/lowered/pass/softmax_decomposition.cpp @@ -35,20 +35,21 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { const auto& pm = matcher->get_pattern_map(); const auto softmax = pm.at(match_softmax); const auto softmax_expr = *expr_it; - const auto input_tds = softmax_expr->get_inputs(); - const auto output_tds = softmax_expr->get_outputs(); - const auto tensor_out = output_tds.front()->get_tensor(); - const auto subtensor_in = input_tds.front()->get_subtensor(); + const auto softmax_loop_ids = softmax_expr->get_loop_ids(); + const auto& input_tensor = softmax_expr->get_input_tensor(0); + const auto& output_tensor = softmax_expr->get_output_tensor(0); + const auto tensor_out = softmax_expr->get_output_port_descriptor(0)->get_shape(); 
const auto inner_work_amount = *(tensor_out.rbegin()); - const auto outer_work_amount = *(tensor_out.rbegin() + 1); expr_it = linear_ir.erase(expr_it); // Remove Softmax std::vector outer_exprs; // We need an iterator to the inserted element - auto push_node = [&linear_ir, &expr_it](const std::shared_ptr& n) { - return std::make_pair(linear_ir.insert(expr_it, n), n); + auto push_node = [&linear_ir, &expr_it, &softmax_loop_ids](const std::shared_ptr& n) { + const auto expr = linear_ir.insert(expr_it, n); + (*expr)->set_loop_ids(softmax_loop_ids); + return std::make_pair(expr, n); }; // Note: VectorBuffer is a special case, since it should go before the initial Load. So we handle it separately @@ -61,10 +62,10 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { outer_exprs.push_back(*horizon_max.first); // Markup of ReduceMax Loop - loop_manager->mark_loop(linear_ir, max.first, horizon_max.first, 1, inner_work_amount, m_vector_size, - std::vector{(*max.first)->input_port(0), - (*max.first)->input_port(1)}, - std::vector{(*max.first)->output_port(0)}); + loop_manager->mark_loop(max.first, horizon_max.first, 1, inner_work_amount, m_vector_size, + std::vector{(*max.first)->get_input_port(0), + (*max.first)->get_input_port(1)}, + std::vector{(*max.first)->get_output_port(0)}); const auto broadcast_horizon_max = push_node( std::make_shared(horizon_max.second, horizon_max.second->get_input_partial_shape(0))); @@ -81,12 +82,12 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { outer_exprs.push_back(*horizon_sum.first); // Markup of ReduceMax Loop - loop_manager->mark_loop(linear_ir, sub.first, horizon_sum.first, 1, inner_work_amount, m_vector_size, - std::vector{(*sub.first)->input_port(0), - (*sub.first)->input_port(1), - (*sum.first)->input_port(1)}, - std::vector{(*exp.first)->output_port(0), - (*sum.first)->output_port(0)}); + loop_manager->mark_loop(sub.first, horizon_sum.first, 1, inner_work_amount, m_vector_size, + 
std::vector{(*sub.first)->get_input_port(0), + (*sub.first)->get_input_port(1), + (*sum.first)->get_input_port(1)}, + std::vector{(*exp.first)->get_output_port(0), + (*sum.first)->get_output_port(0)}); // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop const auto pow = push_node(std::make_shared(horizon_sum.second, -1.f)); @@ -97,27 +98,43 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) { // Mul (pseudo-Divide loop) const auto mul = push_node(std::make_shared(exp.second, broadcast_pow.second)); - // Transfer original TensorDescriptors - linear_ir.replace_input(*max.first, 0, input_tds.front()); - linear_ir.replace_input(*sub.first, 0, input_tds.front()); - linear_ir.replace_output(*mul.first, 0, output_tds.front()); + // Transfer original ExpressionPorts + linear_ir.replace_input((*max.first)->get_input_port(0), input_tensor); + linear_ir.replace_input((*sub.first)->get_input_port(0), input_tensor); + linear_ir.replace_input(output_tensor->get_consumers(), (*mul.first)->get_output_tensor(0)); // Markup of Mul Loop - loop_manager->mark_loop(linear_ir, mul.first, expr_it, 1, inner_work_amount, m_vector_size, - std::vector{(*mul.first)->input_port(0), - (*mul.first)->input_port(1)}, - std::vector{(*mul.first)->output_port(0)}); + loop_manager->mark_loop(mul.first, expr_it, 1, inner_work_amount, m_vector_size, + std::vector{(*mul.first)->get_input_port(0), + (*mul.first)->get_input_port(1)}, + std::vector{(*mul.first)->get_output_port(0)}); // Markup inner loop for outside expression with null loop id for (const auto& expr : outer_exprs) { expr->set_loop_id(Expression::LOOP_NULL_ID, 1); } - // Outer Loop - loop_manager->mark_loop(linear_ir, vector_buffer_max.first, expr_it, 0, outer_work_amount, 1, - std::vector{(*max.first)->input_port(0), - (*sub.first)->input_port(0)}, - std::vector{(*mul.first)->output_port(0)}); + auto update_loop_bounds = [&softmax_expr](std::vector& points, + const std::vector& 
new_points, + const LinearIR::LoopManager::LoopInfoPtr& loop_info) { + auto entry_found = std::find_if(points.begin(), points.end(), [&softmax_expr](const ExpressionPort& desc) { + return desc.get_expr() == softmax_expr; + }); + if (entry_found != points.end()) { + entry_found = points.erase(entry_found); + points.insert(entry_found, new_points.begin(), new_points.end()); + } + }; + + // Update Loop info for outer loops + for (auto loop_id : softmax_loop_ids) { + if (loop_id == Expression::LOOP_NULL_ID) + continue; + const auto loop_info = loop_manager->get_loop_info(loop_id); + update_loop_bounds(loop_info->entry_exprs, std::vector{(*max.first)->get_input_port(0), + (*sub.first)->get_input_port(0)}, loop_info); + update_loop_bounds(loop_info->exit_exprs, std::vector{(*mul.first)->get_output_port(0)}, loop_info); + } /* =========================================== */ diff --git a/src/common/snippets/src/lowered/pass/vector_to_scalar.cpp b/src/common/snippets/src/lowered/pass/vector_to_scalar.cpp index 41335b74e7be70..320c9fdb5af9ad 100644 --- a/src/common/snippets/src/lowered/pass/vector_to_scalar.cpp +++ b/src/common/snippets/src/lowered/pass/vector_to_scalar.cpp @@ -19,14 +19,15 @@ bool SetScalarCountForLoadStore::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetScalarCountForLoadStore") bool modified = false; for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) { - const auto& op = expr_it->get()->get_node(); + const auto& expr = *expr_it; + const auto& op = expr->get_node(); const auto load = ov::as_type_ptr(op); const auto store = ov::as_type_ptr(op); if (load || store) { - const auto td = load ? (*expr_it)->get_inputs().front() : - (*expr_it)->get_outputs().front(); - const auto& layout = td->get_layout(); - const auto& tensor_shape = td->get_tensor(); + const auto& layout = load ? 
expr->get_input_port_descriptor(0)->get_layout() + : expr->get_output_port_descriptor(0)->get_layout(); + const auto& tensor_shape = load ? expr->get_input_port_descriptor(0)->get_shape() + : expr->get_output_port_descriptor(0)->get_shape(); // Find last dimension by layout const auto last_dim_idx = std::find(layout.begin(), layout.end(), layout.size() - 1); OPENVINO_ASSERT(last_dim_idx != layout.end(), "Load/Store expression have incorrect layout"); diff --git a/src/common/snippets/src/lowered/port_descriptor.cpp b/src/common/snippets/src/lowered/port_descriptor.cpp new file mode 100644 index 00000000000000..9b3591660eb720 --- /dev/null +++ b/src/common/snippets/src/lowered/port_descriptor.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/port_descriptor.hpp" + +namespace ngraph { +namespace snippets { +namespace lowered { + +size_t PortDescriptor::ServiceDimensions::FULL_DIM = SIZE_MAX; + +PortDescriptor::PortDescriptor(const ov::Input& in, std::vector subtensor_shape, std::vector layout) + : PortDescriptor(ov::Input(in.get_node(), in.get_index()), std::move(subtensor_shape), std::move(layout)) {} +PortDescriptor::PortDescriptor(const ov::Input& in, std::vector subtensor_shape, std::vector layout) + : PortDescriptor(in.get_shape(), std::move(subtensor_shape), std::move(layout)) {} + +PortDescriptor::PortDescriptor(const ov::Output& out, std::vector subtensor_shape, std::vector layout) + : PortDescriptor(ov::Output(out.get_node(), out.get_index()), std::move(subtensor_shape), std::move(layout)) {} +PortDescriptor::PortDescriptor(const ov::Output& out, std::vector subtensor_shape, std::vector layout) + : PortDescriptor(out.get_shape(), std::move(subtensor_shape), std::move(layout)) {} + +PortDescriptor::PortDescriptor(std::vector shape, std::vector subtensor_shape, std::vector layout) + : m_tensor_shape(std::move(shape)), m_layout(std::move(layout)), 
m_subtensor_shape(std::move(subtensor_shape)) { + validate_arguments(); +} + +void PortDescriptor::validate_arguments() { + if (!m_tensor_shape.empty() && m_layout.empty()) { + m_layout.resize(m_tensor_shape.size()); + // NCHW layout by default + std::iota(m_layout.begin(), m_layout.end(), 0); + } else if (m_layout.size() != m_tensor_shape.size()) { + OPENVINO_THROW("Snippets tensor descriptor: Layout size must be equal to the shape size"); + } +} + +PortDescriptorPtr PortDescriptor::clone() const { + return std::make_shared(m_tensor_shape, m_subtensor_shape, m_layout); +} + +std::string PortDescriptor::serialize() const { + std::stringstream ss; + ss << m_tensor_shape.size() << " "; + for (auto val : m_tensor_shape) + ss << val << " "; + ss << m_subtensor_shape.size() << " "; + for (auto val : m_subtensor_shape) + ss << val << " "; + ss << m_layout.size() << " "; + for (auto val : m_layout) + ss << val << " "; + return ss.str(); +} +bool operator==(const PortDescriptor& lhs, const PortDescriptor& rhs) { + return lhs.m_tensor_shape == rhs.m_tensor_shape && + lhs.m_layout == rhs.m_layout && + lhs.m_subtensor_shape == rhs.m_subtensor_shape; +} + +void PortManager::init_default(std::vector& in_descs, std::vector& out_descs, const std::shared_ptr& node) { + in_descs.resize(node->get_input_size()); + out_descs.resize(node->get_output_size()); + for (size_t i = 0; i < node->get_input_size(); ++i) { + in_descs[i] = std::make_shared(node->input(i)); + } + for (size_t i = 0; i < node->get_output_size(); ++i) { + out_descs[i] = std::make_shared(node->output(i)); + } +} + +void PortManager::set_port_descriptor_ptr(const ov::Input& in, const PortDescriptorPtr& desc) { + const auto& node = in.get_node()->shared_from_this(); + auto& rt_info = node->get_rt_info(); + const auto& key = PortDescriptorVectorAttribute::get_type_info_static(); + const auto& found = rt_info.find(key); + if (found == rt_info.end()) { + std::vector in_descs, out_descs; + init_default(in_descs, out_descs, 
node); + in_descs[in.get_index()] = desc; + rt_info[key] = PortDescriptorVectorAttribute(in_descs, out_descs); + } else { + auto& in_descs = found->second.as().inputs; + if (in_descs.size() != node->get_input_size()) + OPENVINO_THROW("Set input port descriptor is failed: incorrect count"); + in_descs[in.get_index()] = desc; + } +} + +void PortManager::set_port_descriptor_ptr(const ov::Output& out, const PortDescriptorPtr& desc) { + const auto& node = out.get_node_shared_ptr(); + auto& rt_info = node->get_rt_info(); + const auto& key = PortDescriptorVectorAttribute::get_type_info_static(); + const auto& found = rt_info.find(key); + if (found == rt_info.end()) { + std::vector in_descs, out_descs; + init_default(in_descs, out_descs, node); + out_descs[out.get_index()] = desc; + rt_info[key] = PortDescriptorVectorAttribute(in_descs, out_descs); + } else { + auto& out_descs = found->second.as().outputs; + if (out_descs.size() != node->get_output_size()) + OPENVINO_THROW("Set output port descriptor is failed: incorrect count"); + out_descs[out.get_index()] = desc; + } +} + +PortDescriptorPtr PortManager::get_port_descriptor_ptr(const ov::Input& in) { + return get_port_descriptor_ptr(ov::Input(in.get_node(), in.get_index())); +} +PortDescriptorPtr PortManager::get_port_descriptor_ptr(const ov::Input& in) { + const auto& node = in.get_node(); + auto& rt_info = node->get_rt_info(); + const auto& key = PortDescriptorVectorAttribute::get_type_info_static(); + const auto& found = rt_info.find(key); + if (found == rt_info.end()) { + return std::make_shared(in); + } + const auto& in_descs = found->second.as().inputs; + if (in_descs.size() != node->get_input_size()) + OPENVINO_THROW("Get input port descriptor is failed: incorrect count"); + return in_descs[in.get_index()]; +} + +PortDescriptorPtr PortManager::get_port_descriptor_ptr(const Output& out) { + return get_port_descriptor_ptr(ov::Output(out.get_node(), out.get_index())); +} +PortDescriptorPtr 
PortManager::get_port_descriptor_ptr(const Output& out) { + const auto& node = out.get_node(); + const auto& rt_info = node->get_rt_info(); + const auto& key = PortDescriptorVectorAttribute::get_type_info_static(); + const auto& found = rt_info.find(key); + if (found == rt_info.end()) { + return std::make_shared(out); + } + const auto& out_descs = found->second.as().outputs; + if (out_descs.size() != node->get_output_size()) + OPENVINO_THROW("Get output port descriptor is failed: incorrect count"); + return out_descs[out.get_index()]; +} +} // namespace lowered +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/lowered/tensor.cpp b/src/common/snippets/src/lowered/tensor.cpp new file mode 100644 index 00000000000000..866e58a49ee021 --- /dev/null +++ b/src/common/snippets/src/lowered/tensor.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/tensor.hpp" + +#include +#include "snippets/utils.hpp" + + +namespace ngraph { +namespace snippets { +namespace lowered { + +Tensor::Tensor(ExpressionPort source_descriptor, const std::set& consumer_descriptors) + : m_source_port(std::move(source_descriptor)), m_consumer_ports(consumer_descriptors) {} + +std::set::const_iterator Tensor::find_consumer(const ExpressionPort& consumer) const { + // Note: Find by shared ptr and index port is enough since these parameters must be unique + return std::find_if(m_consumer_ports.cbegin(), m_consumer_ports.cend(), + [&consumer](const ExpressionPort& td) { + return consumer.get_expr() == td.get_expr() && consumer.get_index() == td.get_index(); + }); +} + +std::set::iterator Tensor::find_consumer(const ExpressionPort& consumer) { + // Note: Find by shared ptr and index port is enough since these parameters must be unique + return std::find_if(m_consumer_ports.begin(), m_consumer_ports.end(), + [&consumer](const ExpressionPort& td) { + return consumer.get_expr() == 
td.get_expr() && consumer.get_index() == td.get_index(); + }); +} + +bool Tensor::found_consumer(const ExpressionPort& consumer) const { + return find_consumer(consumer) != m_consumer_ports.end(); +} + +void Tensor::add_consumer(const ExpressionPort& consumer) { + OPENVINO_ASSERT(!found_consumer(consumer), "Consumer has been already added to Tensor!"); + const auto res = m_consumer_ports.insert(consumer); + OPENVINO_ASSERT(res.second, "Consumer hasn't been added to the Tensor"); +} + +void Tensor::remove_consumer(const ExpressionPort& consumer) { + const auto& found = find_consumer(consumer); + OPENVINO_ASSERT(found != m_consumer_ports.end(), "Consumer is missed in Tensor!"); + m_consumer_ports.erase(found); +} + +}// namespace lowered +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index 4c9c2c497fb9a0..b647835abe9e04 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -13,21 +13,40 @@ namespace snippets { namespace op { Brgemm::Brgemm(const Output& A, const Output& B, - const size_t offset_a, const size_t offset_b, const size_t offset_c) : MemoryAccess({A, B}, std::set{0, 1}, std::set{0}) { + const size_t offset_a, const size_t offset_b, const size_t offset_c, + std::vector layout_a, std::vector layout_b, std::vector layout_c) + : MemoryAccess({A, B}, std::set{0, 1}, std::set{0}) { set_output_size(1); set_input_offset(offset_a, 0); set_input_offset(offset_b, 1); set_output_offset(offset_c, 0); - constructor_validate_and_infer_types(); + custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); } -void Brgemm::validate_and_infer_types() { - INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); +void Brgemm::custom_constructor_validate_and_infer_types(std::vector layout_a, std::vector layout_b, std::vector layout_c) { + INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types); 
+ validate_inputs(); + + // During ctor call, Brgemm doesn't know his port descriptors. + // So we use explicit layouts from parameters + const auto planar_input_shapes = + std::vector{ ngraph::snippets::utils::get_reordered_planar_shape(get_input_partial_shape(0), layout_a), + ngraph::snippets::utils::get_reordered_planar_shape(get_input_partial_shape(1), layout_b) }; + auto output_shape = get_output_partial_shape(planar_input_shapes); + set_output_type(0, get_output_type(), ngraph::snippets::utils::get_reordered_planar_shape(output_shape, layout_c)); +} + +void Brgemm::validate_inputs() const { // If no leading dimensions are provided, assume dense row-major inputs-outputs NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(), "Brgemm currently supports only static shapes."); +} + +void Brgemm::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); + validate_inputs(); - const auto planar_input_shapes = get_planar_input_shapes(input_values()); + const auto planar_input_shapes = get_planar_input_shapes(inputs()); auto output_shape = get_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), get_planar_output_shape(output_shape)); } @@ -35,7 +54,11 @@ void Brgemm::validate_and_infer_types() { std::shared_ptr Brgemm::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1), get_offset_a(), get_offset_b(), get_offset_c()); + return std::make_shared(new_args.at(0), new_args.at(1), + get_offset_a(), get_offset_b(), get_offset_c(), + lowered::PortManager::get_port_descriptor_ptr(input(0))->get_layout(), + lowered::PortManager::get_port_descriptor_ptr(input(1))->get_layout(), + lowered::PortManager::get_port_descriptor_ptr(output(0))->get_layout()); } ov::element::Type Brgemm::get_output_type() const { @@ -56,18 +79,22 
@@ ov::element::Type Brgemm::get_output_type() const { } } -std::vector Brgemm::get_planar_input_shapes(const std::vector>& inputs) const { +std::vector Brgemm::get_planar_input_shapes(const std::vector>& inputs) const { OPENVINO_ASSERT(inputs.size() == 2, "Brgemm::get_planar_input_shapes() expects 2 inputs"); return { utils::get_port_planar_shape(inputs[0]), utils::get_port_planar_shape(inputs[1]) }; } ov::PartialShape Brgemm::get_planar_output_shape(const ov::PartialShape& output_shape) const { // This method can be safely called from validate_and_infer_types() before output creation - const auto& rt_info = get_rt_info(); - auto it = rt_info.find(TensorDescriptorPtrVectorAttribute::get_type_info_static()); - if (it != rt_info.end()) { - const auto& td = it->second.as().m_value[0]; - return utils::get_reordered_planar_shape(output_shape, td->get_layout()); + const auto& key = lowered::PortDescriptorVectorAttribute::get_type_info_static(); + auto& rt_info = get_rt_info(); + const auto& found = rt_info.find(key); + if (found != rt_info.end()) { + const auto& out_descs = found->second.as().outputs; + if (out_descs.size() != get_output_size()) + OPENVINO_THROW("Get output port descriptor is failed: incorrect count"); + const auto& port_desc = out_descs[0]; + return utils::get_reordered_planar_shape(output_shape, port_desc->get_layout()); } return output_shape; } diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index da026c03a57e1b..8e95105dfa41b0 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -16,8 +16,9 @@ #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/matmul_to_brgemm.hpp" #include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/pass/set_softmax_ports.hpp" #include "snippets/utils.hpp" -#include "snippets/tensor_descriptor.hpp" +#include "snippets/lowered/port_descriptor.hpp" #include 
"transformations/common_optimizations/nop_elimination.hpp" #include "transformations/utils/utils.hpp" @@ -64,8 +65,6 @@ void snippets::op::Subgraph::init_config() { config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops || is_domain_sensitive_op(op); } - // Domain sensitive ops are decomposed with explicit Loops. So, we should explicitly insert Loops in Subgraph if it contains these ops - config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops; } auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t { @@ -464,6 +463,7 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); } manager.register_pass(); manager.register_pass(); @@ -529,7 +529,6 @@ snippets::Schedule snippets::op::Subgraph::generate( lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops; lowering_config.m_loop_depth = tileRank; lowering_config.m_master_shape = master_shape; - lowering_config.m_explicit_loop_insertion = config.m_explicit_loop_insertion; const auto& lowering_result = m_generator->generate(body_ptr(), lowering_config, compile_params); ngraph::snippets::code ptr = lowering_result.binary_code; m_buffer_scratchpad = lowering_result.buffer_scratchpad_size; diff --git a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp index 3f6d2a99d5b2a6..25954e66ccb8ed 100644 --- a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp +++ b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp @@ -17,46 +17,47 @@ namespace ngraph { namespace snippets { namespace pass { + const std::set> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}}; + +bool FuseTransposeBrgemm::is_supported_transpose(const Output& transpose_port) { + const auto transpose_node = transpose_port.get_node_shared_ptr(); + // it's safe to do so because of the patterns we used. 
alternatively we can do it through pattern_values_map + const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); + // if Transpose in and out layout is not empty => something was already fused on this port + auto default_layout = std::vector(transpose_port.get_shape().size()); + std::iota(default_layout.begin(), default_layout.end(), 0);// NCHW layout by default + if (lowered::PortManager::get_port_descriptor_ptr(transpose_port)->get_layout() != default_layout || + lowered::PortManager::get_port_descriptor_ptr(transpose_node->input_value(0))->get_layout() != default_layout) + return false; + const auto& transpose_order = constant->cast_vector(); + // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way + // to calc them non-default way is to set Parameter rt_info field. This limitation can be removed if + // the rt_info is properly propagated to the corresponding parameter + return is_type(transpose_node->get_input_node_shared_ptr(0)) && + supported_cases.count(transpose_order) != 0; +} + FuseTransposeBrgemm::FuseTransposeBrgemm() { MATCHER_SCOPE(FuseTransposeBrgemm); - auto transpose_is_supported = [](const Output& transpose_port) { - const auto transpose_node = transpose_port.get_node_shared_ptr(); - // it's safe to do so because of the patterns we used. alternatively we can do it through pattern_values_map - const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); - // if Transpose in and out layout is not empty => something was already fused on this port - if (!utils::get_node_output_layout(transpose_node).empty() || - !utils::get_node_output_layout(transpose_node->get_input_node_shared_ptr(0)).empty()) - return false; - const auto& transpose_order = constant->cast_vector(); - // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way - // to calc them non-default way is to set Parameter rt_info field. 
This limitation can be removed if - // the rt_info is properly propagated to the corresponding parameter - if (!is_type(transpose_node->get_input_node_shared_ptr(0)) || - supported_cases.count(transpose_order) == 0) - return false; - return true; - }; auto constant = pattern::wrap_type(); - auto transpose = pattern::wrap_type({pattern::any_input(), constant}, transpose_is_supported); + auto transpose = pattern::wrap_type({pattern::any_input(), constant}, is_supported_transpose); auto transpose_matcher = std::make_shared(transpose); - auto brgemm_any = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + // Pattern 0: Transpose on 0-th input of MatMul auto brgemm_in0 = pattern::wrap_type({transpose, pattern::any_input()}); + + // Pattern 1: Transpose on 1-st input of MatMul auto brgemm_in1 = pattern::wrap_type({pattern::any_input(), transpose}); - auto brgemm_out0 = pattern::wrap_type({brgemm_any, constant}); - auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, brgemm_out0}); + + // Pattern 2: Transpose on output of MatMul + auto brgemm_out = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + auto transpose2 = pattern::wrap_type({brgemm_out, constant}); + + auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, transpose2}); auto callback = [=](pattern::Matcher& m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "FuseTransposeBrgemm") - auto set_layout_from_order = [](const std::shared_ptr& node, const ov::Output& port) { - const auto& const_order = as_type_ptr(node->get_input_node_shared_ptr(1)); - const auto& td = ngraph::snippets::get_tensor_descriptor_ptr(port); - const auto& tensor = td->get_tensor(); - const auto& subtensor = td->get_subtensor(); - std::vector layout = const_order->cast_vector(); - ngraph::snippets::set_tensor_descriptor_ptr(port, std::make_shared(tensor, subtensor, layout)); - }; auto brgemm = as_type_ptr(m.get_match_root()); // 
Transpose on the Brgemm's output @@ -64,26 +65,36 @@ FuseTransposeBrgemm::FuseTransposeBrgemm() { brgemm = as_type_ptr(m.get_match_root()->get_input_node_shared_ptr(0)); const auto& brgemm_out = brgemm->output(0); const auto& transpose_out = m.get_match_value(); + const auto& const_order = ov::as_type_ptr(transpose_out.get_node_shared_ptr()->get_input_node_shared_ptr(1)); + const auto& original_port = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(brgemm_out); + original_port->set_shape(transpose_out.get_shape()); + original_port->set_layout(const_order->cast_vector()); for (const auto& in : transpose_out.get_target_inputs()) in.replace_source_output(brgemm->output(0)); - set_layout_from_order(as_type_ptr(transpose_out.get_node_shared_ptr()), brgemm_out); } + for (size_t i = 0; i < brgemm->get_input_size(); i++) { - const auto& in_value = brgemm->input_value(i); + const auto& in = brgemm->input(i); + const auto& in_value = in.get_source_output(); if (transpose_matcher->match(in_value)) { const auto& transpose = as_type_ptr(in_value.get_node_shared_ptr()); - set_layout_from_order(transpose, transpose->input_value(0)); + const auto& const_order = ov::as_type_ptr(transpose->get_input_node_shared_ptr(1)); brgemm->set_argument(i, transpose->input_value(0)); + const auto& original_port = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(in); + original_port->set_shape(transpose->get_input_shape(0)); + original_port->set_layout(const_order->cast_vector()); } } + // need to run validate_and_infer_types manually: either input shapes were updated or // output Layout was updated (out shape will be updated in validate_and_infer_types()) brgemm->validate_and_infer_types(); return true; }; + register_matcher(std::make_shared(brgemm_or_transpose, matcher_name), callback); } } // namespace pass } // namespace snippets -} // namespace ngraph \ No newline at end of file +} // namespace ngraph diff --git 
a/src/common/snippets/src/pass/matmul_to_brgemm.cpp b/src/common/snippets/src/pass/matmul_to_brgemm.cpp index 42b3775e2536bd..4ceca5802233ed 100644 --- a/src/common/snippets/src/pass/matmul_to_brgemm.cpp +++ b/src/common/snippets/src/pass/matmul_to_brgemm.cpp @@ -9,19 +9,31 @@ #include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" -#include "ngraph/opsets/opset1.hpp" #include "ngraph/rt_info.hpp" -#include +#include "snippets/lowered/port_descriptor.hpp" #include "ngraph/pattern/op/wrap_type.hpp" namespace ngraph { namespace snippets { namespace pass { +void MatMulToBrgemm::init_ports(const std::shared_ptr& brgemm) const { + auto get_subtensor = [](const ov::Shape& shape) { + return std::vector{ lowered::PortDescriptor::ServiceDimensions::FULL_DIM, lowered::PortDescriptor::ServiceDimensions::FULL_DIM }; + }; + for (const auto& input : brgemm->inputs()) { + const auto tensor = input.get_shape(); + const auto subtensor = get_subtensor(tensor); + lowered::PortManager::set_port_descriptor_ptr(input, std::make_shared(tensor, subtensor)); + } + const auto tensor = brgemm->get_output_shape(0); + const auto subtensor = get_subtensor(tensor); + lowered::PortManager::set_port_descriptor_ptr(brgemm->output(0), std::make_shared(tensor, subtensor)); +} + MatMulToBrgemm::MatMulToBrgemm() { MATCHER_SCOPE(MatMulToBrgemm); - auto matmul_pattern = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), - ngraph::pattern::any_input()}); + auto matmul_pattern = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), ngraph::pattern::any_input()}); auto callback = [=](ngraph::pattern::Matcher& m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MatMulToBrgemm") @@ -39,11 +51,7 @@ MatMulToBrgemm::MatMulToBrgemm() { brgemm->set_friendly_name(matmul->get_friendly_name()); ngraph::copy_runtime_info(matmul, nodes); ngraph::replace_node(matmul, nodes.back()); - const std::vector tensor = brgemm->get_output_shape(0); - const std::vector 
subtensor = {tensor[tensor.size() - 2], tensor[tensor.size() - 1]}; - ngraph::snippets::set_tensor_descriptor_ptr(brgemm->output(0), std::make_shared(tensor, subtensor)); - // TODO: At the moment Brgemm is executed outside Loop. When Blocking is supported, remove it - utils::set_outside_loop_value(brgemm, true); + init_ports(brgemm); return true; }; diff --git a/src/common/snippets/src/pass/set_softmax_ports.cpp b/src/common/snippets/src/pass/set_softmax_ports.cpp new file mode 100644 index 00000000000000..09737e69cb4646 --- /dev/null +++ b/src/common/snippets/src/pass/set_softmax_ports.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/set_softmax_ports.hpp" + +#include +#include "snippets/lowered/port_descriptor.hpp" + +#include "ngraph/op/softmax.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" +#include "ngraph/pattern/op/or.hpp" +#include "ngraph/validation_util.hpp" + +using namespace ngraph; + +ngraph::snippets::pass::SetSoftmaxPorts::SetSoftmaxPorts() { + MATCHER_SCOPE(SetSoftmaxPorts); + + auto m_softmax_v1 = ngraph::pattern::wrap_type(); + auto m_softmax_v8 = ngraph::pattern::wrap_type(); + auto m_softmax = std::make_shared(OutputVector{m_softmax_v1, m_softmax_v8}); + + auto callback = [](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SetSoftmaxPorts") + auto root = m.get_match_root(); + + const auto& pshape = root->get_input_partial_shape(0); + if (pshape.is_dynamic()) + return false; + + const auto shape = pshape.get_shape(); + const auto rank = shape.size(); + + int64_t axis; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + OPENVINO_SUPPRESS_DEPRECATED_START + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + OPENVINO_SUPPRESS_DEPRECATED_END + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { 
+ return false; + } + + OPENVINO_ASSERT(axis < static_cast(rank), "Softmax has incorrect axis"); + std::vector subtensor(rank, 1); + for (size_t i = axis; i < rank; ++i) + subtensor[i] = lowered::PortDescriptor::ServiceDimensions::FULL_DIM; + + lowered::PortManager::set_port_descriptor_ptr(root->input(0), std::make_shared(root->input(0), subtensor)); + lowered::PortManager::set_port_descriptor_ptr(root->output(0), std::make_shared(root->output(0), subtensor)); + + return true; + }; + + register_matcher(std::make_shared(m_softmax, matcher_name), callback); +} diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index 08a083558c9760..b71ba728ab5d90 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -5,17 +5,23 @@ #include #include #include -#include +#include "snippets/lowered/port_descriptor.hpp" #include #include #include -const std::set> ngraph::snippets::pass::TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; -ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { + +namespace ngraph { +namespace snippets { +namespace pass { + +const std::set> TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; + +TransposeDecomposition::TransposeDecomposition() { MATCHER_SCOPE(TransposeDecomposition); - // todo: we need a special transformation that detects and propagates data access pattern to Parameters and Results - // this is needed to communicate access pattern to the plugin node and op::Kernel - // This is the reason we match only to Parameter, this limitation could be relaxed if we propagate access pattern - // to the appropriate parameter + // Todo: we need a special transformation that detects and propagates data access pattern to Parameters and Results + // this is needed to communicate access pattern to the plugin node and op::Kernel + // This is the reason we match only to 
Parameter, this limitation could be relaxed if we propagate access pattern + // to the appropriate parameter auto match_data = ngraph::pattern::wrap_type(); auto match_order = ngraph::pattern::wrap_type(); auto match_transpose = ngraph::pattern::wrap_type({match_data, match_order}); @@ -23,8 +29,8 @@ ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransposeDecomposition") auto& pattern_to_output = m.get_pattern_value_map(); - const auto transpose = ov::as_type_ptr( - pattern_to_output.at(match_transpose).get_node_shared_ptr()); + const auto& data_input = pattern_to_output.at(match_data); + const auto transpose = ov::as_type_ptr(pattern_to_output.at(match_transpose).get_node_shared_ptr()); const auto order = ov::as_type_ptr(pattern_to_output.at(match_order).get_node_shared_ptr()); if (transformation_callback(transpose) || transpose->is_dynamic()) @@ -34,20 +40,19 @@ ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { if (supported_cases.count(order_value) == 0) return false; - auto data_input = pattern_to_output.at(match_data); - const std::vector& tensor_shape {data_input.get_shape()}; // number of elements that can be processed on every iteration. For 0,1,2,3 -> 0,2,3,1 we can guarantee only scalar access - const std::vector subtensor_shape {1}; + const auto subtensor = std::vector{1}; const auto& layout = order->cast_vector(); + // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. 
- auto load = std::make_shared(data_input, subtensor_shape[0], 0, layout); - auto store = std::make_shared(load, subtensor_shape[0]); - ngraph::snippets::set_tensor_descriptor_ptr(load->output(0), std::make_shared(tensor_shape, subtensor_shape, layout)); - ngraph::snippets::set_tensor_descriptor_ptr(store->output(0), - std::make_shared(store->get_output_shape(0), - std::vector{}, - std::vector{})); + auto load = std::make_shared(data_input, subtensor[0], 0, layout); + auto store = std::make_shared(load, subtensor[0]); + + lowered::PortManager::set_port_descriptor_ptr(load->input(0), std::make_shared(load->get_input_shape(0), subtensor, layout)); + lowered::PortManager::set_port_descriptor_ptr(load->output(0), std::make_shared(load->get_output_shape(0), subtensor)); + lowered::PortManager::set_port_descriptor_ptr(store->input(0), std::make_shared(store->get_input_shape(0), subtensor)); + lowered::PortManager::set_port_descriptor_ptr(store->output(0), std::make_shared(store->get_output_shape(0), subtensor)); for (auto& input : transpose->output(0).get_target_inputs()) { input.replace_source_output(store->output(0)); @@ -59,3 +64,7 @@ ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { auto m = std::make_shared(match_transpose, matcher_name); register_matcher(m, callback); } + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/tensor_descriptor.cpp b/src/common/snippets/src/tensor_descriptor.cpp deleted file mode 100644 index a3182686c80c2a..00000000000000 --- a/src/common/snippets/src/tensor_descriptor.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/tensor_descriptor.hpp" -#include "ngraph/except.hpp" -#include - -namespace ngraph { -namespace snippets { -TensorDescriptor::TensorDescriptor(const Output& out, - std::vector subtensor_shape, - std::vector layout) - : 
TensorDescriptor(ov::Output(out.get_node(), out.get_index()), - std::move(subtensor_shape), - std::move(layout)) { -} - -TensorDescriptor::TensorDescriptor(const Output& out, - std::vector subtensor_shape, - std::vector layout) - : m_layout(std::move(layout)), m_subtensor_shape(std::move(subtensor_shape)) { - const auto& pshape = out.get_partial_shape(); - // Note: this limitation could be relaxed if necessary - if (pshape.is_dynamic()) - OPENVINO_THROW("Snippets tensor descriptor can be created only for static shapes"); - m_tensor_shape = pshape.get_shape(); - validate_arguments(); -} - -TensorDescriptor::TensorDescriptor(std::vector tensor_shape, - std::vector subtensor_shape, - std::vector layout) : m_tensor_shape(std::move(tensor_shape)), - m_layout(std::move(layout)), m_subtensor_shape(std::move(subtensor_shape)) { - validate_arguments(); -} - -void TensorDescriptor::validate_arguments() { - if (!m_tensor_shape.empty() && m_layout.empty()) { - m_layout.resize(m_tensor_shape.size()); - // NCHW layout by default - std::iota(m_layout.begin(), m_layout.end(), 0); - } else if (m_layout.size() != m_tensor_shape.size()) { - OPENVINO_THROW("Snippets tensor descriptor: Layout size must be equal to the shape size"); - } -} - - -TensorDescriptor TensorDescriptor::deserialize(const std::string& serialized_info) { - std::stringstream sinfo(serialized_info); - auto read_values = [](std::stringstream& ss){ - size_t num = 0; - ss >> num; - std::vector res; - for (size_t i = 0; i < num; i++) { - size_t val; - ss >> val; - res.push_back(val); - } - return res; - }; - const auto& tensor_shape = read_values(sinfo); - const auto& subtensor_shape = read_values(sinfo); - const auto& layout = read_values(sinfo); - return {tensor_shape, subtensor_shape, layout}; -} - -std::string TensorDescriptor::serialize() const { - std::stringstream ss; - ss << m_tensor_shape.size() << " "; - for (auto val : m_tensor_shape) - ss << val << " "; - ss << m_subtensor_shape.size() << " "; - for (auto 
val : m_subtensor_shape) - ss << val << " "; - ss << m_layout.size() << " "; - for (auto val : m_layout) - ss << val << " "; - return ss.str(); -} -bool operator==(const TensorDescriptor& lhs, const TensorDescriptor& rhs) { - return lhs.m_tensor_shape == rhs.m_tensor_shape && - lhs.m_layout == rhs.m_layout && - lhs.m_subtensor_shape == rhs.m_subtensor_shape; -} - -std::ostream& operator << (std::ostream& ss, const TensorDescriptor& td) { - auto print_vector = [&ss](const std::vector& data){ - ss << "["; - for (auto i : data) - ss << i << ","; - ss << (data.empty() ? "]" : "\b]"); - }; - ss << "{Tensor: "; - print_vector(td.get_tensor()); - ss << " Subtensor: "; - print_vector(td.get_subtensor()); - ss << " Layout: "; - print_vector(td.get_layout()); - ss << "}"; - return ss; -} - -void set_tensor_descriptor_ptr(const Output& out, const TensorDescriptorPtr& desc) { - const auto& node = out.get_node_shared_ptr(); - auto& rt_info = node->get_rt_info(); - const auto& key = TensorDescriptorPtrVectorAttribute::get_type_info_static(); - const auto& found = rt_info.find(key); - if (found == rt_info.end()) { - std::vector value(node->get_output_size()); - value[out.get_index()] = desc; - rt_info[key] = TensorDescriptorPtrVectorAttribute(value); - } else { - auto& value = found->second.as().m_value; - if (value.size() != node->get_output_size()) - OPENVINO_THROW("Either all or none of Tensor descriptors should be stored in rt_info (set)"); - value[out.get_index()] = desc; - } -} -TensorDescriptorPtr get_tensor_descriptor_ptr(const Output& out) { - return get_tensor_descriptor_ptr(ov::Output(out.get_node(), out.get_index())); -} -TensorDescriptorPtr get_tensor_descriptor_ptr(const Output& out) { - const auto& node = out.get_node_shared_ptr(); - const auto& rt_info = node->get_rt_info(); - auto it = rt_info.find(TensorDescriptorPtrVectorAttribute::get_type_info_static()); - if (it == rt_info.end()) { - return std::make_shared(out); - } - const auto& td_vector = 
it->second.as().m_value; - if (td_vector.size() != node->get_output_size()) - OPENVINO_THROW("Either all or none of Tensor descriptors should be stored in rt_info (get)"); - return td_vector[out.get_index()]; -} -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 789a5e6daeb080..e64aa000028b9b 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -67,27 +67,6 @@ auto get_non_scalar_constant_count_for_fq(const std::shared_ptr get_node_output_layout(const std::shared_ptr& node) { - return get_node_output_layout(node.get()); -} -std::vector get_node_output_layout(const Node* node) { - if (!node) - return {}; - if (node->is_dynamic()) - OPENVINO_THROW("It's illegal to call get_node_output_layout for dynamic nodes"); - auto& rt = node->get_rt_info(); - const auto rinfo = rt.find("Layout"); - if (rinfo != rt.end()) { - std::vector layout(rinfo->second.as>()); - // This might be a little costy, but still useful sanity check. Remove if proved to be unacceptably heavy. 
- std::set unique_elements(layout.begin(), layout.end()); - if (unique_elements.size() < layout.size()) - OPENVINO_THROW("Layout must contain only unique dimension indexes"); - return layout; - } else { - return {}; - } -} ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout) { if (layout.empty()) @@ -106,33 +85,14 @@ ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const return reordered_shape; } -ov::PartialShape get_port_planar_shape(const Output& out) { - const auto& td = ngraph::snippets::get_tensor_descriptor_ptr(out); - return utils::get_reordered_planar_shape(ov::Shape{td->get_tensor()}, td->get_layout()); -} - -void set_transpose_output_layout(const ov::Output& port, const std::shared_ptr& node) { - const auto& const_order = as_type_ptr(node->get_input_node_shared_ptr(1)); - OPENVINO_ASSERT(const_order != nullptr, "Transpose order must be Constant to set layout!"); - set_output_layout(port, const_order->cast_vector()); -} - -void set_output_layout(const ov::Output& port, const std::vector& layout) { - auto& rt_info = port.get_node_shared_ptr()->get_rt_info(); - rt_info["Layout"] = layout; +ov::PartialShape get_port_planar_shape(const Input& in) { + const auto& port = lowered::PortManager::get_port_descriptor_ptr(in); + return utils::get_reordered_planar_shape(ov::Shape{port->get_shape()}, port->get_layout()); } -bool get_outside_loop_value(const std::shared_ptr& node) { - auto& rt_info = node->get_rt_info(); - const auto& found = rt_info.find("snippets::is_outside_loop"); - if (found == rt_info.end()) { - return false; // Default value: Expression should be executed inside - } - return found->second.as(); -} -void set_outside_loop_value(const std::shared_ptr& node, bool is_outside) { - auto& rt_info = node->get_rt_info(); - rt_info["snippets::is_outside_loop"] = is_outside; +ov::PartialShape get_port_planar_shape(const Output& out) { + const auto& port = 
lowered::PortManager::get_port_descriptor_ptr(out); + return utils::get_reordered_planar_shape(ov::Shape{port->get_shape()}, port->get_layout()); } } // namespace utils diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index 731e5c898b7cd7..b69a160b807a6f 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -15,7 +15,7 @@ #include "transformations/snippets/x64/op//brgemm_cpu.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/op/subgraph.hpp" -#include "snippets/tensor_descriptor.hpp" +#include "snippets/lowered/tensor.hpp" using namespace InferenceEngine; using ngraph::snippets::op::Subgraph; @@ -26,7 +26,7 @@ using namespace dnnl::impl::cpu::x64; using ngraph::snippets::lowered::Expression; using ngraph::snippets::lowered::IOExpression; using ngraph::snippets::lowered::ExpressionPtr; -using ngraph::snippets::TensorDescriptorPtr; +using ngraph::snippets::lowered::TensorPtr; namespace ov { namespace intel_cpu { @@ -121,26 +121,26 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: num_inputs = 0; num_outputs = 0; for (const auto& expr : io_exprs) { - TensorDescriptorPtr td {}; + ngraph::snippets::lowered::PortDescriptorPtr desc = nullptr; element::Type etype; switch (expr->get_type()) { case ngraph::snippets::lowered::IOExpression::io_type::INPUT: { - td = expr->get_outputs()[0]; + desc = expr->get_output_port_descriptor(0); etype = expr->get_node()->get_output_element_type(0); num_inputs++; break; } case ngraph::snippets::lowered::IOExpression::io_type::OUTPUT: { num_outputs++; - td = expr->get_inputs()[0]; + desc = expr->get_input_port_descriptor(0); etype = expr->get_node()->get_input_element_type(0); break; } default : { IE_THROW() << "Kernel detected unsupported io_type"; } } - io_shapes.push_back(td->get_tensor()); - 
io_data_layouts.push_back(td->get_layout()); + io_shapes.push_back(desc->get_shape()); + io_data_layouts.push_back(desc->get_layout()); io_data_sizes.push_back(etype.size()); } @@ -222,8 +222,7 @@ void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, siz const size_t offset_rank = jcp.master_shape.size() - 1; //const size_t tile_rank = jcp.tile_rank; std::vector> data_offsets(num_params, std::vector{}); - auto offset_calculation = [=](const std::vector& shape, - const std::vector& layout, const size_t data_size) { + auto offset_calculation = [=](const std::vector& shape, const std::vector& layout, const size_t data_size) { // Strides represent distance between consecutive elements of corresponding dimension. // If a dim size == 1, then the next dim starts immediately and the stride is 0 // case 1: @@ -724,14 +723,11 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: if (brgemm_node->is_dynamic()) IE_THROW() << "Snippets don't support code generation for dynamic Brgemm"; const auto brgemm_copy = brgemm_node->is_with_data_repacking() ? brgemm_node->get_brgemm_copy() : nullptr; - const OutputVector io_values {brgemm_node->input_value(0), - brgemm_copy ? 
brgemm_copy->input_value(0) : brgemm_node->input_value(1), - brgemm_node->output(0)}; + std::vector leading_dimensions; std::vector> io_layouts; - for (const auto& val : io_values) { - const auto& layout = ngraph::snippets::get_tensor_descriptor_ptr(val.get_node_shared_ptr())->get_layout(); - const auto& io_shape = val.get_shape(); + + auto init_scheduling_params = [&](const std::vector& layout, const ov::Shape& io_shape) { if (layout.empty()) { // empty value indicates a planar layout leading_dimensions.push_back(io_shape.back()); @@ -744,17 +740,25 @@ BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: // counting from the end since shape could be prepended with ones const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; if (layout.back() != layout.size() - 1 || num_last_dims < 1) - IE_THROW() << "BrgemmEmitter detected invalid layout values: " << - "check that this shape + layout combination is schedulable"; + IE_THROW() << "BrgemmEmitter detected invalid layout values: check that this shape + layout combination is schedulable"; leading_dimensions.emplace_back( std::accumulate(io_shape.end() - num_last_dims, io_shape.end(), 1, std::multiplies())); io_layouts.push_back(layout); } + }; + + std::vector> brgemm_inputs = {brgemm_node->input(0), + brgemm_copy ? 
brgemm_copy->input(0) : brgemm_node->input(1)}; + for (const auto& input : brgemm_inputs) { + init_scheduling_params(ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input)->get_layout(), + input.get_shape()); } + init_scheduling_params(ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(brgemm_node->output(0))->get_layout(), + brgemm_node->output(0).get_shape()); - const auto& A_shape = io_values[0].get_shape(); + const auto& A_shape = brgemm_node->get_input_shape(0); const auto& A_layout = io_layouts[0]; - const auto& C_shape = io_values[2].get_shape(); + const auto& C_shape = brgemm_node->get_output_shape(0); const auto& C_layout = io_layouts[2]; // We need find original M,N,K having layouts and ordered shapes @@ -1105,7 +1109,7 @@ BrgemmCopyBEmitter::BrgemmCopyBEmitter(dnnl::impl::cpu::x64::jit_generator* h, d if (m_with_comp) m_comp_offset = brgemm_repack->get_offset_compensations(); - const auto& layout = ngraph::snippets::get_tensor_descriptor_ptr(brgemm_repack->get_input_node_shared_ptr(0))->get_layout(); + const auto& layout = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(brgemm_repack->input(0))->get_layout(); const auto& original_shape = brgemm_repack->get_input_shape(0); auto transposed_shape = original_shape; size_t leading_dimension = *(original_shape.rbegin()); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp index 3502586495a512..201ea3d23214b2 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp @@ -13,7 +13,7 @@ using namespace std; using namespace ov; intel_cpu::BrgemmCopyB::BrgemmCopyB(const Output& x, const element::Type src_type, const Type type, - const size_t offset_in, const size_t offset_out0, const size_t offset_out1) + const size_t offset_in, const size_t 
offset_out0, const size_t offset_out1, std::vector layout_input) : ngraph::snippets::op::MemoryAccess({x}, 1, type == Type::WithCompensations ? 2 : 1), m_type(type), m_src_type(src_type) { set_output_size(type == Type::WithCompensations ? 2 : 1); set_input_port_descriptor({0, offset_in}, 0); @@ -21,7 +21,7 @@ intel_cpu::BrgemmCopyB::BrgemmCopyB(const Output& x, const element::Type s if (is_with_compensations()) { set_output_port_descriptor({0, offset_out1}, 1); } - constructor_validate_and_infer_types(); + custom_constructor_validate_and_infer_types(std::move(layout_input)); } bool intel_cpu::BrgemmCopyB::visit_attributes(AttributeVisitor& visitor) { @@ -31,14 +31,27 @@ bool intel_cpu::BrgemmCopyB::visit_attributes(AttributeVisitor& visitor) { return true; } +void intel_cpu::BrgemmCopyB::custom_constructor_validate_and_infer_types(std::vector layout_input) { + INTERNAL_OP_SCOPE(BrgemmRepack_ctor_validate_and_infer_types); + // During ctor call, BrgemmCopyB doesn't know his port descriptors. 
+ // So we use port descs from source inputs + const auto element_type = get_input_element_type(0); + const auto pshape = ngraph::snippets::utils::get_reordered_planar_shape(get_input_partial_shape(0), layout_input); + validate(pshape, element_type); +} + void intel_cpu::BrgemmCopyB::validate_and_infer_types() { INTERNAL_OP_SCOPE(BrgemmRepack_validate_and_infer_types); const auto element_type = get_input_element_type(0); + const auto pshape = ngraph::snippets::utils::get_port_planar_shape(input(0)); + validate(pshape, element_type); +} + +void intel_cpu::BrgemmCopyB::validate(const ov::PartialShape& pshape, const ov::element::Type& element_type) { NGRAPH_CHECK(one_of(element_type, element::bf16, element::i8), - "BrgemmCopyB doesn't support element type" + element_type.get_type_name()); + "BrgemmCopyB doesn't support element type" + element_type.get_type_name()); - const auto pshape = ngraph::snippets::utils::get_port_planar_shape(input_value(0)); if (pshape.is_dynamic()) { set_output_type(0, element_type, ov::PartialShape{ov::Dimension::dynamic()}); if (is_with_compensations()) { @@ -66,7 +79,8 @@ std::shared_ptr intel_cpu::BrgemmCopyB::clone_with_new_inputs(const Output return std::make_shared(new_args.at(0), m_src_type, m_type, get_offset_in(), get_offset_out(), - is_with_compensations() ? get_offset_compensations() : 0); + is_with_compensations() ? 
get_offset_compensations() : 0, + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input(0))->get_layout()); } size_t intel_cpu::BrgemmCopyB::get_offset_compensations() const { diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp index d8db828b4a3e56..dd34e23bdb89e3 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp @@ -26,7 +26,8 @@ class BrgemmCopyB : public ngraph::snippets::op::MemoryAccess { }; BrgemmCopyB(const Output& x, const element::Type src_type, const Type type = Type::OnlyRepacking, - const size_t offset_in = 0lu, const size_t offset_out0 = 0lu, const size_t offset_out1 = 0lu); + const size_t offset_in = 0lu, const size_t offset_out0 = 0lu, const size_t offset_out1 = 0lu, + std::vector layout_input = {}); BrgemmCopyB() = default; size_t get_offset_in() const { return get_input_offset(0); } @@ -43,6 +44,9 @@ class BrgemmCopyB : public ngraph::snippets::op::MemoryAccess { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; private: + void custom_constructor_validate_and_infer_types(std::vector layout_input = {}); + void validate(const ov::PartialShape& pshape, const ov::element::Type& element_type); + Type m_type = Type::OnlyRepacking; element::Type m_src_type = ov::element::undefined; // src element type of the corresponding BRGEMM }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index 011501a53947c2..12fc4b0d2bc821 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -5,7 +5,7 @@ #include "brgemm_cpu.hpp" #include "snippets/itt.hpp" #include 
"snippets/utils.hpp" -#include "snippets/tensor_descriptor.hpp" +#include "snippets/lowered/port_descriptor.hpp" #include "utils/general_utils.h" @@ -13,7 +13,8 @@ namespace ov { namespace intel_cpu { BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Type type, - const size_t offset_a, const size_t offset_b, const size_t offset_c) + const size_t offset_a, const size_t offset_b, const size_t offset_c, + std::vector layout_a, std::vector layout_b, std::vector layout_c) : Brgemm(), m_type(type) { // We call default ctor of Brgemm class to avoid incorrect shape infer in constructor_validate_and_type_infer() call set_arguments({A, B}); @@ -22,11 +23,12 @@ BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Type ty set_input_port_descriptor({0, offset_a}, 0); set_input_port_descriptor({0, offset_b}, 1); set_output_port_descriptor({0, offset_c}, 0); - constructor_validate_and_infer_types(); + custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); } BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, - const size_t offset_a, const size_t offset_b, const size_t offset_scratch, const size_t offset_c) + const size_t offset_a, const size_t offset_b, const size_t offset_scratch, const size_t offset_c, + std::vector layout_a, std::vector layout_b, std::vector layout_c) : Brgemm(), m_type(type) { set_arguments({A, B, scratch}); set_output_size(1); @@ -35,25 +37,41 @@ BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output< set_input_port_descriptor({0, offset_b}, 1); set_output_port_descriptor({0, offset_c}, 0); set_input_port_descriptor({0, offset_scratch}, 2); - constructor_validate_and_infer_types(); + custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); +} + +void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector layout_a, std::vector layout_b, std::vector layout_c) { + 
INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types); + validate_inputs(); + + // During ctor call, BrgemmCPU doesn't know his port descriptors. + // So we use port descs from source inputs + const auto brgemm_copy = is_with_data_repacking() ? get_brgemm_copy() : nullptr; + const auto planar_input_shapes = + std::vector{ ngraph::snippets::utils::get_reordered_planar_shape(get_input_partial_shape(0), layout_a), + brgemm_copy ? ngraph::snippets::utils::get_port_planar_shape(brgemm_copy->input(0)) + : ngraph::snippets::utils::get_reordered_planar_shape(get_input_partial_shape(1), layout_b) }; + auto output_shape = get_output_partial_shape(planar_input_shapes); + set_output_type(0, get_output_type(), ngraph::snippets::utils::get_reordered_planar_shape(output_shape, layout_c)); + + //Additional check for 3rd input + validate_with_scratchpad(planar_input_shapes[1].get_shape()); } void BrgemmCPU::validate_and_infer_types() { INTERNAL_OP_SCOPE(BrgemmCPU_validate_and_infer_types); - // If no leading dimensions are provided, assume dense row-major inputs-outputs - NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(), - "BrgemmCPU currently supports only static shapes."); - - OPENVINO_ASSERT(implication(one_of(m_type, Type::Floating, Type::WithDataRepacking), get_input_size() == 2), - "BrgemmCPU expects 2 inputs in cases, when input precisions are f32|f32, u8|i8 or bf16|bf16 (non-AMX system)"); - OPENVINO_ASSERT(implication(one_of(m_type, Type::WithCompensations, Type::AMX), get_input_size() == 3), - "BrgemmCPU expects 3 inputs with input precisions i8|i8 and bf16|bf16 on AMX system"); + validate_inputs(); const auto brgemm_copy = is_with_data_repacking() ? get_brgemm_copy() : nullptr; - const auto planar_input_shapes = get_planar_input_shapes({input_value(0), brgemm_copy ? brgemm_copy->input_value(0) : input_value(1)}); + const auto planar_input_shapes = get_planar_input_shapes({input(0), brgemm_copy ? 
brgemm_copy->input(0) : input(1)}); auto output_shape = get_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), get_planar_output_shape(output_shape)); + //Additional check for 3rd input + validate_with_scratchpad(planar_input_shapes[1].get_shape()); +} + +void BrgemmCPU::validate_with_scratchpad(const ov::Shape& shape_b) const { //Additional check for 3rd input if (one_of(m_type, Type::WithCompensations, Type::AMX)) { const auto shape = get_input_partial_shape(2); @@ -61,7 +79,6 @@ void BrgemmCPU::validate_and_infer_types() { const auto type = get_input_element_type(2); if (is_with_compensations()) { const auto element_type_b = get_input_element_type(0); - const auto shape_b = planar_input_shapes[1].get_shape(); const auto N = *shape_b.rbegin(); const auto N_blk = element_type_b == element::f32 ? N : element_type_b == element::bf16 ? 32 : 64; @@ -76,16 +93,32 @@ void BrgemmCPU::validate_and_infer_types() { } } +void BrgemmCPU::validate_inputs() const { + // If no leading dimensions are provided, assume dense row-major inputs-outputs + NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(), + "BrgemmCPU currently supports only static shapes."); + OPENVINO_ASSERT(implication(one_of(m_type, Type::Floating, Type::WithDataRepacking), get_input_size() == 2), + "BrgemmCPU expects 2 inputs in cases, when input precisions are f32|f32, u8|i8 or bf16|bf16 (non-AMX system)"); + OPENVINO_ASSERT(implication(one_of(m_type, Type::WithCompensations, Type::AMX), get_input_size() == 3), + "BrgemmCPU expects 3 inputs with input precisions i8|i8 and bf16|bf16 on AMX system"); +} + std::shared_ptr BrgemmCPU::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BrgemmCPU_clone_with_new_inputs); check_new_args_count(this, new_args); std::shared_ptr new_node = nullptr; if (!is_with_scratchpad()) { new_node = std::make_shared(new_args.at(0), new_args.at(1), m_type, - get_offset_a(), 
get_offset_b(), get_offset_c()); + get_offset_a(), get_offset_b(), get_offset_c(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input(0))->get_layout(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input(1))->get_layout(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(output(0))->get_layout()); } else { new_node = std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_type, - get_offset_a(), get_offset_b(), get_offset_scratch(), get_offset_c()); + get_offset_a(), get_offset_b(), get_offset_scratch(), get_offset_c(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input(0))->get_layout(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(input(1))->get_layout(), + ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(output(0))->get_layout()); } return new_node; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index 2081ca25c7528f..2f744fe50e55c7 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -7,6 +7,8 @@ #include "snippets/op/brgemm.hpp" #include "brgemm_copy_b.hpp" +#include "snippets/lowered/port_descriptor.hpp" + namespace ov { namespace intel_cpu { @@ -28,9 +30,11 @@ class BrgemmCPU : public ngraph::snippets::op::Brgemm { }; BrgemmCPU(const Output& A, const Output& B, const Type type, - const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0); + const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0, + std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); BrgemmCPU(const Output& A, const Output& B, const Output& scratch, const Type type, - const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t 
offset_c = 0); + const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t offset_c = 0, + std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); BrgemmCPU() = default; void validate_and_infer_types() override; @@ -48,7 +52,11 @@ class BrgemmCPU : public ngraph::snippets::op::Brgemm { constexpr static size_t SCRATCH_BYTE_SIZE = 32 * 1024; private: - Type m_type = Type::Floating; + void custom_constructor_validate_and_infer_types(std::vector layout_a, std::vector layout_b, std::vector layout_c); + void validate_with_scratchpad(const ov::Shape& shape_b) const; + void validate_inputs() const; + + Type m_type = Type::Floating; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index 70f46d3f08f2f5..15b327288d0e6e 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -5,8 +5,9 @@ #include "snippets/itt.hpp" #include "brgemm_to_brgemm_cpu.hpp" -#include "snippets/snippets_isa.hpp" + #include "snippets/utils.hpp" +#include "snippets/op/brgemm.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -22,6 +23,23 @@ namespace ov { namespace intel_cpu { +using namespace ngraph::snippets::lowered; + +namespace { +inline std::vector make_subtensor(const ov::Shape& tensor) { + return std::vector(std::min(tensor.size(), 2lu), PortDescriptor::ServiceDimensions::FULL_DIM); +} +template +void set_full_port_desc(const T& port) { + const auto& shape = port.get_shape(); + PortManager::set_port_descriptor_ptr(port, std::make_shared(shape, make_subtensor(shape))); +} +template +void set_port_desc(const T& port, Args... 
params) { + PortManager::set_port_descriptor_ptr(port, std::make_shared(params...)); +} +} // namespace + pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { MATCHER_SCOPE(BrgemmToBrgemmCPU); @@ -39,6 +57,10 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { return false; } + const auto& brgemm_in0_desc = PortManager::get_port_descriptor_ptr(brgemm->input(0)); + const auto& brgemm_in1_desc = PortManager::get_port_descriptor_ptr(brgemm->input(1)); + const auto& brgemm_out_desc = PortManager::get_port_descriptor_ptr(brgemm->output(0)); + const auto dimsMatMulIn0 = ngraph::snippets::utils::get_port_planar_shape(brgemm->input_value(0)).get_shape(); const auto dimsMatMulIn1 = ngraph::snippets::utils::get_port_planar_shape(brgemm->input_value(1)).get_shape(); @@ -56,39 +78,63 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { const auto offset_c = brgemm->get_offset_c(); std::shared_ptr brgemm_cpu = nullptr; + std::shared_ptr brgemm_repacking = nullptr; if (element_type_a == ov::element::f32) { brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm->input_value(1), BrgemmCPU::Type::Floating, - offset_a, offset_b, offset_c); + offset_a, offset_b, offset_c, + brgemm_in0_desc->get_layout(), brgemm_in1_desc->get_layout(), brgemm_out_desc->get_layout()); } else { const auto copy_b_type = with_comp ? 
BrgemmCopyB::WithCompensations : BrgemmCopyB::OnlyRepacking; - const auto brgemmRepackIn1 = std::make_shared(brgemm->input_value(1), element_type_a, copy_b_type, offset_b); - const auto buffer = std::make_shared(brgemmRepackIn1->output(0)); - ngraph::snippets::utils::set_outside_loop_value(brgemmRepackIn1, true); - ngraph::snippets::utils::set_outside_loop_value(buffer, true); + brgemm_repacking = std::make_shared(brgemm->input_value(1), element_type_a, copy_b_type, offset_b, 0, 0, + brgemm_in1_desc->get_layout()); + const auto buffer = std::make_shared(brgemm_repacking->output(0)); + set_port_desc(brgemm_repacking->input(0), brgemm_in1_desc->get_shape(), brgemm_in1_desc->get_subtensor(), brgemm_in1_desc->get_layout()); + set_full_port_desc(brgemm_repacking->output(0)); + set_full_port_desc(buffer->input(0)); + set_full_port_desc(buffer->output(0)); if (with_amx) { const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); brgemm_cpu = std::make_shared(brgemm->input_value(0), buffer, scratch, BrgemmCPU::Type::AMX, - offset_a, offset_b, offset_c); - ngraph::snippets::utils::set_outside_loop_value(scratch, true); + offset_a, offset_b, 0, offset_c, + brgemm_in0_desc->get_layout(), std::vector{}, brgemm_out_desc->get_layout()); + set_full_port_desc(scratch->output(0)); + set_full_port_desc(brgemm_cpu->input(2)); } else if (with_comp) { - const auto scratch = std::make_shared(brgemmRepackIn1->output(1)); + const auto scratch = std::make_shared(brgemm_repacking->output(1)); brgemm_cpu = std::make_shared(brgemm->input_value(0), buffer, scratch, BrgemmCPU::Type::WithCompensations, - offset_a, offset_b, offset_c); - ngraph::snippets::utils::set_outside_loop_value(scratch, true); + offset_a, offset_b, 0, offset_c, + brgemm_in0_desc->get_layout(), std::vector{}, brgemm_out_desc->get_layout()); + set_full_port_desc(brgemm_repacking->output(1)); + set_full_port_desc(scratch->input(0)); + set_full_port_desc(scratch->output(0)); + 
set_full_port_desc(brgemm_cpu->input(2)); } else if (one_of(element_type_a, ov::element::u8, ov::element::bf16)) { brgemm_cpu = std::make_shared(brgemm->input_value(0), buffer, BrgemmCPU::Type::WithDataRepacking, - offset_a, offset_b, offset_c); + offset_a, offset_b, offset_c, + brgemm_in0_desc->get_layout(), std::vector{}, brgemm_out_desc->get_layout()); } else { IE_THROW() << "Invalid configuration for BRGEMM CPU"; } } brgemm_cpu->set_friendly_name(brgemm->get_friendly_name()); - ngraph::copy_runtime_info(brgemm, brgemm_cpu); // Copy output layout inside as well ngraph::replace_node(brgemm, brgemm_cpu); - // TODO: At the moment Brgemm is executed outside Loop. When Blocking is supported, remove it - ngraph::snippets::utils::set_outside_loop_value(brgemm_cpu, true); + + // Transfer ports + set_port_desc(brgemm_cpu->input(0), brgemm_in0_desc->get_shape(), brgemm_in0_desc->get_subtensor(), brgemm_in0_desc->get_layout()); + if (brgemm_repacking) { + set_full_port_desc(brgemm_cpu->input(1)); + } else { + set_port_desc(brgemm_cpu->input(1), brgemm_in1_desc->get_shape(), brgemm_in1_desc->get_subtensor(), brgemm_in1_desc->get_layout()); + } + set_port_desc(brgemm_cpu->output(0), brgemm_out_desc->get_shape(), brgemm_out_desc->get_subtensor(), brgemm_out_desc->get_layout()); + + // need to run validate_and_infer_types manually: either input shapes were updated or + // output Layout was updated (out shape will be updated in validate_and_infer_types()) + if (brgemm_repacking) + brgemm_repacking->validate_and_infer_types(); + brgemm_cpu->validate_and_infer_types(); return true; }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 066d3758e74f22..0a95316a5c59df 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -15,20 +15,19 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert(ngraph::snippe ngraph::snippets::lowered::LinearIR::constExprIt& convert_it) { const auto& convert_expr = *convert_it; const auto& convert = ov::as_type_ptr(convert_expr->get_node()); - const auto input_td = convert_expr->get_inputs().front(); - const auto output_td = convert_expr->get_outputs().front(); + const auto& input_td = convert_expr->get_input_tensor(0); if (convert->get_destination_type() != ov::element::f32 && convert->get_destination_type() != ov::element::i32) return false; - const auto& load_output = linear_ir.get_expr_by_output(input_td); - const auto& load_expr = load_output.expr; + const auto& load_output = input_td->get_source(); + const auto& load_expr = load_output.get_expr(); const auto load = ov::as_type_ptr(load_expr->get_node()); if (!load || ov::is_type(load_expr->get_node()) || ov::is_type(load_expr->get_node())) return false; - const auto consumers = linear_ir.get_exprs_by_input(input_td); + const auto consumers = input_td->get_consumers(); if (consumers.size() != 1) return false; @@ -45,13 +44,16 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert(ngraph::snippe OPENVINO_THROW("Type of Convert op is undefined. 
Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); } - const auto in_td = std::vector{ load_expr->get_inputs().front() }; - const auto out_td = std::vector{ output_td }; - const auto mv_expr_it = convert_it; - const auto& insertion_pos = std::next(convert_it); - linear_ir.erase(std::find(linear_ir.cbegin(), mv_expr_it, load_expr)); - linear_ir.erase(mv_expr_it); - convert_it = linear_ir.insert(insertion_pos, std::make_shared(load_convert, in_td, out_td)); + const auto out_port = convert_expr->get_output_port(0); + const auto convert_consumers = out_port.get_connected_ports(); + ngraph::snippets::lowered::PortManager::set_port_descriptor_ptr(load_convert->output(0), out_port.get_descriptor_ptr()->clone()); + const auto load_convert_expr = linear_ir.create_expression(load_convert, { load_expr->get_input_tensor(0) }); + const auto convert_expr_it = convert_it; + const auto insertion_pos = std::next(convert_it); + convert_it = linear_ir.insert(insertion_pos, load_convert_expr); + linear_ir.erase(std::find(linear_ir.cbegin(), convert_expr_it, load_expr)); + linear_ir.erase(convert_expr_it); + linear_ir.replace_input(convert_consumers, load_convert_expr->get_output_tensor(0)); return true; } @@ -59,17 +61,17 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_store_convert(ngraph::snipp ngraph::snippets::lowered::LinearIR::constExprIt& convert_it) { const auto& convert_expr = *convert_it; const auto& convert = convert_expr->get_node(); - const auto input_td = convert_expr->get_inputs().front(); - const auto output_td = convert_expr->get_outputs().front(); + const auto& input_td = convert_expr->get_input_tensor(0); + const auto& output_td = convert_expr->get_output_tensor(0); if (convert->get_input_element_type(0) != ov::element::f32 && convert->get_input_element_type(0) != ov::element::i32) return false; - const auto consumers = linear_ir.get_exprs_by_input(output_td); + const auto consumers = output_td->get_consumers(); if (consumers.size() 
!= 1) return false; const auto store_input = *(consumers.begin()); - const auto store_expr = store_input.expr; + const auto& store_expr = store_input.get_expr(); const auto store = ov::as_type_ptr(store_expr->get_node()); if (!store) return false; @@ -87,13 +89,16 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_store_convert(ngraph::snipp OPENVINO_THROW("Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); } - const auto in_td = std::vector{ input_td }; - const auto out_td = std::vector{ store_expr->get_outputs().front() }; - const auto store_it = std::find(convert_it, linear_ir.cend(), store_expr); - const auto& insertion_pos = std::next(store_it); - linear_ir.erase(store_it); - convert_it = linear_ir.erase(convert_it); - linear_ir.insert(insertion_pos, std::make_shared(store_convert, in_td, out_td)); + const auto out_port = store_expr->get_output_port(0); + const auto store_consumers = out_port.get_connected_ports(); + ngraph::snippets::lowered::PortManager::set_port_descriptor_ptr(store_convert->output(0), out_port.get_descriptor_ptr()->clone()); + const auto store_convert_expr = linear_ir.create_expression(store_convert, { input_td }); + const auto convert_expr_it = convert_it; + const auto insertion_pos = std::next(convert_it); + convert_it = linear_ir.insert(insertion_pos, store_convert_expr); + linear_ir.erase(std::find(convert_expr_it, linear_ir.cend(), store_expr)); + linear_ir.erase(convert_expr_it); + linear_ir.replace_input(store_consumers, store_convert_expr->get_output_tensor(0)); return true; } diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 44be5e51dc0c8a..09a5cbce0a3424 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -78,24 +78,37 @@ 
std::shared_ptr Transpose0213MatMulLoweredFunction::initLowered() con std::make_shared(precisions[1], input_shapes[1])}; std::vector layout{0, 2, 1, 3}; // Note: validity of transpose_position values is checked in Transpose0213MatMulSinhFunction constructor - if (transpose_position <= 1) { - const auto& anchor = data[transpose_position]; - const auto& td = ngraph::snippets::get_tensor_descriptor_ptr(anchor); - const auto& tensor = td->get_tensor(); + if (transpose_position < 2) { + const auto& anchor = data[transpose_position]->output(0); + const auto& td = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(anchor); + const auto& tensor = td->get_shape(); const auto& subtensor = td->get_subtensor(); - ngraph::snippets::set_tensor_descriptor_ptr(anchor, - std::make_shared(tensor, subtensor, layout)); } - auto matmul = std::make_shared(data[0], data[1]); + auto matmul = std::make_shared(data[0], data[1], 0, 0, 0, transpose_position == 0 ? layout : std::vector{}, + transpose_position == 1 ? layout : std::vector{}, + transpose_position == 2 ? 
layout : std::vector{}); + auto result = std::make_shared(matmul); if (transpose_position == 2) { const auto& anchor = matmul->output(0); - const auto& td = ngraph::snippets::get_tensor_descriptor_ptr(anchor); - const auto& tensor = td->get_tensor(); + const auto& td = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(anchor); + const auto& tensor = td->get_shape(); const auto& subtensor = td->get_subtensor(); - ngraph::snippets::set_tensor_descriptor_ptr(anchor, - std::make_shared(tensor, subtensor, layout)); - matmul->validate_and_infer_types(); + ngraph::snippets::lowered::PortManager::set_port_descriptor_ptr(anchor, + std::make_shared(tensor, + subtensor, + layout)); } + if (transpose_position < 2) { + const auto& anchor = data[transpose_position]->output(0); + const auto& td = ngraph::snippets::lowered::PortManager::get_port_descriptor_ptr(anchor); + const auto& tensor = td->get_shape(); + const auto& subtensor = td->get_subtensor(); + ngraph::snippets::lowered::PortManager::set_port_descriptor_ptr(matmul->input(transpose_position), + std::make_shared(tensor, + subtensor, + layout)); + } + matmul->validate_and_infer_types(); return std::make_shared(NodeVector{matmul}, data); }