[Snippets] Add support of MHA Tokenization for different precisions #15647

Merged
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/op/brgemm.hpp
@@ -28,6 +28,8 @@ class Brgemm : public MemoryAccess {
size_t get_offset_b() const { return get_input_offset(1); }
size_t get_offset_c() const { return get_output_offset(0); }

static ov::element::Type get_output_type(const ov::element::Type& in_type0, const ov::element::Type& in_type1);

void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

6 changes: 4 additions & 2 deletions src/common/snippets/include/snippets/op/buffer.hpp
@@ -29,7 +29,7 @@ class Buffer : public ov::op::Op {
public:
OPENVINO_OP("Buffer", "SnippetsOpset");
Buffer() = default;
Buffer(const ov::Shape& shape, size_t id = 0);
Buffer(const ov::Shape& shape, ov::element::Type element_type = ov::element::u8, size_t id = 0);
Buffer(const ov::Output<ov::Node>& arg, const ov::Shape& shape, size_t id = 0);
Buffer(const ov::Output<ov::Node>& arg, int32_t allocation_rank = -1, size_t id = 0);

@@ -48,9 +48,10 @@ class Buffer : public ov::op::Op {
int64_t get_offset() const { return m_offset; }
void set_id(size_t id) { m_id = id; }
void set_offset(int64_t offset) { m_offset = offset; }

size_t get_byte_size() const;

void set_element_type(ov::element::Type element_type);

bool is_intermediate_memory() const { return m_type == Type::IntermediateMemory; }
bool is_new_memory() const { return m_type == Type::NewMemory; }

@@ -59,6 +60,7 @@
ov::Shape m_shape = {};
int64_t m_offset = 0;
size_t m_id = 0; // Default ID = 0: all Buffers belong to the same set
ov::element::Type m_element_type = ov::element::u8; // u8 by default (1 byte)
};

} // namespace op
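The new `m_element_type` field makes buffer sizing precision-aware. A minimal sketch of the computation behind `get_byte_size()` (the shape and type here are illustrative, not from the PR):

```cpp
#include "openvino/core/shape.hpp"
#include "openvino/core/type/element_type.hpp"

// Byte size = shape volume * element width, so replacing the default
// u8 (1 byte) with bf16 (2 bytes) doubles the scratchpad requirement.
ov::Shape shape{2, 32, 64};
ov::element::Type et = ov::element::bf16;
size_t byte_size = ov::shape_size(shape) * et.size();  // 2 * 32 * 64 * 2 = 8192
```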
1 change: 0 additions & 1 deletion src/common/snippets/include/snippets/op/subgraph.hpp
@@ -136,7 +136,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
// should have explicit Constants even if they're non-scalar (Reshape, Transpose, Broadcast)
// This check returns True if Constant op which is input of this op should be inside Subgraph body
static auto constant_input_should_be_inside_body(const std::shared_ptr<ov::Node>& node) -> bool;

static bool check_broadcast(const std::shared_ptr<const ov::Node>& node) noexcept;
// Return estimated unique buffer count (upper bound). It's needed for tokenization
static auto get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t;
26 changes: 24 additions & 2 deletions src/common/snippets/include/snippets/pass/mha_tokenization.hpp
@@ -6,6 +6,7 @@

#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "snippets/pass/tokenization.hpp"

namespace ov {
namespace snippets {
@@ -14,13 +15,34 @@ namespace pass {
/**
* @interface TokenizeMHASnippets
* @brief The pass tokenizes MHA-pattern into Subgraph
* TODO: Write pattern
* Pattern: Transpose1
* |
* Transpose0 [Eltwise, Select]
* \ /
* MatMul0
* |
* [Eltwise, Select, Reshape]
* |
* Softmax
* |
* [Eltwise, Select, Reshape] Transpose2
* \ /
* MatMul1
* |
* [Eltwise, Select, Transpose3]
* Notes:
 * - Transposes may be absent
 * - Transpose0, Transpose2 and Transpose3 support only the [0,2,1,3] order
 * - Transpose1 supports only the [0,2,3,1] order
 * - [...] means any number of nodes from the list, with the following restrictions:
 *     * Reshapes may appear only immediately around Softmax (Reshape -> Softmax -> Reshape)
 *     * After MatMul1, only Transpose3 or any number of Eltwise/Select ops may appear
* @ingroup snippets
*/
class TokenizeMHASnippets: public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("TokenizeMHASnippets", "0");
TokenizeMHASnippets();
TokenizeMHASnippets(const SnippetsTokenization::Config& config = {});
};

} // namespace pass
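To make the documented pattern concrete, here is a hedged sketch of a minimal graph that would match it (the Transposes and optional Eltwise/Select/Reshape nodes are omitted, which the pass permits; the shapes and helper name are illustrative, not from the PR):

```cpp
#include "openvino/openvino.hpp"

// Minimal MHA-like body: MatMul0 -> Softmax -> MatMul1.
std::shared_ptr<ov::Model> make_minimal_mha() {
    using namespace ov;
    auto q = std::make_shared<op::v0::Parameter>(element::f32, Shape{1, 16, 64, 64});
    auto k = std::make_shared<op::v0::Parameter>(element::f32, Shape{1, 16, 64, 64});
    auto v = std::make_shared<op::v0::Parameter>(element::f32, Shape{1, 16, 64, 64});
    auto mm0 = std::make_shared<op::v0::MatMul>(q, k, false, true);  // MatMul0
    auto sm  = std::make_shared<op::v8::Softmax>(mm0, -1);           // Softmax
    auto mm1 = std::make_shared<op::v0::MatMul>(sm, v);              // MatMul1
    return std::make_shared<Model>(OutputVector{mm1}, ParameterVector{q, k, v});
}
```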
29 changes: 27 additions & 2 deletions src/common/snippets/include/snippets/pass/tokenization.hpp
@@ -7,8 +7,7 @@
#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"

#include "snippets/pass/mha_tokenization.hpp"
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/op/subgraph.hpp"

namespace ov {
namespace snippets {
@@ -19,8 +18,16 @@ namespace pass {
SkippedByPlugin - indicates that snippets cannot include this node in a subgraph. Can be set by the plugin via SetSnippetsNodeType(...).
*/
enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin};
/*
NotSet - default value, returned if the subgraph wasn't marked; snippets may still include nodes in this subgraph
Completed - indicates that snippets cannot include any new nodes in this subgraph.
It is set by separate tokenization passes, e.g. matcher-based tokenization (MHA Tokenization).
*/
enum class SnippetsSubgraphType : int64_t {NotSet, Completed};
void SetSnippetsNodeType(const std::shared_ptr<Node>&, SnippetsNodeType);
void SetSnippetsSubgraphType(const std::shared_ptr<op::Subgraph>&, SnippetsSubgraphType);
SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr<const Node>&);
SnippetsSubgraphType GetSnippetsSubgraphType(const std::shared_ptr<const op::Subgraph>&);
void SetTopologicalOrder(const std::shared_ptr<Node>&, int64_t);
int64_t GetTopologicalOrder(const std::shared_ptr<const Node>&);

@@ -48,8 +55,26 @@ class EnumerateNodes : public ov::pass::ModelPass {
*/
class SnippetsTokenization : public ov::pass::ModelPass {
public:
/**
* @interface Config
 * @brief Allows adjusting the tokenization passes
* @ingroup snippets
*/
struct Config {
Config(bool enable_transpose = true) : mha_token_enable_transpose(enable_transpose) {}

// If false, Transposes are not tokenized by MHA Tokenization.
// Otherwise, they may be fused into the Subgraph when possible.
// TODO [106921]: Remove when ticket 106921 is implemented
bool mha_token_enable_transpose = true;
};

OPENVINO_RTTI("SnippetsTokenization", "0");
SnippetsTokenization(const Config& config) : m_config(config) {}
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

private:
Config m_config{};
};


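A hedged usage sketch of the new Config knob: registering the pipeline with Transpose tokenization disabled (the `model` variable and the pass-manager setup are assumed context, not part of this PR):

```cpp
#include "openvino/pass/manager.hpp"
#include "snippets/pass/tokenization.hpp"

// `model` is an ov::Model assumed to be created elsewhere.
ov::pass::Manager manager;
manager.register_pass<ov::snippets::pass::SnippetsTokenization>(
    ov::snippets::pass::SnippetsTokenization::Config(/*enable_transpose=*/false));
manager.run_passes(model);
```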
4 changes: 2 additions & 2 deletions src/common/snippets/src/lowered/pass/allocate_buffers.cpp
@@ -124,8 +124,8 @@ bool AllocateBuffers::run(LinearIR& linear_ir) {

const auto current_allocated_memory_size = m_buffer_scratchpad_size - offset;
if (buffer_size > current_allocated_memory_size) {
m_buffer_scratchpad_size += (buffer_size - current_allocated_memory_size);
// Note: we don't update offset because we only extend the memory to the required size
allocate(buffer, expr, buffer_size);
continue;
}
propagate_offset(linear_ir, *expr_it, offset);
allocated_buffers.insert(expr);
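For clarity, a worked example of the check above with assumed byte values (the allocate()/propagate_offset() machinery is this PR's; the numbers are not):

```cpp
// Assumed values to illustrate the fallback branch:
size_t scratchpad_size = 128;  // m_buffer_scratchpad_size
size_t offset = 96;            // candidate reuse offset inside the scratchpad
size_t buffer_size = 64;       // byte size of the buffer being placed

size_t current_allocated = scratchpad_size - offset;  // 32 bytes left at the tail
if (buffer_size > current_allocated) {
    // 64 > 32: the tail is too small to reuse, so the pass falls back to
    // allocate(buffer, expr, buffer_size) instead of propagating the offset.
}
```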
6 changes: 3 additions & 3 deletions src/common/snippets/src/lowered/pass/assign_registers.cpp
@@ -100,9 +100,9 @@ bool AssignRegisters::run(LinearIR& linear_ir) {
// Otherwise WIN build fails with "IS_MANUALLY_ALLOCATED_REG cannot be implicitly captured because no default capture mode has been specified"
// the same problem with all the other lambdas in this file
auto enumerate_out_tensors = [=] (const ExpressionPtr& expr,
decltype(regs_vec)& reg_map,
const std::map<tensor, Reg>& manually_assigned_regs,
size_t& counter) {
decltype(regs_vec)& reg_map,
const std::map<tensor, Reg>& manually_assigned_regs,
size_t& counter) {
for (const auto& out_tensor : expr->get_output_port_connectors()) {
// Note that some ops might have identical input&output tensors (Result and Tile* for ex.)
// so we have to check that the tensor has not been enumerated already
27 changes: 17 additions & 10 deletions src/common/snippets/src/op/brgemm.cpp
@@ -62,22 +62,29 @@ std::shared_ptr<Node> Brgemm::clone_with_new_inputs(const OutputVector& new_args
lowered::PortDescriptorUtils::get_port_descriptor_ptr(output(0))->get_layout());
}

ov::element::Type Brgemm::get_output_type() const {
const auto element_type_a = get_input_element_type(0);
const auto element_type_b = get_input_element_type(1);
const bool is_f32 = utils::everyone_is(element::f32, element_type_a, element_type_b);
const bool is_int8 = utils::one_of(element_type_a, element::i8, element::u8) && element_type_b == element::i8;
const bool is_bf16 = utils::everyone_is(element::bf16, element_type_a, element_type_b);
ov::element::Type Brgemm::get_output_type(const ov::element::Type& in_type0, const ov::element::Type& in_type1) {
const bool is_f32 = utils::everyone_is(element::f32, in_type0, in_type1);
const bool is_int8 = utils::one_of(in_type0, element::i8, element::u8) && in_type1 == element::i8;
const bool is_bf16 = utils::everyone_is(element::bf16, in_type0, in_type1);
if (is_f32 || is_bf16) {
return element::f32;
return element::f32;
} else if (is_int8) {
return element::i32;
} else {
return element::undefined;
}
}

ov::element::Type Brgemm::get_output_type() const {
auto output_type = get_output_type(get_input_element_type(0), get_input_element_type(1));
if (output_type == element::undefined) {
OPENVINO_THROW("BrgemmCPU node has incompatible input element types: " +
element_type_a.get_type_name() +
" and " +
element_type_b.get_type_name());
get_input_element_type(0).get_type_name() +
" and " +
get_input_element_type(1).get_type_name());
}

return output_type;
}

std::vector<ov::PartialShape> Brgemm::get_planar_input_shapes(const std::vector<ov::Input<ov::Node>>& inputs) const {
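A short sketch of the precision mapping the new static overload implements; since it no longer needs a node, callers (e.g. tokenization) can query it up front. The expected results follow directly from the code above:

```cpp
using ov::element::f32; using ov::element::bf16;
using ov::element::u8;  using ov::element::i8;
using ov::snippets::op::Brgemm;

auto t0 = Brgemm::get_output_type(f32,  f32);   // f32
auto t1 = Brgemm::get_output_type(bf16, bf16);  // f32 (bf16 accumulates in f32)
auto t2 = Brgemm::get_output_type(u8,   i8);    // i32
auto t3 = Brgemm::get_output_type(i8,   u8);    // element::undefined -> the
                                                // member overload would throw
```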
20 changes: 13 additions & 7 deletions src/common/snippets/src/op/buffer.cpp
@@ -14,8 +14,8 @@ namespace snippets {
namespace op {


Buffer::Buffer(const ov::Shape& shape, size_t id)
: Op(), m_type(Type::NewMemory), m_shape(shape), m_offset(0), m_id(id) {
Buffer::Buffer(const ov::Shape& shape, ov::element::Type element_type, size_t id)
: Op(), m_type(Type::NewMemory), m_shape(shape), m_offset(0), m_id(id), m_element_type(std::move(element_type)) {
constructor_validate_and_infer_types();
}

@@ -40,34 +40,33 @@ bool Buffer::visit_attributes(AttributeVisitor& visitor) {
visitor.on_attribute("allocation_shape", m_shape);
visitor.on_attribute("offset", m_offset);
visitor.on_attribute("id", m_id);
visitor.on_attribute("element_type", m_element_type);
return true;
}

void Buffer::validate_and_infer_types() {
INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types);
ov::element::Type output_type;
ov::Shape output_shape;
if (m_type == Type::NewMemory) {
OPENVINO_ASSERT(get_input_size() == 0, "Buffer with new allocated memory must not have arguments!");
output_shape = m_shape;
output_type = ov::element::u8; // 1Byte
} else if (m_type == Type::IntermediateMemory) {
const auto& input_shape = get_input_partial_shape(0);
OPENVINO_ASSERT(input_shape.is_static(), "Buffer supports only static input shape");
output_type = get_input_element_type(0);
m_element_type = get_input_element_type(0);
output_shape = input_shape.get_shape();
} else {
OPENVINO_THROW("Buffer supports only the following types: NewMemory and IntermediateMemory");
}
set_output_type(0, output_type, output_shape);
set_output_type(0, m_element_type, output_shape);
}

std::shared_ptr<Node> Buffer::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs);
check_new_args_count(this, new_args);
std::shared_ptr<op::Buffer> new_buffer = nullptr;
if (m_type == Type::NewMemory) {
new_buffer = std::make_shared<Buffer>(m_shape, m_id);
new_buffer = std::make_shared<Buffer>(m_shape, m_element_type, m_id);
} else if (m_type == Type::IntermediateMemory) {
new_buffer = std::make_shared<Buffer>(new_args.at(0), m_shape, m_id);
} else {
@@ -82,6 +81,13 @@ size_t Buffer::get_byte_size() const {
return ov::shape_size(shape) * get_element_type().size();
}

void Buffer::set_element_type(ov::element::Type element_type) {
OPENVINO_ASSERT(is_new_memory(), "Only Buffer with NewMemory can change its output precision!");
m_element_type = std::move(element_type);
// Apply the change
validate_and_infer_types();
}

} // namespace op
} // namespace snippets
} // namespace ov
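A hedged usage sketch of the precision-aware Buffer (shape and types are illustrative): the element type is now serialized, cloned, and reflected in the byte size, and only NewMemory buffers may change it after construction:

```cpp
#include "snippets/op/buffer.hpp"

// NewMemory buffer allocated as bf16 scratchpad.
auto buf = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{2, 64}, ov::element::bf16);
size_t bytes = buf->get_byte_size();      // 2 * 64 * 2 = 256 bytes

buf->set_element_type(ov::element::f32);  // OK: NewMemory, re-runs type/shape inference
// For IntermediateMemory buffers the type is taken from the input instead,
// and set_element_type() would assert.
```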
38 changes: 16 additions & 22 deletions src/common/snippets/src/op/subgraph.cpp
@@ -14,7 +14,6 @@
#include "snippets/pass/convert_constants.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include "snippets/pass/transpose_decomposition.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/pass/matmul_to_brgemm.hpp"
#include "snippets/pass/fuse_transpose_brgemm.hpp"
#include "snippets/pass/set_softmax_ports.hpp"
@@ -75,12 +74,11 @@ auto snippets::op::Subgraph::is_domain_sensitive_op(const std::shared_ptr<ov::No
}

void snippets::op::Subgraph::init_config() {
auto update = [](bool& flag, bool status) { flag = flag || status; };
const auto ops = body_ptr()->get_ops();
for (const auto& op : ops) {
config.m_is_quantized = config.m_is_quantized ||
ov::is_type<ov::op::v0::FakeQuantize>(op);
config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops ||
is_domain_sensitive_op(op);
update(config.m_is_quantized, ov::is_type<ov::op::v0::FakeQuantize>(op));
update(config.m_has_domain_sensitive_ops, is_domain_sensitive_op(op));
}
}

@@ -93,6 +91,13 @@ auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& op
// and where the Loops will be - we can only predict.
// Note: The ops that create Buffers: MatMul, Transpose and Softmax (always FP32)
std::vector<size_t> used_precision_size;

auto push_prc_size = [&used_precision_size](size_t precision_size) {
if (used_precision_size.empty() || used_precision_size.back() != precision_size) {
used_precision_size.push_back(precision_size);
}
};

for (const auto& op : ops) {
if (const auto transpose = ov::as_type_ptr<ov::op::v1::Transpose>(op)) {
// At the moment Transposes are supported only on Results and Parameters but
@@ -106,34 +111,23 @@ auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& op
}) ||
!ov::is_type<ov::op::v0::Parameter>(transpose->get_input_node_shared_ptr(0));
if (are_prev_or_next_ops) {
const auto prc_size = transpose->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(transpose->get_element_type().size());
}
} else if (ov::is_type<ov::op::v1::Softmax>(op) || ov::is_type<ov::op::v8::Softmax>(op)) {
// Softmax always uses 2 FP32 Buffers
const auto prc_size = ov::element::f32.size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
// Softmax always uses 2 FP32 Buffers after decomposition.
// They are in-place and identical, so we push the precision size only once
push_prc_size(ov::element::f32.size());
} else if (const auto matmul = ov::as_type_ptr<ov::op::v0::MatMul>(op)) {
// Checking the first input is enough because MatMul requires the same precision size on both inputs
if (!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(0)) ||
!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(1))) {
const auto prc_size = matmul->get_input_element_type(0).size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_input_element_type(0).size());
}

const auto consumers = matmul->get_output_target_inputs(0);
if (std::none_of(consumers.begin(), consumers.end(),
[](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v0::Result>(in.get_node()); })) {
const auto prc_size = matmul->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_element_type().size());
}
}
}
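The push_prc_size helper deduplicates only consecutive equal sizes, which is the core of the estimate: adjacent ops with the same precision size can share a buffer, while a precision change forces a new one. A standalone sketch with assumed sizes:

```cpp
#include <vector>

std::vector<size_t> used_precision_size;
auto push_prc_size = [&used_precision_size](size_t precision_size) {
    if (used_precision_size.empty() || used_precision_size.back() != precision_size) {
        used_precision_size.push_back(precision_size);
    }
};

// e.g. MatMul0 (bf16 = 2 bytes) -> Softmax (f32 = 4) -> MatMul1 (f32 = 4):
for (size_t s : {2, 4, 4})
    push_prc_size(s);
// used_precision_size == {2, 4} -> estimated unique buffer count is 2
```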