[Snippets] Added Softmax support

[Snippets] Added support for Reshape around Softmax; applied comment part; Added config parameter to disable MHA ops tokenization; Buffer 2D; Loops
a-sidorova · Dec 6, 2022 · 70a7e56 · 70a7e56
1 parent 6cb1bae
commit 70a7e56
Show file tree

Hide file tree

Showing 71 changed files with 3,207 additions and 267 deletions.
diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
@@ -120,10 +120,16 @@ class Generator {
     public:
         // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
         bool m_save_lowered_code = false;
+        // True if one evaluation optimizations are enabled
+        bool m_one_evaluation_optimizations = true;
+        // True if we should check runtime info for nodes to call specific needed transformations
+        bool m_need_fill_tail_register = false;
     };
     /**
      * @brief virtual method any specific implementation should implement
     * @param m model in canonical form for table-based code generation
+     * @param config config with transformation and optimization parameters
+     * @param compile_params parameters for generated code
      * @return pointer to generated code
      */
     code generate(std::shared_ptr<ov::Model>& m, const GeneratorConfig& config, const void* compile_params = nullptr);

diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp
@@ -19,13 +19,27 @@ namespace op {
 class Brgemm : public ngraph::op::v0::MatMul {
 public:
     OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul);
-    Brgemm(const Output<Node>& A, const Output<Node>& B);
+    Brgemm(const Output<Node>& A, const Output<Node>& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu);
     Brgemm() = default;
 
+    bool visit_attributes(AttributeVisitor& visitor) override;
     void validate_and_infer_types() override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
 
     bool has_evaluate() const override { return false; }
+
+    size_t get_offset_a() const { return m_offset_a; }  // offset for the first input (A)
+    size_t get_offset_b() const { return m_offset_b; }  // offset for the second input (B)
+    size_t get_offset_c() const { return m_offset_c; }  // offset for the output
+
+    void set_offset_a(const size_t offset) { m_offset_a = offset; }
+    void set_offset_b(const size_t offset) { m_offset_b = offset; }
+    void set_offset_c(const size_t offset) { m_offset_c = offset; }
+
+private:
+    // NOTE(review): the unit of the offsets (bytes vs. elements) is not visible in this header - confirm in the implementation
+    size_t m_offset_a = 0lu;  // offset for the first input (A); 0 by default
+    size_t m_offset_b = 0lu;  // offset for the second input (B); 0 by default
+    size_t m_offset_c = 0lu;  // offset for the output; 0 by default
 };
 
 } // namespace op

diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/op/op.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface Buffer
+ * @brief The operation represents intermediate data storage.
+ *        - m_offset - offset of this Buffer inside the common memory allocated for all Buffers.
+ *                 Default value is 0.
+ *        - m_allocation_rank - rank of the shape used for memory allocation: shape[shape_rank - m_allocation_rank : shape_rank].
+ *                 It allows allocating only the memory size that is actually needed, which depends on the Tile rank, for example.
+ *                 Default value is -1 (full shape).
+ *        Notes:
+ *               - All Buffers in a graph share the same memory pointer, so if there are several Buffers,
+ *                 each of them should have its own offset in the common memory.
+ *               - A Buffer should be the single consumer of the operation output port it is connected to.
+ * @ingroup snippets
+ */
+class Buffer : public ngraph::op::Op {
+public:
+    OPENVINO_OP("Buffer", "SnippetsOpset");
+    BWDCMP_RTTI_DECLARATION;
+
+    Buffer(const Output<Node>& x, const int32_t allocation_rank = -1);  // allocation_rank = -1 means the full shape
+    Buffer() = default;
+
+    size_t get_offset() const { return m_offset; }
+    void set_offset(const size_t offset);  // defined out of line
+
+    int32_t get_allocation_rank() const { return m_allocation_rank; }
+    void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; }
+
+    size_t get_byte_size() const;  // NOTE(review): presumably the memory size in bytes for the allocation shape - confirm in the implementation
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+
+private:
+    size_t m_offset = 0lu;           // offset inside the common Buffer memory
+    int32_t m_allocation_rank = -1;  // number of trailing shape dimensions to allocate; -1 means the full shape
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/op/op.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface Fill
+ * @brief Generated in the Tail Loop vector representation at the code generation step for cases when
+ *        registers should be refilled with special values.
+ *        For example, for cases with ReduceMax or ReduceSum in Softmax.
+ *        Where:
+ *          - offset - value shift for filling (NOTE(review): exact semantics are not visible in this header - confirm in the emitter)
+ *          - fill_value - filling value, given as a hexadecimal 32-bit pattern
+ * @ingroup snippets
+ */
+class Fill : public ngraph::op::Op {
+public:
+    OPENVINO_OP("Fill", "SnippetsOpset");
+
+    Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value = 0x0);  // fill_value defaults to 0x0
+    Fill() = default;
+
+    size_t get_offset() const { return m_offset; }
+    uint32_t get_fill_value() const { return m_fill_value; }
+
+    void set_offset(const size_t offset) { m_offset = offset; }
+    void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; }
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+
+protected:
+    size_t m_offset = 0lu;        // value shift for filling; 0 by default
+    uint32_t m_fill_value = 0x0;  // filling value; 0x0 by default
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface HorizonMax
+ * @brief The operation calculates the horizontal maximum of a vector register
+ * @ingroup snippets
+ */
+class HorizonMax : public ngraph::op::Op {
+public:
+    OPENVINO_OP("HorizonMax", "SnippetsOpset");
+
+    HorizonMax(const Output<Node>& x);
+    HorizonMax() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override { return true;}  // nothing is passed to the visitor
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface HorizonSum
+ * @brief The operation calculates the horizontal sum of a vector register
+ * @ingroup snippets
+ */
+class HorizonSum : public ngraph::op::Op {
+public:
+    OPENVINO_OP("HorizonSum", "SnippetsOpset");
+
+    HorizonSum(const Output<Node>& x);
+    HorizonSum() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override { return true;}  // nothing is passed to the visitor
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp
@@ -13,16 +13,16 @@ namespace op {
 
 /**
  * @interface Load
- * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading
- *        where number of elements to load is determined by "count"
- *        Default value is "1" - to load one element
+ * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading
+ *        where number of elements to load is determined by "count" (Default value is "1" - to load one element)
+ *        and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element)
  * @ingroup snippets
  */
 class Load : public MemoryAccess {
 public:
     OPENVINO_OP("Load", "SnippetsOpset");
 
-    Load(const Output<Node>& x, const size_t count = 1lu);
+    Load(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
     Load() = default;
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
@@ -38,7 +38,7 @@ class Load : public MemoryAccess {
 class LoadReshape : public Load {
 public:
     OPENVINO_OP("LoadReshape", "SnippetsOpset");
-    LoadReshape(const Output<Node>& x, size_t count = 1lu, std::vector<size_t> order = {});
+    LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
     bool visit_attributes(AttributeVisitor& visitor) override;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
     void validate_and_infer_types() override;

diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp
@@ -24,14 +24,17 @@ class MemoryAccess : public ngraph::op::Op {
     OPENVINO_OP("MemoryAccess", "SnippetsOpset");
 
     size_t get_count() const;
-    void set_count(size_t count);
+    size_t get_offset() const;
+    void set_count(const size_t count);
+    void set_offset(const size_t offset);
     bool visit_attributes(AttributeVisitor& visitor) override;
     void validate_and_infer_types() override;
 
 protected:
-    explicit MemoryAccess(const Output<Node>& x, size_t count = 1lu);
+    explicit MemoryAccess(const Output<Node>& x, size_t count = 1lu, size_t offset = 0lu);
     MemoryAccess() = default;
     size_t m_count = 0lu;
+    size_t m_offset = 0lu;
 };
 
 } // namespace op

diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp
@@ -13,16 +13,16 @@ namespace op {
 
 /**
  * @interface Store
- * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing
- *        where number of elements to store is determined by "count"
- *        Default value is "1" - to store one element
+ * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing
+ *        where number of elements to store is determined by "count" (Default value is "1" - to store one element)
+ *        and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr)
  * @ingroup snippets
  */
 class Store : public MemoryAccess {
 public:
     OPENVINO_OP("Store", "SnippetsOpset");
 
-    Store(const Output<Node>& x, const size_t count = 1lu);
+    Store(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
     Store() = default;
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -89,21 +89,13 @@ class Subgraph : public ngraph::op::Op {
         return m_generator;
     }
 
-    size_t get_non_scalar_constants_count() const {
-        return m_non_scalar_constants_count;
-    }
-
-    bool is_quantized() const {
-        return config.m_is_quantized;
-    }
-
-    bool has_type_relaxed_ops() const {
-        return config.m_has_type_relaxed_ops;
-    }
-
-    bool has_domain_sensitive_ops() const {
-        return config.m_has_domain_sensitive_ops;
-    }
+    // Return common memory size for all buffers in body. Should be called only after tileRank setting
+    size_t get_buffer_scratchpad_size() const;
+    size_t get_virtual_port_count() const { return m_virtual_port_count; }
+    bool is_buffer_needed() const { return m_buffer_needed; }
+    bool is_quantized() const { return config.m_is_quantized; }
+    bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; }
+    bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; }
 
     snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
                                 const void* compile_params = nullptr);
@@ -117,8 +109,9 @@ class Subgraph : public ngraph::op::Op {
     // plugin sets generator for a snippet to some specific generator.
     // it's going to be replaced with Jitters table later
     void set_generator(std::shared_ptr<ngraph::snippets::Generator> generator);
-    void set_non_scalar_constants_count(const size_t count);
     void set_tile_rank(size_t newRank) {tileRank = newRank;}
+    void set_virtual_port_count(const size_t count);
+    void buffer_needed(const bool need);
 
     void print() const;
     void print_statistics(bool verbose);
@@ -133,11 +126,14 @@ class Subgraph : public ngraph::op::Op {
     void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
     void convert_to_snippet_dialect();
     void init_config();
-    // Count of potentional non-scalar Consants that will be created after some tranformations
-    // At the moment it's relevant only for FakeQuantize decomposition
-    // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()),
+    // Count of Subgraph virtual ports:
+    //  - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)
+    // Whether a Buffer op is needed or not (m_buffer_needed):
+    //  - Buffers. All Buffers are considered as one common additional virtual port, so they cannot be summed up like potential non-scalar Constants
+    // NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()),
     //       we should MANUALLY calculate it where it needed.
-    size_t m_non_scalar_constants_count = 0;
+    size_t m_virtual_port_count = 0;
+    bool m_buffer_needed = false;
     Shape exec_domain = {};
     std::shared_ptr<ov::Model> m_body = nullptr;
     std::shared_ptr<ngraph::snippets::Generator> m_generator = nullptr;
@@ -162,11 +158,12 @@ class Subgraph : public ngraph::op::Op {
         // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes
         // because TypeRelaxed::copy_with_new_inputs() isn't a thread-safe method
         bool m_has_type_relaxed_ops = false;
-        // True if we should check runtime info for nodes to call specific needed transformations
-        bool m_need_fill_tail_register = false;
         // True if body has operations that don't support plugin-side domain optimizations
         // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing)
         bool m_has_domain_sensitive_ops = false;
+        // True if we should go through whole body to check for where loops should be explicitly inserted.
+        // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops
+        bool m_explicit_loop_insertion = false;
     } config;
 };
 

diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/op/op.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface VectorBuffer
+ * @brief The operation represents intermediate data storage in a vector register
+ * @ingroup snippets
+ */
+class VectorBuffer : public ngraph::op::Op {
+public:
+    OPENVINO_OP("VectorBuffer", "SnippetsOpset");
+
+    VectorBuffer(const ov::element::Type element_type = ov::element::f32);  // element type defaults to f32
+
+    bool visit_attributes(AttributeVisitor& visitor) override { return true;}  // nothing is passed to the visitor
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+
+private:
+    ov::element::Type m_element_type;  // NOTE(review): presumably initialized from the constructor argument - confirm in the implementation
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph