[Snippets][CPU] Added FP32 MHA tokenization support (#14327)

openvinotoolkit · Jan 18, 2023 · 6525dd4 · 6525dd4
1 parent 6ec71c3
commit 6525dd4
Show file tree

Hide file tree

Showing 176 changed files with 9,989 additions and 1,628 deletions.
diff --git a/src/common/snippets/CMakeLists.txt b/src/common/snippets/CMakeLists.txt
@@ -26,9 +26,10 @@ ie_faster_build(${TARGET_NAME}
 )
 
 target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime
-                                     PRIVATE ngraph_reference openvino::runtime::dev)
+                                     PRIVATE ngraph_reference ov_shape_inference openvino::runtime::dev)
 
-target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>)
+target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>
+                                          PRIVATE $<BUILD_INTERFACE:${SHAPE_INFER_INCLUDE_DIR}>)
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
 

diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
@@ -84,15 +84,15 @@ class Schedule {
      * @param f can this kernel be linearided to 1D range
      * @param p pointer to generated code
      */
-    Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
+    Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
     /**
      * @brief Returns callable instanse of code pointer
      */
     template<typename K> K get_callable() const {
         return reinterpret_cast<K>(const_cast<unsigned char*>(ptr));
     }
 
-    Shape work_size {};
+    ov::PartialShape work_size {};
     bool is_flat {false};
     code ptr {nullptr};
 };
@@ -112,21 +112,43 @@ class Generator {
      * @brief Default destructor
      */
     virtual ~Generator() = default;
+    /**
+     * @interface GeneratorConfig
+     * @brief Allows tweaking the lowering process.
+     */
+    class GeneratorConfig {
+    public:
+        // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
+        bool m_save_lowered_code = false;
+        // True if we can optimize tails for single evaluation during code generation.
+        // See the generate() method for more details and optimization examples.
+        // For example, tails with Buffer ops don't support single-evaluation optimizations:
+        //              in that case we should always reset the memory pointer using finalization offsets
+        //              after data is stored to the Buffer
+        bool m_optimize_single_evaluation = true;
+        // True if we should check the runtime info of nodes to decide which specific transformations need to be called
+        bool m_need_fill_tail_register = false;
+    };
     /**
      * @brief virtual method any specific implementation should implement
      * @param m model in canonical for for table-based code generation
+     * @param config config with transformation and optimization parameters
+     * @param compile_params parameters for generated code
      * @return pointer to generated code
      */
-    code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
+    code generate(std::shared_ptr<ov::Model>& m, const GeneratorConfig& config, const void* compile_params = nullptr);
 
     /**
      * @brief gets target machine
      * @return pointer to constant target machine
      */
-    std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
+    std::shared_ptr<const TargetMachine> get_target_machine() const;
 
 protected:
     std::shared_ptr<TargetMachine> target;
+    // todo: we need to save the lowered code to access compiled brgemm kernels at execution time (normally it is destructed by then).
+    //  This is a temporary solution: remove it when kernel caching is implemented. Don't forget to make generate() a const method again.
+    std::vector<AllocatedEmitter> lowered_saved;
 };
 
 } // namespace snippets

diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+#include "ngraph/op/matmul.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface Brgemm
+ * @brief Brgemm is a batch-reduced matrix multiplication with support for arbitrary strides between matrix rows
+ * @ingroup snippets
+ */
+class Brgemm : public ngraph::op::v0::MatMul {
+public:
+    OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul);
+    Brgemm(const Output<Node>& A, const Output<Node>& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu);
+    Brgemm() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    void validate_and_infer_types() override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    bool has_evaluate() const override { return false; }
+
+    size_t get_offset_a() const { return m_offset_a; }
+    size_t get_offset_b() const { return m_offset_b; }
+    size_t get_offset_c() const { return m_offset_c; }
+
+    void set_offset_a(const size_t offset) { m_offset_a = offset; }
+    void set_offset_b(const size_t offset) { m_offset_b = offset; }
+    void set_offset_c(const size_t offset) { m_offset_c = offset; }
+
+private:
+    size_t m_offset_a = 0lu;  // offset for first input
+    size_t m_offset_b = 0lu;  // offset for second input
+    size_t m_offset_c = 0lu;  // offset for output
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp b/src/common/snippets/include/snippets/op/broadcastload.hpp
@@ -21,12 +21,18 @@ class BroadcastLoad : public BroadcastMove {
 public:
     OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove);
 
-    BroadcastLoad(const Output<Node>& x, Shape output_shape);
+    BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
     BroadcastLoad() = default;
 
-    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    size_t get_offset() const { return m_offset; }
+    void set_offset(const size_t offset) { m_offset = offset; }
 
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
     void validate_and_infer_types() override;
+
+private:
+    size_t m_offset = 0lu;
 };
 
 } // namespace op

diff --git a/src/common/snippets/include/snippets/op/broadcastmove.hpp b/src/common/snippets/include/snippets/op/broadcastmove.hpp
@@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op {
 public:
     OPENVINO_OP("BroadcastMove", "SnippetsOpset");
 
-    BroadcastMove(const Output<Node>& x, Shape output_shape);
+    BroadcastMove(const Output<Node>& x, ov::PartialShape output_shape);
     BroadcastMove() = default;
 
     bool visit_attributes(AttributeVisitor& visitor) override;
@@ -28,12 +28,9 @@ class BroadcastMove : public ngraph::op::Op {
 
     void validate_and_infer_types() override;
 
-    OPENVINO_SUPPRESS_DEPRECATED_START
-    bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
-    OPENVINO_SUPPRESS_DEPRECATED_END
 
 protected:
-    Shape output_shape;
+    ov::PartialShape output_shape;
 };
 
 } // namespace op

diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/op/op.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface Buffer
+ * @brief The operation is for intermediate data storage
+ *        - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
+ *                 It is used to allocate the required memory size, which depends, for example, on the Tile rank.
+ *                 Default value is -1 (full shape)
+ *        Notes:
+ *               - All buffers in a graph have the same memory pointer. So if we have several buffers,
+ *                 each MemoryAccess op corresponding to a Buffer should have an offset relative to this common memory pointer
+ *               - Buffer should be the single consumer of an operation output port
+ * @ingroup snippets
+ */
+class Buffer : public ngraph::op::Op {
+public:
+    OPENVINO_OP("Buffer", "SnippetsOpset");
+
+    Buffer(const Output<Node>& x, const int32_t allocation_rank = -1);
+    Buffer() = default;
+
+    int32_t get_allocation_rank() const { return m_allocation_rank; }
+    void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; }
+
+    size_t get_byte_size() const;
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+
+private:
+    int32_t m_allocation_rank = -1;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/op/op.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface Fill
+ * @brief Generated in the Tail Loop vector representation at the code generation step for cases when
+ *        registers should be refilled with special values.
+ *        For example, for cases with ReduceMax or ReduceSum in Softmax
+ *        Where:
+ *          - offset - starting element index from which filling is performed; the input data before it is left untouched
+ *          - fill_value - hexadecimal filling value
+ * @ingroup snippets
+ */
+class Fill : public ngraph::op::Op {
+public:
+    OPENVINO_OP("Fill", "SnippetsOpset");
+
+    Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value = 0x0);
+    Fill() = default;
+
+    size_t get_offset() const { return m_offset; }
+    uint32_t get_fill_value() const { return m_fill_value; }
+
+    void set_offset(const size_t offset) { m_offset = offset; }
+    void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; }
+
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+
+protected:
+    size_t m_offset = 0lu;
+    uint32_t m_fill_value = 0x0;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface HorizonMax
+ * @brief The operation calculates a horizontal maximum of a vector register
+ * @ingroup snippets
+ */
+class HorizonMax : public ngraph::op::Op {
+public:
+    OPENVINO_OP("HorizonMax", "SnippetsOpset");
+
+    HorizonMax(const Output<Node>& x);
+    HorizonMax() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override { return true;}
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+namespace ngraph {
+namespace snippets {
+namespace op {
+
+/**
+ * @interface HorizonSum
+ * @brief The operation calculates a horizontal sum of a vector register
+ * @ingroup snippets
+ */
+class HorizonSum : public ngraph::op::Op {
+public:
+    OPENVINO_OP("HorizonSum", "SnippetsOpset");
+
+    HorizonSum(const Output<Node>& x);
+    HorizonSum() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override { return true;}
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+    void validate_and_infer_types() override;
+};
+
+} // namespace op
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/include/snippets/op/kernel.hpp b/src/common/snippets/include/snippets/op/kernel.hpp
@@ -20,13 +20,14 @@ class Kernel : public ngraph::op::Op {
 public:
     OPENVINO_OP("Kernel", "SnippetsOpset");
 
-    Kernel(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
+    Kernel(std::vector<AllocatedEmitter> region, std::shared_ptr<const ov::Model> m);
     Kernel() = default;
 
-    std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
+    std::vector<AllocatedEmitter> region;
+    const std::shared_ptr<const ov::Model> model;
 
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
-        return std::make_shared<Kernel>(region);
+        return std::make_shared<Kernel>(region, model);
     }
     const void *compile_params = nullptr;
 };