[Snippets] Refactored work with Buffers (openvinotoolkit#19644)

[Snippets] BufferManager is not derived from PassPipeline now [Snippets] Added MemorySolver support [Snippets] Made as static class [Snippets] Added one-level inplace support [Snippets] Added optimization bits [Snippets] Small cosmetic fixes [Snippets] Renamed to BufferSolver [Snippets] Refactored [Snippets] Fixed IdentifyBuffers [Snippets] Add inplace multi + identify buffers [Snippets] Made common pass [Snippets] Added PassPipeline::get_pass<>() [Snippets] Added comments, briefs, refactored smth [Snippets] Fixed win build [Snippets] Not allow to have the same Buffer ID for multi level Buffers [Snippets] Moved CleanupRepeatedPtrShifts to common pipeline [Snippets] Made IdentifyBuffers::ShiftPtrParams [Snippets] Fixed window sliding mode [Snippets] Refactored nested clusters [Snippets] Added normalized buffer regs [Snippets] Not allowed to have the same ID for nested Buffers in IdentifyBuffers [Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find [Snippets] Removed useless method from InitLoops [Snippets] Fixed CC build [Snippets] Applied Ivan comments [Snippets] Applied Ivan comment: refactored pass classes [Snippets] Applied Vladislav comments [Snippets] Applied Ivan comments 2 [Runtime] Moved MemorySolver to API2.0 [Snippets] Created common buffer allocation pass AllocateBuffers [Snippets][Tests] Added InplaceEltwise unit test [Snippets] fixed NormalizeBufferIDs [Snippets][CPU] Fixed BrgemmBlocking lowered pass: move wsp for AMX to brgemm [Snippets][CPU][Tests] Covered AMX MHA buffer allocation by unit tests
kblaszczak-intel · Nov 30, 2023 · df03b04 · df03b04
1 parent 6ab5ef7
commit df03b04
Show file tree

Hide file tree

Showing 29 changed files with 1,689 additions and 267 deletions.
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -29,6 +29,9 @@ class Config {
     // Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
     // Set by a backend, should be large enough to compensate for the kernel call overheads
     size_t m_min_kernel_work_amount = 256;
+    // True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
+    // False if all Buffers will have unique IDs and offsets in the Linear IR
+    bool m_are_buffers_optimized = true;
 };
 
 /* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).

diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
@@ -5,7 +5,6 @@
 #pragma once
 
 #include "pass.hpp"
-#include "snippets/snippets_isa.hpp"
 
 namespace ov {
 namespace snippets {
@@ -14,26 +13,40 @@ namespace pass {
 
 /**
  * @interface AllocateBuffers
- * @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
- *        Notes:
- *           - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
- *             The pass don't allocate new memory for InPlace Buffers, we propagate the same offsets for them.
- *           - The pass should be splitted into two passes: ProcessInplace (markup of Buffers which can use the same memory)
- *             and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
+ * @brief The pass allocates common memory for all Buffers.
+ *        There are two modes: default and optimized allocation. Default allocation (non-optimized) mode sets unique offsets and ID to Buffers.
+ *        Optimized mode allocates memory for Buffer ops using the following optimizations:
+ *         - MemorySolver: helps to solve issue of optimal memory allocation;
+ *         - InPlace: Loop or MemoryAccess ops read from the memory and store data to the same memory if possible
+ *         - Reusing Buffer IDs: Buffers have the same IDs (gpr) in cases when Buffers aren't connected or have the same data ptr shifts
+ *        Note: All buffers are related to each other and represent common buffer scratchpad of Subgraph.
+ *              The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
  * @ingroup snippets
  */
-
-class AllocateBuffers : public Pass {
+class AllocateBuffers: public Pass {
 public:
     OPENVINO_RTTI("AllocateBuffers", "Pass")
-    bool run(lowered::LinearIR& linear_ir) override;
-
-    size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }
-
+    AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
+
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
+    bool run(LinearIR& linear_ir) override;
+
+    /**
+     * @brief Sets the offset of the Buffer op and propagates it to the connected memory access ops
+     * @param buffer_expr expression with Buffer op
+     * @param offset offset in common buffer scratchpad
+     */
+    static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);
+
+    using BufferCluster = std::set<ExpressionPtr>;
+    using BufferClusters = std::vector<BufferCluster>;
 private:
-    static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);
-
-    size_t m_buffer_scratchpad_size = 0;
+    size_t& m_buffer_scratchpad_size;
+    bool m_is_optimized_mode = true;
 };
 
 } // namespace pass

diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp
@@ -0,0 +1,138 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+#include "allocate_buffers.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface DefineBufferClusters
+ * @brief The pass defines buffer clusters. The buffers from one cluster share the
+ *        same memory (have the same offset relative to the data pointer of buffer scratchpad).
+ *         - If MemoryAccess op or Loop can read and write to the same memory (inplace behavior), the Buffers should be in the same cluster.
+ *         - If Buffer is in the Loop which reads or writes from/to the other Buffers, this Buffer can emulate `window` sliding.
+ *           It means that the Buffer inside can reuse memory of Buffers outside in bounds of full Loop work.
+ *           Demonstration:
+ *                               |-----------------------------------------------------|
+ *                               | |------------|                       |------------| |                        InnerLoops have work amount 128
+ *             Buffer0 [3x128]-> | | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128]     OuterLoop has work amount 3
+ *                               | |------------|      OuterLoop        |------------| |
+ *                               |-----------------------------------------------------|
+ *           Buffer1 can reuse memory [128] of Buffer0 or Buffer2 in each iteration of OuterLoop
+ *           Note: The pass requires expression enumeration and buffer identification (for nested Buffers inplace).
+ *                 These passes should be executed separately before this pass!
+ * @ingroup snippets
+ */
+class DefineBufferClusters : public Pass {
+public:
+    OPENVINO_RTTI("DefineBufferClusters", "Pass")
+
+    DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}
+
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
+    bool run(lowered::LinearIR& linear_ir) override;
+
+private:
+    using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
+    /**
+     * @brief Finds the Buffer cluster in the set of clusters which contains the target expression with Buffer
+     * @param target target expression with Buffer op
+     * @return vector iterator which refers to the found cluster
+     */
+    AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
+    /**
+     * @brief Returns True if Buffer is a direct source for the target expr (there are no other Loops between the Buffer and target expr)
+     * @param buffer_expr expression with assumed Buffer op
+     * @param target_expr expression with target op - LoopEnd or MemoryAccess op
+     * @return boolean value
+     */
+    bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
+    /**
+     * @brief Creates a new buffer cluster if buffer_expr is missing from clusters. If buffer_expr is already in clusters, does nothing
+     * @param buffer_expr expression with Buffer op
+     */
+    void create_new_cluster(const ExpressionPtr& buffer_expr);
+    /**
+     * @brief Returns common ID of cluster if all Buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX
+     *        that means that Buffers in cluster have different IDs.
+     * @param cluster set of Buffer expressions - cluster
+     * @return common buffer ID or SIZE_MAX - size_t value
+     */
+    size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;
+
+    /**
+     * @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory.
+     * @param expr_it iterator of Linear IR which refers to the expression with LoopEnd
+     */
+    void parse_loop(const LinearIR::constExprIt& expr_it);
+    /**
+     * @brief Analyzes full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
+     * @param expr expression with full MemoryAccess op
+     */
+    void parse_memory_access_op(const ExpressionPtr& expr);
+    /**
+     * @brief Gets input buffers of Loop
+     * @param loop_expr expression with LoopEnd op
+     * @return unordered map [Expression -> set of input ports] which represents input Buffers of Loop
+     */
+    BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
+    /**
+     * @brief Gets output buffers of Loop
+     * @param loop_expr expression with LoopEnd op
+     * @return unordered map [Expression -> set of output ports] which represents output Buffers of Loop
+     */
+    BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
+    /**
+     * @brief Analyzes nested Loops: unites nested buffer clusters if they can reproduce `window` sliding
+     * @param input_buffers unordered map [Expression -> set of input ports] which represents input Buffers of Loop
+     * @param output_buffers unordered map [Expression -> set of output ports (one)] which represents output Buffers of Loop
+     * @param outer_loop_end_expr_it iterator of Linear IR which refers to the expression with outer LoopEnd
+     */
+    void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
+    /**
+     * @brief Finds the last connected Loop to the target Buffer and returns the corresponding finalization offset
+     * @param buffer_expr expression with Buffer op
+     * @return finalization offset - int64_t value
+     */
+    int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
+    /**
+     * @brief Checks if two Buffer expressions are connected to the same Loop. Sets common LoopEnd as `loop` parameter and
+     *        indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours
+     * @param up expression with upper Buffer op
+     * @param down expression with lower Buffer op
+     * @param loop expression with common LoopEnd op
+     * @param up_idx the reference to port index of upper Buffer op to the Loop
+     * @param down_idx the reference to port index of lower Buffer op to the Loop
+     * @return Return True if the Buffers are connected to the same Loop
+     */
+    static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
+    /**
+     * @brief Unites the inner cluster with the outer cluster
+     * @param inner_cluster_it iterator to inner cluster - the buffer cluster which is in the Loop
+     * @param outer_cluster buffer cluster with Buffers outside the Loop
+     * @param outer_buffer target Buffer from outer_cluster
+     * @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers
+     * @return Return True if clusters have been united
+     */
+    bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
+                               const ExpressionPtr& outer_buffer, bool is_outer_up);
+
+    AllocateBuffers::BufferClusters& m_clusters;  // refers to the cluster container passed to the constructor
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/lowered/pass/enumerate_expressions.hpp b/src/common/snippets/include/snippets/lowered/pass/enumerate_expressions.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface EnumerateExpressions
+ * @brief The pass enumerates the expressions of the Linear IR by execution order
+ * @ingroup snippets
+ */
+class EnumerateExpressions : public Pass {
+public:
+    OPENVINO_RTTI("EnumerateExpressions", "Pass")
+    bool run(LinearIR& linear_ir) override;
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
@@ -6,8 +6,6 @@
 
 #include "pass.hpp"
 
-#include "snippets/op/buffer.hpp"
-
 namespace ov {
 namespace snippets {
 namespace lowered {
@@ -22,7 +20,8 @@ namespace pass {
  *          - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges).
  *                   The buffers are connected to the same Loop - are adjacent in graph sense bounds.
  *          - The vertices (buffers) are adjacent if they are connected to the same Loop and
- *            their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
+ *            their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
+ *            or one of the Buffers is inside a Loop but the other Buffer is not;
  *          - Firstly, create adjacency matrix using the definition above;
  *          - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise.
  *        Note: should be called before ResetBuffer() pass to have correct offsets
@@ -33,13 +32,79 @@ class IdentifyBuffers: public Pass {
     OPENVINO_RTTI("IdentifyBuffers", "Pass")
     IdentifyBuffers() = default;
 
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
     bool run(LinearIR& linear_ir) override;
 
+    struct ShiftPtrParams {
+        ShiftPtrParams() = default;
+        ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo) : data_size(ds), ptr_increment(pi), finalization_offset(fo) {}
+        int64_t data_size = 0;
+        int64_t ptr_increment = 0;
+        int64_t finalization_offset = 0;
+
+        friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+        friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+    };
+
+    /**
+     * @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
+     * @param lhs Data pointer shift params for first Buffer
+     * @param rhs Data pointer shift params for second Buffer
+     * @return Returns True if params are valid for reusing. Otherwise returns False
+     */
+    static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+
 private:
-    using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
+    using BufferPool = std::vector<ExpressionPtr>;
 
-    std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
-    std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
+    /**
+     * @brief Get Buffer Index in Buffer set
+     * @param target the target Buffer expression
+     * @param pool set of Buffers from the Linear IR
+     * @return index of target Buffer expression in set
+     */
+    static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
+    /**
+     * @brief Create adjacency matrix for Buffer system. See comment in the method for more details.
+     * @param linear_ir the target Linear IR
+     * @param pool set of Buffers from the Linear IR
+     * @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
+     */
+    static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
+    /**
+     * @brief Algorithm of Graph coloring where vertices are Buffers
+     * @param buffers set of Buffers from the Linear IR
+     * @param adj adjacency matrix
+     * @return map [color id -> Buffer set]
+     */
+    static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
+    /**
+     * @brief Update the adjacency matrix:
+     *         - If Buffers are from the same Loops and connected to the same Loop and
+     *           their ptr shift params for this Loop are not proportional, the Buffers are adjacent - set value True in the matrix;
+     *         - If one of the Buffers is inside a Loop but the other Buffer is connected to this Loop and has non-zero data shift params,
+     *           the Buffers are adjacent - set value True in the matrix;
+     * @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
+     * @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
+     * @param buffers set of Buffers from the Linear IR
+     * @param adj Target adjacency matrix
+     */
+    static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
+                                  const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
+                                  const BufferPool& buffers,
+                                  std::vector<bool>& adj);
+    /**
+     * @brief Check if two Buffers are adjacent and cannot have the same ID
+     * @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
+     * @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
+     * @return Returns True if they are adjacent, otherwise returns False
+     */
+    static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
+                             const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
 };
 
 } // namespace pass

diff --git a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "pass.hpp"
+
+namespace ov {
+namespace snippets {
+namespace lowered {
+namespace pass {
+
+/**
+ * @interface InitBuffersDefault
+ * @brief The pass initializes Buffer expressions in LinearIR in the default (non-optimized) way: sets unique offsets and IDs to Buffers.
+ * @ingroup snippets
+ */
+
+class InitBuffersDefault : public Pass {
+public:
+    OPENVINO_RTTI("InitBuffersDefault", "Pass")
+
+    InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
+        m_buffer_scratchpad_size = 0;  // the referenced size is reset as soon as the pass object is constructed
+    }
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
+    bool run(lowered::LinearIR& linear_ir) override;
+
+private:
+    size_t& m_buffer_scratchpad_size;  // refers to the variable passed to the constructor
+};
+
+} // namespace pass
+} // namespace lowered
+} // namespace snippets
+} // namespace ov
diff --git a/src/common/snippets/include/snippets/lowered/pass/init_loops.hpp b/src/common/snippets/include/snippets/lowered/pass/init_loops.hpp
@@ -15,7 +15,7 @@ namespace pass {
 
 /**
  * @interface InitLoops
- * @brief The pass initialize scheduling information in LoopInfo
+ * @brief The pass initializes scheduling information in LoopInfo
  * @ingroup snippets
  */
 class InitLoops : public Pass {