Skip to content

Commit

Permalink
[Snippets] Refactored work with Buffers (openvinotoolkit#19644)
Browse files Browse the repository at this point in the history
[Snippets] BufferManager is not derived from PassPipeline now

[Snippets] Added MemorySolver support

[Snippets] Made as static class

[Snippets] Added one-level inplace support

[Snippets] Added optimization bits

[Snippets] Small cosmetic fixes

[Snippets] Renamed to BufferSolver

[Snippets] Refactored

[Snippets] Fixed IdendifyBuffers

[Snippets] Add inplace multi + identify buffers

[Snippets] Made common pass

[Snippets] Added PassPipeline::get_pass<>()

[Snippets] Added comments, briefs, refactored smth

[Snippets] Fixed win build

[Snippets] Not allow to have the same Buffer ID for multi level Buffers

[Snippets] Moved CleanupRepeatedPtrShifts to common pioeline

[Snippets] Made IdentifyBuffers::ShiftPtrParams

[Snippets] Fixed window sliding mode

[Snippets] Refactored nested clusters

[Snippets] Adde normalized buffer regs

[Snippets] Not allowed to have the same ID for nested Buffers in IdentifyBuffers

[Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find

[Snippets] Removed useless method from InitLoops

[Snippets] Fixed CC build

[Snippets] Applied Ivan comments

[Snippets] Applied Ivan comment: refactored pass classes

[Snippets] Applied Vladislav comments

[Snippets] Applied Ivan comments 2

[Runtime] Moved MemorySolver to API2.0

[Snippets] Created common buffer allocation pass AllocateBuffers

[Snippets][Tests] Added InplaceEltwise unit test

[Snippets] fixed NormalizeBufferIDs

[Snippets][CPU] Fixed BrgemmBlocking lowered pass: move wsp for AMX to brgemm

[Snippets][CPU][Tests] Covered AMX MHA buffer allocation by unit tests
  • Loading branch information
a-sidorova authored Nov 30, 2023
1 parent 6ab5ef7 commit df03b04
Show file tree
Hide file tree
Showing 29 changed files with 1,689 additions and 267 deletions.
3 changes: 3 additions & 0 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class Config {
// Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
// Set by a backend, should be large enough to compensate for the kernel call overheads
size_t m_min_kernel_work_amount = 256;
// True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
// False if all Buffers will have unique IDs and offsets in the Linear IR
bool m_are_buffers_optimized = true;
};

/* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#pragma once

#include "pass.hpp"
#include "snippets/snippets_isa.hpp"

namespace ov {
namespace snippets {
Expand All @@ -14,26 +13,40 @@ namespace pass {

/**
* @interface AllocateBuffers
* @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
* Notes:
* - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
* The pass don't allocate new memory for InPlace Buffers, we propagate the same offsets for them.
* - The pass should be splitted into two passes: ProcessInplace (markup of Buffers which can use the same memory)
* and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
* @brief The pass allocates common memory for all Buffers.
* There are two modes: default and optimized allocation. Default allocation (non-optimized) mode sets unique offsets and ID to Buffers.
* Optimized mode allocates memory for Buffer ops using the following optimizations:
* - MemorySolver: helps to solve issue of optimal memory allocation;
* - InPlace: Loop or MemoryAccess ops read from the memory and store data to the same memory if possible
* - Reusing Buffer IDs: Buffers have the same IDs (gpr) in cases when Buffers aren't connected or have the same data ptr shifts
* Note: All buffers are related to each other and represent common buffer scratchpad of Subgraph.
* The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
* @ingroup snippets
*/

class AllocateBuffers : public Pass {
class AllocateBuffers: public Pass {
public:
OPENVINO_RTTI("AllocateBuffers", "Pass")
bool run(lowered::LinearIR& linear_ir) override;

size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }

AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);

/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;

/**
* @brief Sets the offset of the Buffer op and propagates it to the connected memory access ops
* @param buffer_expr expression with Buffer op
* @param offset offset in common buffer scratchpad
*/
static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);

using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;
private:
static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);

size_t m_buffer_scratchpad_size = 0;
size_t& m_buffer_scratchpad_size;
bool m_is_optimized_mode = true;
};

} // namespace pass
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

#include "allocate_buffers.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface DefineBufferClusters
* @brief The pass defines buffer clusters. The buffers from one cluster share the
* same memory (have the same offset relative to the data pointer of buffer scratchpad).
* - If a MemoryAccess op or Loop can read from and write to the same memory (inplace behavior), the Buffers should be in one cluster.
* - If a Buffer is inside a Loop which reads from or writes to other Buffers, this Buffer can emulate `window` sliding.
* It means that the inner Buffer can reuse the memory of the outer Buffers within the bounds of the full Loop work.
* Demonstration:
* |-----------------------------------------------------|
* | |------------| |------------| | InnerLoops have work amount 128
* Buffer0 [3x128]-> | | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128] OuterLoop has work amount 3
* | |------------| OuterLoop |------------| |
* |-----------------------------------------------------|
* Buffer1 can reuse memory [128] of Buffer0 or Buffer2 in each iteration of OuterLoop
* Note: The pass requires expression enumeration and buffer identification (for nested Buffers inplace).
* These passes should be executed separately before this pass!
* @ingroup snippets
*/
class DefineBufferClusters : public Pass {
public:
    OPENVINO_RTTI("DefineBufferClusters", "Pass")

    /**
     * @brief Constructs the pass over an externally owned set of Buffer clusters
     * @param clusters reference to the cluster storage; only the reference is kept (as `m_clusters`),
     *        so the storage must outlive the pass
     */
    explicit DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}

    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;

private:
    using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
    /**
     * @brief Finds the Buffer cluster in the set of clusters which contains the target expression with Buffer
     * @param target target expression with Buffer op
     * @return vector iterator which refers to the found cluster
     */
    AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
    /**
     * @brief Returns True if Buffer is a direct source for the target expr (there are no other Loops between the Buffer and target expr)
     * @param buffer_expr expression with assumed Buffer op
     * @param target_expr expression with target op - LoopEnd or MemoryAccess op
     * @return boolean value
     */
    bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
    /**
     * @brief Creates a new buffer cluster if buffer_expr is missing from clusters. If buffer_expr is already in clusters, does nothing
     * @param buffer_expr expression with Buffer op
     */
    void create_new_cluster(const ExpressionPtr& buffer_expr);
    /**
     * @brief Returns the common ID of the cluster if all Buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX
     *        which means that Buffers in the cluster have different IDs.
     * @param cluster set of Buffer expressions - cluster
     * @return common buffer ID or SIZE_MAX - size value
     */
    size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;

    /**
     * @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory.
     * @param expr_it iterator of Linear IR which refers to the expression with LoopEnd
     */
    void parse_loop(const LinearIR::constExprIt& expr_it);
    /**
     * @brief Analyzes full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
     * @param expr expression with full MemoryAccess op
     */
    void parse_memory_access_op(const ExpressionPtr& expr);
    /**
     * @brief Gets input Buffers of Loop
     * @param loop_expr expression with LoopEnd op
     * @return unordered map [Expression -> set of input ports] which represents input Buffers of Loop
     */
    BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
    /**
     * @brief Gets output Buffers of Loop
     * @param loop_expr expression with LoopEnd op
     * @return unordered map [Expression -> set of output ports] which represents output Buffers of Loop
     */
    BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
    /**
     * @brief Analyzes nested Loops: unites nested buffer clusters if they can reproduce `window` sliding
     * @param input_buffers unordered map [Expression -> set of input ports] which represents input Buffers of Loop
     * @param output_buffers unordered map [Expression -> set of output ports (one)] which represents output Buffers of Loop
     * @param outer_loop_end_expr_it iterator of Linear IR which refers to the expression with outer LoopEnd
     */
    void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
    /**
     * @brief Finds the last Loop connected to the target Buffer and returns the corresponding finalization offset
     * @param buffer_expr expression with Buffer op
     * @return finalization offset - int64_t value
     */
    int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
    /**
     * @brief Checks if two Buffer expressions are connected to the same Loop. Sets the common LoopEnd as `loop` parameter and
     *        indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours
     * @param up expression with upper Buffer op
     * @param down expression with lower Buffer op
     * @param loop expression with common LoopEnd op
     * @param up_idx the reference to port index of upper Buffer op to the Loop
     * @param down_idx the reference to port index of lower Buffer op to the Loop
     * @return Return True if the Buffers are connected to the same Loop
     */
    static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
    /**
     * @brief Unites clusters
     * @param inner_cluster_it iterator to the inner cluster - the buffer cluster which is in the Loop
     * @param outer_cluster buffer cluster with Buffers outside the Loop
     * @param outer_buffer target Buffer from outer_cluster
     * @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers
     * @return Return True if clusters have been united
     */
    bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
                               const ExpressionPtr& outer_buffer, bool is_outer_up);

    // Cluster storage owned by the caller; the pass holds only a reference to it
    AllocateBuffers::BufferClusters& m_clusters;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface EnumerateExpressions
* @brief The pass enumerates expression by execution order
* @ingroup snippets
*/
class EnumerateExpressions : public Pass {
public:
    OPENVINO_RTTI("EnumerateExpressions", "Pass")

    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

#include "pass.hpp"

#include "snippets/op/buffer.hpp"

namespace ov {
namespace snippets {
namespace lowered {
Expand All @@ -22,7 +20,8 @@ namespace pass {
* - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges).
* The buffers are connected to the same Loop - are adjacent in graph sense bounds.
* - The vertices (buffers) are adjacent if they are connected to the same Loop and
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
* or one of the Buffers is in a Loop but the other Buffer is not;
* - Firstly, create adjacency matrix using the definition above;
* - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise.
* Note: should be called before ResetBuffer() pass to have correct offsets
Expand All @@ -33,13 +32,79 @@ class IdentifyBuffers: public Pass {
OPENVINO_RTTI("IdentifyBuffers", "Pass")
IdentifyBuffers() = default;

/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;

/**
 * @brief Data pointer shift params of a Buffer: < data_size, ptr_increment, finalization_offset >.
 *        All values are zero unless passed explicitly to the constructor.
 */
struct ShiftPtrParams {
    int64_t data_size = 0;
    int64_t ptr_increment = 0;
    int64_t finalization_offset = 0;

    ShiftPtrParams() = default;
    ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo)
        : data_size{ds},
          ptr_increment{pi},
          finalization_offset{fo} {}

    // Comparison operators are only declared here; their definitions live outside this struct
    friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
    friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
};

/**
* @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
* @param lhs Data pointer shift params for first Buffer
* @param rhs Data pointer shift params for second Buffer
* @return Returns True if params are valid for reusing. Otherwise returns False
*/
static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);

private:
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
using BufferPool = std::vector<ExpressionPtr>;

std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
/**
* @brief Get Buffer Index in Buffer set
* @param target the target Buffer expression
* @param pool set of Buffers from the Linear IR
* @return index of target Buffer expression in set
*/
static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
/**
* @brief Create adjacency matrix for Buffer system. See comment in the method for more details.
* @param linear_ir the target Linear IR
* @param pool set of Buffers from the Linear IR
* @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
*/
static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
/**
* @brief Algorithm of Graph coloring where vertices are Buffers
* @param buffers set of Buffers from the Linear IR
* @param adj adjacency matrix
* @return map [color id -> Buffer set]
*/
static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
/**
* @brief Update the adjacency matrix:
* - If Buffers are from the same Loops and connected to the same Loop and
* their ptr shift params for this Loop are not proportional, the Buffers are adjacent - set value True in the matrix;
* - If one of the Buffers is inside a Loop while the other Buffer is connected to this Loop and has non-zero data shift params,
* the Buffers are adjacent - set value True in the matrix;
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
* @param buffers set of Buffers from the Linear IR
* @param adj Target adjacency matrix
*/
static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
const BufferPool& buffers,
std::vector<bool>& adj);
/**
* @brief Check if two Buffers are adjacent and cannot have the same ID
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
* @return Returns True if they are adjacent, otherwise returns False
*/
static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
};

} // namespace pass
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
* @interface InitBuffersDefault
* @brief The pass initializes Buffer expressions in LinearIR in the default (non-optimized) mode: sets unique offsets and IDs to Buffers.
* @ingroup snippets
*/

class InitBuffersDefault : public Pass {
public:
    OPENVINO_RTTI("InitBuffersDefault", "Pass")

    /**
     * @brief Constructs the pass and resets the referenced scratchpad size to zero
     * @param buffer_scratchpad_size reference to the externally owned scratchpad size; only the reference is kept,
     *        so the variable must outlive the pass.
     *        NOTE(review): presumably run() writes the computed size through this reference - confirm in the implementation
     */
    explicit InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
        // The caller's value is overwritten already at construction time, before run() is called
        m_buffer_scratchpad_size = 0;
    }
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;

private:
    // Scratchpad size owned by the caller; the pass holds only a reference to it
    size_t& m_buffer_scratchpad_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace pass {

/**
* @interface InitLoops
* @brief The pass initialize scheduling information in LoopInfo
* @brief The pass initializes scheduling information in LoopInfo
* @ingroup snippets
*/
class InitLoops : public Pass {
Expand Down
Loading

0 comments on commit df03b04

Please sign in to comment.