Skip to content

Commit

Permalink
Applied Ivan comments
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Apr 5, 2023
1 parent c0875bb commit 2a04469
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 82 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2022 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -7,22 +7,39 @@
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

#include "snippets/snippets_isa.hpp"

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface BufferIdentification
 * @brief The pass sets identifiers for Buffers in a common Buffer system.
 *        Buffers with the same identifier share the same data register.
 *        The pass uses a greedy graph-coloring algorithm on an adjacency matrix:
 *        - Buffers are the vertices of the graph
 *        - Loops, Brgemm (and similar ops) are "edges" between Buffers (hubs of edges).
 *          The vertices (Buffers) are adjacent if they are connected to the same Loop and
 *          their data pointers cannot be proportionally incremented in Loops:
 *          different ptr increments or data sizes.
 *        - Firstly, create the adjacency matrix using the definition above
 *        - Secondly, color the graph vertices (Buffers) using the adjacency matrix
 * Note: should be called before the ResetBuffer() pass to have correct offsets
 * @ingroup snippets
 */
class BufferIdentification: public ngraph::pass::FunctionPass {
public:
    OPENVINO_RTTI("BufferIdentification", "0");
    BufferIdentification() = default;

    // Assigns an identifier to every intermediate-memory Buffer in the model.
    // Returns true if the model was changed.
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;

private:
    using BufferSet = std::vector<std::shared_ptr<snippets::op::Buffer>>;

    // Builds a size*size boolean adjacency matrix over `buffers` (see @brief for the
    // adjacency definition). The diagonal is expected to be set by the implementation.
    std::vector<bool> create_adjacency_matrix(const BufferSet& buffers);
    // Greedy coloring over the adjacency matrix: returns a map color -> group of
    // Buffers that may share one data register. Consumes `buffers` (entries are
    // nulled as they are colored).
    std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
};

} // namespace pass
Expand Down
1 change: 0 additions & 1 deletion src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include "snippets/op/subgraph.hpp"
#include "snippets/op/kernel.hpp"
#include <snippets/itt.hpp>
#include <snippets/snippets_isa.hpp>

#include <ngraph/pass/manager.hpp>
#include <openvino/core/type.hpp>
Expand Down
34 changes: 15 additions & 19 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,19 @@ void snippets::op::Subgraph::init_config() {
auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t {
// The count of potential unique Buffers - it's hidden virtual ports as well
// We should go through Subgraph and calculate potential non-inplace Buffers count.
// These Buffers can be only around Loops (for example, around MatMul they may be inplace). So we should
// check for element type size of nodes which are used Buffer to get rating from above for uniqe Buffer count.
// These Buffers can be only around Loops (for example, around MatMul without blocking (Loops around) they may be inplace).
// So we should check for element type size of nodes which use Buffer to get an upper-bound estimate of the unique Buffer count.
// The count is estimated because when we calculate this number we have only the original graph representation
// and we can only predict where the Loops will be.
// Note: The ops that create Buffers: MatMul, Transpose and Softmax (always FP32)
std::vector<size_t> used_precision_size;

auto push_prc_size = [&used_precision_size](size_t precision_size) {
if (used_precision_size.empty() || used_precision_size.back() != precision_size) {
used_precision_size.push_back(precision_size);
}
};

for (const auto& op : ops) {
if (const auto transpose = ov::as_type_ptr<ov::op::v1::Transpose>(op)) {
// At the moment Transposes are supported only on Results and Parameters but
Expand All @@ -96,34 +103,23 @@ auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& op
}) ||
!ov::is_type<ov::op::v0::Parameter>(transpose->get_input_node_shared_ptr(0));
if (are_prev_or_next_ops) {
const auto prc_size = transpose->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(transpose->get_element_type().size());
}
} else if (ov::is_type<ov::op::v1::Softmax>(op) || ov::is_type<ov::op::v8::Softmax>(op)) {
// Softmax always uses 2 FP32 Buffers
const auto prc_size = ov::element::f32.size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
// Softmax always uses 2 FP32 Buffers after decomposition.
// They are inplace and the same so we can push precision size only once
push_prc_size(ov::element::f32.size());
} else if (const auto matmul = ov::as_type_ptr<ov::op::v0::MatMul>(op)) {
// First input check is enough because MatMul requires the same prc size on inputs
if (!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(0)) ||
!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(1))) {
const auto prc_size = matmul->get_input_element_type(0).size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_input_element_type(0).size());
}

const auto consumers = matmul->get_output_target_inputs(0);
if (std::none_of(consumers.begin(), consumers.end(),
[](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v0::Result>(in.get_node()); })) {
const auto prc_size = matmul->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_element_type().size());
}
}
}
Expand Down
14 changes: 7 additions & 7 deletions src/common/snippets/src/pass/assign_registers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::map<Tensor, Reg> manually_assigned_gprs, manually_assigned_vecs;
manual_assigning(f, ops, manually_assigned_gprs, manually_assigned_vecs);

const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX;
auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr<ov::Node>& op,
const auto IS_MANUALLY_ASSIGNED_REG = SIZE_MAX;
auto enumerate_out_tensors = [IS_MANUALLY_ASSIGNED_REG] (const std::shared_ptr<ov::Node>& op,
decltype(regs_vec)& reg_map,
const std::map<Tensor, Reg>& manually_assigned_regs,
size_t& counter) {
Expand All @@ -109,7 +109,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
// Note that some ops might have identical input&output tensors (Result and Tile* for ex.)
// so we have to check that the tensor has not been enumerated already
if (reg_map.count(t) == 0) {
reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG;
reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ASSIGNED_REG;
}
}
};
Expand All @@ -131,13 +131,13 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::vector<std::set<Reg>> used_vec(ops.size(), std::set<Reg>());
std::vector<std::set<Reg>> defined_vec(ops.size(), std::set<Reg>());

auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector<Tensor>& tensors, const std::map<Tensor, Reg>& reg_map) {
auto tensor2reg = [IS_MANUALLY_ASSIGNED_REG] (const std::vector<Tensor>& tensors, const std::map<Tensor, Reg>& reg_map) {
std::set<Reg> result;
for (const auto& t : tensors) {
if (reg_map.count(t) == 0)
throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor");
Reg reg_id = reg_map.at(t);
if (reg_id != IS_MANUALLY_ALLOCATED_REG)
if (reg_id != IS_MANUALLY_ASSIGNED_REG)
result.insert(reg_id);
}
return result;
Expand Down Expand Up @@ -298,10 +298,10 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr

std::map<Tensor, Reg> assigned_regs(std::move(manually_assigned_gprs));
assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end());
auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map<Tensor, Reg>& unique_regs,
auto register_assigned_regs = [IS_MANUALLY_ASSIGNED_REG, &assigned_regs](const std::map<Tensor, Reg>& unique_regs,
const std::map<Reg, Reg>& unique2reused) {
for (const auto& reg : unique_regs) {
if (reg.second == IS_MANUALLY_ALLOCATED_REG)
if (reg.second == IS_MANUALLY_ASSIGNED_REG)
continue;
if (unique2reused.count(reg.second) == 0)
throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor");
Expand Down
56 changes: 38 additions & 18 deletions src/common/snippets/src/pass/buffer_identification.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2022 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -13,24 +13,24 @@ namespace snippets {
namespace pass {

namespace {
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;

auto is_intermediate_buffer(const std::shared_ptr<ov::Node>& op) -> std::shared_ptr<op::Buffer> {
std::shared_ptr<op::Buffer> is_intermediate_buffer(const std::shared_ptr<ov::Node>& op) {
const auto buffer = ov::as_type_ptr<op::Buffer>(op);
return buffer && buffer->is_intermediate_memory() ? buffer : nullptr;
}


inline size_t index(size_t size, size_t row, size_t col) {
return col + row * size;
}
} // namespace

auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
std::vector<bool> BufferIdentification::create_adjacency_matrix(const BufferIdentification::BufferSet& buffers) {
// The sync point to check for adjacency is the Loop because pointers are incremented only in Loops.
// So if some Buffers in the one Loop have conflict (cannot be inplace: the same ptr increment and finalization offset)
// they are called as adjacent
const auto size = buffers.size();
std::vector<bool> adj(size * size, false);
for (size_t i = 0; i < size; ++i)
adj[i + i * size] = true;
adj[index(size, i, i)] = true;

auto update_adj_matrix = [&](const std::shared_ptr<op::Buffer>& buffer, size_t buffer_index,
const std::shared_ptr<op::Buffer>& neighbour_buffer) {
Expand All @@ -41,19 +41,19 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
NGRAPH_CHECK(iter != buffers.cend(), "Buffer wasn't find in Buffer system of Subgraph");

const size_t adj_idx = std::distance(buffers.cbegin(), iter);
adj[buffer_index + adj_idx * size] = true;
adj[index(size, adj_idx, buffer_index)] = adj[index(size, buffer_index, adj_idx)] = true;
}
}
};

for (size_t i = 0; i < buffers.size(); ++i) {
const auto buffer = buffers[i];
const auto& buffer = buffers[i];

auto port = buffer->input_value(0).get_index();
auto parent = buffer->get_input_node_shared_ptr(0);
// We iterate in While cycle to check nested Loops
while (const auto loop_end = ov::as_type_ptr<op::LoopEnd>(parent)) {
const auto loop_begin = loop_end->get_loop_begin();
const auto& loop_begin = loop_end->get_loop_begin();
for (const auto& input_value : loop_begin->input_values()) {
auto loop_in = input_value.get_node_shared_ptr();
auto port_idx = input_value.get_index();
Expand All @@ -69,7 +69,7 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
}
for (const auto& output : loop_end->outputs()) {
// check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op
const auto target_inputs = output.get_target_inputs();
const auto& target_inputs = output.get_target_inputs();
auto consumer_in = *target_inputs.begin();
auto port_idx = consumer_in.get_index();
auto consumer = consumer_in.get_node()->shared_from_this();
Expand All @@ -95,21 +95,35 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
return adj;
}

auto coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferSet> {
std::map<size_t, BufferIdentification::BufferSet> BufferIdentification::coloring(BufferIdentification::BufferSet& buffers, std::vector<bool>& adj) {
size_t color = 0;
std::map<size_t, BufferSet> color_groups;
std::map<size_t, BufferIdentification::BufferSet> color_groups;
const auto size = buffers.size();
// If the count of adjacent Buffers is equal to the count of all Buffers,
// it means that no Buffers are adjacent to each other (they just have loops)
if (static_cast<size_t>(std::count(adj.begin(), adj.end(), true)) == size) {
color_groups[color] = buffers;
return color_groups;
}

for (size_t i = 0; i < size; i++) {
// The Buffer is already colored (visited) - skip
if (!buffers[i])
continue;

auto buffer = buffers[i];
const auto& buffer = buffers[i];
color_groups[color].push_back(buffer); // Add to Color Group
buffers[i] = nullptr; // Remove from graph vertices

// while Buffer i has not coloured non-neighbours
// (row i contains 0)
while (!std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and<bool>())) {
// Find first non-adjacent and non-visited (non-colored) Buffer to color him to the same color
// NOTE: At the moment Snippets don't guarantee that the Buffer pointer won't be reset after Loop execution.
// So we cannot reuse a Buffer pointer a second time and don't allow the following case:
// Buffer[0] -> ... -> Buffer[1] -> ... -> Buffer[0]
// To cover this case, we force a break when we find the first adjacent not-visited `vertex`
// Notice, this case will be supported in new infrastructure with Linear IR
size_t j = i + 1;
bool force_break = false;
for (; j < size; ++j) {
Expand All @@ -121,12 +135,19 @@ auto coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, Bu
break;
}

// If we have to make force break or we don't have the corresponding non-adjacent and non-colored Buffers,
// we should make break - all potential Buffers for the current color are already colored
if (force_break || j == size)
break;

auto neighbour_buffer = buffers[j];
const auto& neighbour_buffer = buffers[j];
color_groups[color].push_back(neighbour_buffer); // Add to Color Group
buffers[j] = nullptr; // Remove from graph vertices
// Unite adjacency links:
// All the neighbors of Buffer `j` are added to the neighbors of Buffer `i` (the `vertices` are pulled together).
// The result is an updated i-th row of the adjacency matrix,
// in which 0 are only in columns with `vertex` numbers that are not adjacent to either the i-th or j-th `vertices`.
// Mathematically, this can be replaced by the operation of OR of Boolean vectors representing strings i and j.
std::transform(adj.begin() + i * size, adj.begin() + (i + 1) * size, adj.begin() + j * size,
adj.begin() + i * size, std::logical_or<bool>());
}
Expand All @@ -142,7 +163,7 @@ bool BufferIdentification::run_on_model(const std::shared_ptr<ov::Model>& model)
// Unite Buffers using Graph coloring algorithm.
// Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case
// so these Buffers are always IntermediateBuffer nonadjacent
BufferSet buffers;
BufferIdentification::BufferSet buffers;

const auto ops = model->get_ordered_ops();
for (const auto& op : ops) {
Expand All @@ -157,10 +178,9 @@ bool BufferIdentification::run_on_model(const std::shared_ptr<ov::Model>& model)
// Graph coloring algorithm
const auto color_groups = coloring(buffers, adj);

// FIXME: use const auto& [color, united_buffers] when C++17 is available
for (const auto& pair : color_groups) {
const auto color = pair.first;
const auto united_buffers = pair.second;
const auto& united_buffers = pair.second;
for (const auto& buffer : united_buffers) {
buffer->set_id(color);
}
Expand Down
11 changes: 7 additions & 4 deletions src/common/snippets/src/pass/collapse_subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,15 +529,18 @@ TokenizeSnippets::TokenizeSnippets() {
ResultVector body_results;
std::vector<std::set<Input<Node>>> subgraph_result_inputs;

ov::NodeVector new_body_ops;
ov::NodeVector ops_for_buffer_count;
for (auto subgraph : input_subgraphs) {
// we should sum up the additional needed data count (non-scalar Constants and Buffers) from all input subgraphs
// because we will collapse them with our node and we should get total count
const auto subgraph_ptr = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph);
hidden_data_count += subgraph_ptr->get_virtual_port_count();
// Buffers can exist only in Subgraphs with domain-sensitive ops which
// require intermediate memory for data repacking.
// To avoid load time regressions, we verify only those Subgraphs with domain-sensitive ops
if (subgraph_ptr->has_domain_sensitive_ops()) {
const auto ops = subgraph_ptr->body_ptr()->get_ordered_ops();
new_body_ops.insert(new_body_ops.end(), ops.begin(), ops.end());
ops_for_buffer_count.insert(ops_for_buffer_count.end(), ops.begin(), ops.end());
}

for (auto output : subgraph->outputs()) {
Expand Down Expand Up @@ -570,7 +573,7 @@ TokenizeSnippets::TokenizeSnippets() {
}

if (op::Subgraph::is_domain_sensitive_op(node)) {
new_body_ops.push_back(node);
ops_for_buffer_count.push_back(node);
}

for (auto output : node->outputs()) {
Expand All @@ -583,7 +586,7 @@ TokenizeSnippets::TokenizeSnippets() {
}

// todo: move this plugin-specific constraint to the plugin callback
const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(new_body_ops);
const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(ops_for_buffer_count);
if (body_parameters.size() + body_results.size() + hidden_data_count + unique_buffer_count > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
Expand Down
Loading

0 comments on commit 2a04469

Please sign in to comment.