Skip to content

Commit

Permalink
Applied Ivan comments
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Apr 5, 2023
1 parent c0875bb commit 2a04469
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 82 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2022 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -7,22 +7,39 @@
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

#include "snippets/snippets_isa.hpp"

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface BufferIdentification
 * @brief The pass sets identifiers for Buffers in a common Buffer system.
 *        Buffers with the same identifier share the same data register.
 *        The pass uses a greedy graph-coloring algorithm on an adjacency matrix:
 *        - Buffers are the vertices of the graph
 *        - Loops, Brgemm (and similar ops) are "edges" between Buffers (hubs of edges).
 *          The vertices (Buffers) are adjacent if they are connected to the same Loop and
 *          their data pointers cannot be proportionally incremented in Loops:
 *          different ptr increments or data sizes.
 *        - Firstly, create the adjacency matrix using the definition above
 *        - Secondly, color the graph vertices (Buffers) using the adjacency matrix
 * Note: should be called before the ResetBuffer() pass to have correct offsets
 * @ingroup snippets
 */
class BufferIdentification: public ngraph::pass::FunctionPass {
public:
    OPENVINO_RTTI("BufferIdentification", "0");
    BufferIdentification() = default;

    // Assigns an identifier to every intermediate-memory Buffer in the model.
    // Returns true if the model was changed.
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;

private:
    using BufferSet = std::vector<std::shared_ptr<snippets::op::Buffer>>;

    // Builds a size*size boolean adjacency matrix over `buffers` (see @brief for the
    // adjacency definition). The diagonal is expected to be set by the implementation.
    std::vector<bool> create_adjacency_matrix(const BufferSet& buffers);
    // Greedy coloring over the adjacency matrix: returns a map color -> group of
    // Buffers that may share one data register. Consumes `buffers` (entries are
    // nulled as they are colored).
    std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
};

} // namespace pass
Expand Down
1 change: 0 additions & 1 deletion src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include "snippets/op/subgraph.hpp"
#include "snippets/op/kernel.hpp"
#include <snippets/itt.hpp>
#include <snippets/snippets_isa.hpp>

#include <ngraph/pass/manager.hpp>
#include <openvino/core/type.hpp>
Expand Down
34 changes: 15 additions & 19 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,19 @@ void snippets::op::Subgraph::init_config() {
auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t {
// The count of potential unique Buffers - it's hidden virtual ports as well
// We should go through Subgraph and calculate potential non-inplace Buffers count.
// These Buffers can be only around Loops (for example, around MatMul they may be inplace). So we should
// check for element type size of nodes which are used Buffer to get rating from above for uniqe Buffer count.
// These Buffers can be only around Loops (for example, around MatMul without blocking (Loops around) they may be inplace).
// So we should check for element type size of nodes which use Buffer to get an upper-bound estimate of the unique Buffer count.
// The count is estimated because when we calculate this number we have only the original graph representation
// and we can only predict where the Loops will be.
// Note: The ops that create Buffers: MatMul, Transpose and Softmax (always FP32)
std::vector<size_t> used_precision_size;

auto push_prc_size = [&used_precision_size](size_t precision_size) {
if (used_precision_size.empty() || used_precision_size.back() != precision_size) {
used_precision_size.push_back(precision_size);
}
};

for (const auto& op : ops) {
if (const auto transpose = ov::as_type_ptr<ov::op::v1::Transpose>(op)) {
// At the moment Transposes are supported only on Results and Parameters but
Expand All @@ -96,34 +103,23 @@ auto snippets::op::Subgraph::get_estimated_buffer_count(const ov::NodeVector& op
}) ||
!ov::is_type<ov::op::v0::Parameter>(transpose->get_input_node_shared_ptr(0));
if (are_prev_or_next_ops) {
const auto prc_size = transpose->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(transpose->get_element_type().size());
}
} else if (ov::is_type<ov::op::v1::Softmax>(op) || ov::is_type<ov::op::v8::Softmax>(op)) {
// Softmax always uses 2 FP32 Buffers
const auto prc_size = ov::element::f32.size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
// Softmax always uses 2 FP32 Buffers after decomposition.
// They are inplace and the same so we can push precision size only once
push_prc_size(ov::element::f32.size());
} else if (const auto matmul = ov::as_type_ptr<ov::op::v0::MatMul>(op)) {
// First input check is enough because MatMul requires the same prc size on inputs
if (!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(0)) ||
!ov::is_type<ov::op::v0::Parameter>(matmul->get_input_node_shared_ptr(1))) {
const auto prc_size = matmul->get_input_element_type(0).size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_input_element_type(0).size());
}

const auto consumers = matmul->get_output_target_inputs(0);
if (std::none_of(consumers.begin(), consumers.end(),
[](const ov::Input<ov::Node>& in) { return ov::is_type<ov::op::v0::Result>(in.get_node()); })) {
const auto prc_size = matmul->get_element_type().size();
if (used_precision_size.empty() || used_precision_size.back() != prc_size) {
used_precision_size.push_back(prc_size);
}
push_prc_size(matmul->get_element_type().size());
}
}
}
Expand Down
14 changes: 7 additions & 7 deletions src/common/snippets/src/pass/assign_registers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::map<Tensor, Reg> manually_assigned_gprs, manually_assigned_vecs;
manual_assigning(f, ops, manually_assigned_gprs, manually_assigned_vecs);

const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX;
auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr<ov::Node>& op,
const auto IS_MANUALLY_ASSIGNED_REG = SIZE_MAX;
auto enumerate_out_tensors = [IS_MANUALLY_ASSIGNED_REG] (const std::shared_ptr<ov::Node>& op,
decltype(regs_vec)& reg_map,
const std::map<Tensor, Reg>& manually_assigned_regs,
size_t& counter) {
Expand All @@ -109,7 +109,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
// Note that some ops might have identical input&output tensors (Result and Tile* for ex.)
// so we have to check that the tensor has not been enumerated already
if (reg_map.count(t) == 0) {
reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG;
reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ASSIGNED_REG;
}
}
};
Expand All @@ -131,13 +131,13 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
std::vector<std::set<Reg>> used_vec(ops.size(), std::set<Reg>());
std::vector<std::set<Reg>> defined_vec(ops.size(), std::set<Reg>());

auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector<Tensor>& tensors, const std::map<Tensor, Reg>& reg_map) {
auto tensor2reg = [IS_MANUALLY_ASSIGNED_REG] (const std::vector<Tensor>& tensors, const std::map<Tensor, Reg>& reg_map) {
std::set<Reg> result;
for (const auto& t : tensors) {
if (reg_map.count(t) == 0)
throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor");
Reg reg_id = reg_map.at(t);
if (reg_id != IS_MANUALLY_ALLOCATED_REG)
if (reg_id != IS_MANUALLY_ASSIGNED_REG)
result.insert(reg_id);
}
return result;
Expand Down Expand Up @@ -298,10 +298,10 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr

std::map<Tensor, Reg> assigned_regs(std::move(manually_assigned_gprs));
assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end());
auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map<Tensor, Reg>& unique_regs,
auto register_assigned_regs = [IS_MANUALLY_ASSIGNED_REG, &assigned_regs](const std::map<Tensor, Reg>& unique_regs,
const std::map<Reg, Reg>& unique2reused) {
for (const auto& reg : unique_regs) {
if (reg.second == IS_MANUALLY_ALLOCATED_REG)
if (reg.second == IS_MANUALLY_ASSIGNED_REG)
continue;
if (unique2reused.count(reg.second) == 0)
throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor");
Expand Down
56 changes: 38 additions & 18 deletions src/common/snippets/src/pass/buffer_identification.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2022 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -13,24 +13,24 @@ namespace snippets {
namespace pass {

namespace {
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;

auto is_intermediate_buffer(const std::shared_ptr<ov::Node>& op) -> std::shared_ptr<op::Buffer> {
std::shared_ptr<op::Buffer> is_intermediate_buffer(const std::shared_ptr<ov::Node>& op) {
const auto buffer = ov::as_type_ptr<op::Buffer>(op);
return buffer && buffer->is_intermediate_memory() ? buffer : nullptr;
}


inline size_t index(size_t size, size_t row, size_t col) {
return col + row * size;
}
} // namespace

auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
std::vector<bool> BufferIdentification::create_adjacency_matrix(const BufferIdentification::BufferSet& buffers) {
// The sync point to check for adjacency is the Loop because pointers are incremented only in Loops.
// So if some Buffers in the one Loop have conflict (cannot be inplace: the same ptr increment and finalization offset)
// they are called as adjacent
const auto size = buffers.size();
std::vector<bool> adj(size * size, false);
for (size_t i = 0; i < size; ++i)
adj[i + i * size] = true;
adj[index(size, i, i)] = true;

auto update_adj_matrix = [&](const std::shared_ptr<op::Buffer>& buffer, size_t buffer_index,
const std::shared_ptr<op::Buffer>& neighbour_buffer) {
Expand All @@ -41,19 +41,19 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
NGRAPH_CHECK(iter != buffers.cend(), "Buffer wasn't find in Buffer system of Subgraph");

const size_t adj_idx = std::distance(buffers.cbegin(), iter);
adj[buffer_index + adj_idx * size] = true;
adj[index(size, adj_idx, buffer_index)] = adj[index(size, buffer_index, adj_idx)] = true;
}
}
};

for (size_t i = 0; i < buffers.size(); ++i) {
const auto buffer = buffers[i];
const auto& buffer = buffers[i];

auto port = buffer->input_value(0).get_index();
auto parent = buffer->get_input_node_shared_ptr(0);
// We iterate in While cycle to check nested Loops
while (const auto loop_end = ov::as_type_ptr<op::LoopEnd>(parent)) {
const auto loop_begin = loop_end->get_loop_begin();
const auto& loop_begin = loop_end->get_loop_begin();
for (const auto& input_value : loop_begin->input_values()) {
auto loop_in = input_value.get_node_shared_ptr();
auto port_idx = input_value.get_index();
Expand All @@ -69,7 +69,7 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
}
for (const auto& output : loop_end->outputs()) {
// check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op
const auto target_inputs = output.get_target_inputs();
const auto& target_inputs = output.get_target_inputs();
auto consumer_in = *target_inputs.begin();
auto port_idx = consumer_in.get_index();
auto consumer = consumer_in.get_node()->shared_from_this();
Expand All @@ -95,21 +95,35 @@ auto create_adjacency_matrix(const BufferSet& buffers) -> std::vector<bool> {
return adj;
}

auto coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferSet> {
std::map<size_t, BufferIdentification::BufferSet> BufferIdentification::coloring(BufferIdentification::BufferSet& buffers, std::vector<bool>& adj) {
size_t color = 0;
std::map<size_t, BufferSet> color_groups;
std::map<size_t, BufferIdentification::BufferSet> color_groups;
const auto size = buffers.size();
// If the count of adjacent Buffers is equal to the count of all Buffers,
// it means that no Buffers are adjacent to each other (they just have loops)
if (static_cast<size_t>(std::count(adj.begin(), adj.end(), true)) == size) {
color_groups[color] = buffers;
return color_groups;
}

for (size_t i = 0; i < size; i++) {
// The Buffer is already colored (visited) - skip
if (!buffers[i])
continue;

auto buffer = buffers[i];
const auto& buffer = buffers[i];
color_groups[color].push_back(buffer); // Add to Color Group
buffers[i] = nullptr; // Remove from graph vertices

// while Buffer i has not coloured non-neighbours
// (row i contains 0)
while (!std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and<bool>())) {
// Find first non-adjacent and non-visited (non-colored) Buffer to color him to the same color
// NOTE: At the moment Snippets don't guarantee that the Buffer pointer won't be reset after Loop execution.
// So we cannot reuse a Buffer pointer a second time and don't allow the following case:
// Buffer[0] -> ... -> Buffer[1] -> ... -> Buffer[0]
// To cover this case, we force a break when we find the first adjacent not-visited `vertex`
// Notice, this case will be supported in new infrastructure with Linear IR
size_t j = i + 1;
bool force_break = false;
for (; j < size; ++j) {
Expand All @@ -121,12 +135,19 @@ auto coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, Bu
break;
}

// If we have to make force break or we don't have the corresponding non-adjacent and non-colored Buffers,
// we should make break - all potential Buffers for the current color are already colored
if (force_break || j == size)
break;

auto neighbour_buffer = buffers[j];
const auto& neighbour_buffer = buffers[j];
color_groups[color].push_back(neighbour_buffer); // Add to Color Group
buffers[j] = nullptr; // Remove from graph vertices
// Unite adjacency links:
// All the neighbors of Buffer `j` are added to the neighbors of Buffer `i` (the `vertices` are pulled together).
// The result is an updated i-th row of the adjacency matrix,
// in which 0 are only in columns with `vertex` numbers that are not adjacent to either the i-th or j-th `vertices`.
// Mathematically, this can be replaced by the operation of OR of Boolean vectors representing strings i and j.
std::transform(adj.begin() + i * size, adj.begin() + (i + 1) * size, adj.begin() + j * size,
adj.begin() + i * size, std::logical_or<bool>());
}
Expand All @@ -142,7 +163,7 @@ bool BufferIdentification::run_on_model(const std::shared_ptr<ov::Model>& model)
// Unite Buffers using Graph coloring algorithm.
// Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case
// so these Buffers are always IntermediateBuffer nonadjacent
BufferSet buffers;
BufferIdentification::BufferSet buffers;

const auto ops = model->get_ordered_ops();
for (const auto& op : ops) {
Expand All @@ -157,10 +178,9 @@ bool BufferIdentification::run_on_model(const std::shared_ptr<ov::Model>& model)
// Graph coloring algorithm
const auto color_groups = coloring(buffers, adj);

// FIXME: use const auto& [color, united_buffers] when C++17 is available
for (const auto& pair : color_groups) {
const auto color = pair.first;
const auto united_buffers = pair.second;
const auto& united_buffers = pair.second;
for (const auto& buffer : united_buffers) {
buffer->set_id(color);
}
Expand Down
11 changes: 7 additions & 4 deletions src/common/snippets/src/pass/collapse_subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,15 +529,18 @@ TokenizeSnippets::TokenizeSnippets() {
ResultVector body_results;
std::vector<std::set<Input<Node>>> subgraph_result_inputs;

ov::NodeVector new_body_ops;
ov::NodeVector ops_for_buffer_count;
for (auto subgraph : input_subgraphs) {
// we should sum up the additional needed data count (non-scalar Constants and Buffers) from all input subgraphs
// because we will collapse them with our node and we should get total count
const auto subgraph_ptr = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph);
hidden_data_count += subgraph_ptr->get_virtual_port_count();
// Buffers can exist only in Subgraphs with domain-sensitive ops which
// require intermediate memory for data repacking.
// To avoid load time regressions, we verify only those Subgraphs with domain-sensitive ops
if (subgraph_ptr->has_domain_sensitive_ops()) {
const auto ops = subgraph_ptr->body_ptr()->get_ordered_ops();
new_body_ops.insert(new_body_ops.end(), ops.begin(), ops.end());
ops_for_buffer_count.insert(ops_for_buffer_count.end(), ops.begin(), ops.end());
}

for (auto output : subgraph->outputs()) {
Expand Down Expand Up @@ -570,7 +573,7 @@ TokenizeSnippets::TokenizeSnippets() {
}

if (op::Subgraph::is_domain_sensitive_op(node)) {
new_body_ops.push_back(node);
ops_for_buffer_count.push_back(node);
}

for (auto output : node->outputs()) {
Expand All @@ -583,7 +586,7 @@ TokenizeSnippets::TokenizeSnippets() {
}

// todo: move this plugin-specific constraint to the plugin callback
const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(new_body_ops);
const auto unique_buffer_count = op::Subgraph::get_estimated_buffer_count(ops_for_buffer_count);
if (body_parameters.size() + body_results.size() + hidden_data_count + unique_buffer_count > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
Expand Down
Loading

0 comments on commit 2a04469

Please sign in to comment.