Skip to content

Commit

Permalink
Sns explicit tiles leftovers (openvinotoolkit#60)
Browse files Browse the repository at this point in the history
  • Loading branch information
IvanNovoselov authored Nov 30, 2022
1 parent 0075a5f commit a748dac
Show file tree
Hide file tree
Showing 13 changed files with 94 additions and 66 deletions.
43 changes: 28 additions & 15 deletions src/common/snippets/include/snippets/op/loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,69 @@ namespace op {

/**
* @interface LoopBase
* @brief Inserted during scheduling generation and represents Loop in affine notation
* @brief Base class for LoopBegin and LoopEnd
* @ingroup snippets
*/
class LoopBase : public ngraph::op::Op {
public:
OPENVINO_OP("LoopBase", "SnippetsOpset");
LoopBase(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment);
LoopBase(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment);
LoopBase() = delete;
bool visit_attributes(AttributeVisitor& visitor) override;
size_t get_work_amount() const;
size_t get_increment() const;
size_t get_dimension() const;
bool get_evaluate_once() const;

protected:
size_t dimension;
size_t work_amount;
size_t increment;
bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter
};
class LoopEnd;
/**
* @interface LoopBegin
* @brief Marks the start of the Loop region.
* Number of outputs always equals the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd)
* @param args - vector of input values, they are passed directly to output.
* @ingroup snippets
*/
class LoopBegin : public LoopBase {
friend LoopEnd;
public:
OPENVINO_OP("LoopBegin", "SnippetsOpset");
/// \brief Construct an Loop
/// \param region The vector of pairs: emitters and the corresponding registers
/// \param increment Loop size - count of elements to load and store.
/// Vector Loop should have size of vector register and Scalar Loop should have 1
/// \param num_inputs Count of inputs
/// \param num_outputs Count of outputs
/// \param io_dims Vector of last dimensions of inputs and outputs
/// \param io_data_sizes Vector of data type sizes of inputs and outputs
explicit LoopBegin(const std::vector<Output<Node>>& args);
explicit LoopBegin(const OutputVector& args);
LoopBegin() = delete;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
std::shared_ptr<LoopEnd> get_loop_end();
// begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters
const uint8_t* begin_address;
std::vector<size_t> input_regs;

private:
void validate_and_infer_types_except_LoopEnd();
LoopBegin(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment);
LoopBegin(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment);
};

/**
* @interface LoopEnd
* @brief Marks the end of the Loop region and defines the loop properties.
* Number of outputs always equals the number of inputs (bypassed values) - 1 (edge from the corresponding LoopBegin)
* @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output.
* @param work_amount total number of evaluations to be processed by the loop
* @param increment number of evaluations processed in one iteration of the loop.
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
* should be used when Loop is connected to Parameters and/or Results. If apply_increment[i] == true then i-th i/o data
* pointer will be incremented by work_amount*data_size on every iteration.
* @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to
* apply_increments, which enables more flexibility.
* @param finalization_offsets pointer increments that are applied to i/o pointers before exiting the loop
* @ingroup snippets
*/
class LoopEnd : public LoopBase {
public:
OPENVINO_OP("LoopEnd", "SnippetsOpset");
LoopEnd(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment,
LoopEnd(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment,
std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets);
LoopEnd() = delete;
std::shared_ptr<LoopBegin> get_loop_begin();
Expand Down
7 changes: 4 additions & 3 deletions src/common/snippets/include/snippets/op/memory_access.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ namespace op {

/**
* @interface MemoryAccess
* @brief This is an ubre
* where number of elements to store is determined by "count"
* Default value is "1" - to store one element
* @brief This is a base class for memory access operations (like Load and Store).
* It provides universal set/get interface to manipulate the number
* of elements accessed during one operation call ("count").
* Default "count" value is "1" - it means to load/store one element
* @ingroup snippets
*/

Expand Down
1 change: 0 additions & 1 deletion src/common/snippets/include/snippets/op/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ class Subgraph : public ngraph::op::Op {
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
ov::PartialShape get_master_shape();
std::vector<PartialShape> reshape_body(const std::vector<PartialShape>& input_shapes);
std::vector<Shape> reshape_body(const std::vector<Shape>& input_shapes);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,27 @@

#include "ngraph/op/op.hpp"
#include "ngraph/op/parameter.hpp"
#include "loop.hpp"
#include "snippets/op/loop.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/* ==== LoopBegin === */
/**
* @interface insertLoopBeginAfterOutputs
* @brief Inserts LoopBegin operation after the group of operations described
* by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface.
* @ingroup snippets
*/
std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& originalOutputs);

/**
* @interface insertLoopBegin
* @brief Inserts LoopBegin operation after the group of operations described
* by the input argument (ParameterVector, NodeVector or OutputVector).
* @ingroup snippets
*/
template<typename T>
std::shared_ptr<LoopBegin> insertLoopBegin(const T& afterTheseNodes) {
static_assert(std::is_same<T, ParameterVector>() || std::is_same<T, NodeVector>(),
Expand All @@ -37,12 +49,32 @@ inline std::shared_ptr<LoopBegin> insertLoopBegin(const OutputVector& afterThese
/* ============== */

/* ==== LoopEnd === */
/**
* @interface insertLoopEndBeforeInputs
* @brief Inserts LoopEnd operation before the group of operations described
* by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface.
* @param originalInputs LoopEnd will be inserted before these inputs
* @param loopBegin pointer to the beginning of the Loop region
* @param work_amount total number of evaluations to be processed by the loop
* @param increment number of evaluations processed in one iteration of the loop
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
* should be used when Loop is connected to Parameters and/or Results
* @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop
* @ingroup snippets
*/

std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
const std::shared_ptr<LoopBegin>& tileBegin,
size_t dimension, size_t work_amount, size_t increment,
const std::shared_ptr<LoopBegin>& loopBegin,
size_t work_amount, size_t increment,
std::vector<bool> apply_increment = {},
std::vector<int64_t> finalization_offsets = {});

/**
* @interface insertLoopEnd
* @brief Inserts LoopEnd operation before the group of operations described
* by the input argument (ResultVector, NodeVector or OutputVector).
* @ingroup snippets
*/
template<typename T, typename ...Args>
std::shared_ptr<LoopEnd> insertLoopEnd(const T& beforeTheseNodes, Args ...args) {
static_assert(std::is_same<T, ResultVector>() || std::is_same<T, NodeVector>(),
Expand Down
4 changes: 3 additions & 1 deletion src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
const auto& ops = m->get_ordered_ops();
for (auto op = ops.begin(); op < ops.end(); op++) {
const auto& loop_begin = ov::as_type_ptr<ngraph::snippets::op::LoopBegin>(*op);
// ignore outer loops and possible manual tail loops

// ignore outer loops and possible manual scalar loops
if (loop_begin && loop_begin->get_increment() != 1) {
OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop")
NodeVector vector_loop, tail_loop;
std::shared_ptr<op::LoopEnd> vector_loop_end, tail_loop_end;
vector_loop_end = loop_begin->get_loop_end();
Expand Down
24 changes: 9 additions & 15 deletions src/common/snippets/src/op/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,11 @@ namespace ngraph {
namespace snippets {
namespace op {

LoopBase::LoopBase(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment)
: Op(args), dimension(dimension), work_amount(work_amount), increment(increment), evaluate_once(false) {
LoopBase::LoopBase(const std::vector<Output<Node>> &args, size_t work_amount, size_t wa_increment)
: Op(args), work_amount(work_amount), increment(wa_increment), evaluate_once(false) {
}

bool LoopBase::visit_attributes(AttributeVisitor &visitor) {
visitor.on_attribute("dimension", dimension);
visitor.on_attribute("work_amount", work_amount);
visitor.on_attribute("increment", increment);
return true;
Expand All @@ -33,25 +32,21 @@ size_t LoopBase::get_increment() const {
return increment;
}

size_t LoopBase::get_dimension() const {
return dimension;
}

LoopBegin::LoopBegin(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment)
: LoopBase(args, dimension, work_amount, increment),
LoopBegin::LoopBegin(const std::vector<Output<Node>> &args, size_t work_amount, size_t increment)
: LoopBase(args, work_amount, increment),
begin_address(nullptr), input_regs({}) {
// We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached
// to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it)
validate_and_infer_types_except_LoopEnd();
}

LoopBegin::LoopBegin(const std::vector<Output<Node>> &args)
: LoopBase(args, 0, 0, 0), begin_address(nullptr), input_regs({}) {
: LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) {
validate_and_infer_types_except_LoopEnd();
}

std::shared_ptr<Node> LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::shared_ptr<LoopBegin>(new LoopBegin(inputs, dimension, work_amount, increment));
return std::shared_ptr<LoopBegin>(new LoopBegin(inputs, work_amount, increment));
}


Expand All @@ -70,7 +65,6 @@ void LoopBegin::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output");
const auto& loop_end = ov::as_type_ptr<LoopEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output");
dimension = loop_end->get_dimension();
work_amount = loop_end->get_work_amount();
increment = loop_end->get_increment();
}
Expand All @@ -85,15 +79,15 @@ std::shared_ptr<LoopEnd> LoopBegin::get_loop_end() {
return loop_end;
}

LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment,
LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t work_amount, size_t increment,
std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets)
: LoopBase(args, dimension, work_amount, increment), apply_increment(std::move(apply_increment)),
: LoopBase(args, work_amount, increment), apply_increment(std::move(apply_increment)),
finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true) {
constructor_validate_and_infer_types();
}

std::shared_ptr<Node> LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<LoopEnd>(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets);
return std::make_shared<LoopEnd>(inputs, work_amount, increment, apply_increment, finalization_offsets);
}

std::shared_ptr<LoopBegin> LoopEnd::get_loop_begin() {
Expand Down
12 changes: 1 addition & 11 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,16 +296,6 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector&
return master_shape;
}

PartialShape snippets::op::Subgraph::get_master_shape() {
auto results = m_body->get_results();
PartialShape outPShape = results[0]->get_input_partial_shape(0);
for (const auto& r : results)
PartialShape::broadcast_merge_into(outPShape, r->get_input_shape(0),
::ngraph::op::AutoBroadcastType::NUMPY);
master_shape = outPShape;
return master_shape;
}

void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// We should insert Convert before Results to set original output element type if needed
Expand Down Expand Up @@ -367,7 +357,7 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
manager.register_pass<snippets::pass::InsertStore>(count);
// todo: presently dynamic pipeline is activated even if the last two dimension are static
// In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example)
// should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional runtime args are required
// should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required
// Presently Broadcasting is organized in the following way:
// * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims)
if (!inputs_has_dynamic_last_dims) {
Expand Down
8 changes: 4 additions & 4 deletions src/common/snippets/src/pass/insert_loops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#include <snippets/itt.hpp>
#include "snippets/pass/insert_loops.hpp"
#include "snippets/op/loop_helpers.hpp"
#include "snippets/pass/loop_helpers.hpp"

#include <ngraph/rt_info.hpp>

Expand Down Expand Up @@ -52,8 +52,8 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov:
});
}
const auto& inner_loop_begin = op::insertLoopBegin(commonParams);
const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount,
vector_size, apply_increments, inner_finalization_offsets);
const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount,
vector_size, apply_increments, inner_finalization_offsets);
// set internal flag to enable scalar vs vector loop optimizations
inner_loop_end->has_outer_loop = outer_work_amount > 1;
// Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
Expand All @@ -78,7 +78,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov:
return ps[outer_dim] != 1 && ps[inner_dim] == 1;
});
const auto& outer_loop_begin = op::insertLoopBegin(commonParams);
insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_work_amount, 1, apply_increments);
insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1, apply_increments);
}

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//

#include "ngraph/op/op.hpp"
#include "snippets/op/loop_helpers.hpp"
#include "snippets/pass/loop_helpers.hpp"

namespace ngraph {
namespace snippets {
Expand All @@ -26,15 +26,15 @@ std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& origi

std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
const std::shared_ptr<LoopBegin>& loopBegin,
size_t dimension, size_t work_amount, size_t increment,
size_t work_amount, size_t increment,
std::vector<bool> apply_increment,
std::vector<int64_t> finalization_offsets) {
OutputVector originalParentOutputs;
for (const auto& in : originalInputs) {
originalParentOutputs.push_back(in.get_source_output());
}
originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1));
auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, dimension, work_amount, increment,
auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, work_amount, increment,
std::move(apply_increment), std::move(finalization_offsets));

for (int i = 0; i < originalInputs.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::imp
for (int i = 0; i < num_inputs; i++)
io_data_size.push_back(loop_begin->get_input_element_type(i).size());
for (int i = 0; i < num_outputs; i++)
io_data_size.push_back(loop_end->get_input_element_type(i).size());
io_data_size.push_back(loop_end->get_output_element_type(i).size());
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class jit_container_emitter: public jit_emitter {
};
///
/// \brief Kernel is the only entry point to Codegen Jit compilation. Kernel performs abstract-to-physical register
/// mapping and creates pools of available gpr and vec registers. Kernel usually to contains (at least one)
/// mapping and creates pools of available gpr and vec registers. Kernel usually contains (at least one)
/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way:
/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */
/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */
Expand Down
3 changes: 0 additions & 3 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,6 @@ std::vector<VectorDims> Snippet::shapeInfer() const {
}

void Snippet::prepareParams() {
// here must be all the stuff that could only be done for static shapes, e.g. offset calculation
// Here it must be all the stuff that could be done once for both static and dynamic shapes

masterShape = getNormalizedDimsBySize(masterShape, tensorRank);
for (auto& pshape : normInputShapes)
pshape = getNormalizedDimsBySize(pshape, tensorRank);
Expand Down
Loading

0 comments on commit a748dac

Please sign in to comment.