diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp index 519cc53ddd3eaf..82ef5c8b576fad 100644 --- a/src/common/snippets/include/snippets/op/loop.hpp +++ b/src/common/snippets/include/snippets/op/loop.hpp @@ -14,40 +14,37 @@ namespace op { /** * @interface LoopBase - * @brief Inserted during scheduling generation and represents Loop in affine notation + * @brief Base class for LoopBegin and LoopEnd * @ingroup snippets */ class LoopBase : public ngraph::op::Op { public: OPENVINO_OP("LoopBase", "SnippetsOpset"); - LoopBase(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); + LoopBase(const std::vector>& args, size_t work_amount, size_t increment); LoopBase() = delete; bool visit_attributes(AttributeVisitor& visitor) override; size_t get_work_amount() const; size_t get_increment() const; - size_t get_dimension() const; bool get_evaluate_once() const; protected: - size_t dimension; size_t work_amount; size_t increment; bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter }; class LoopEnd; +/** + * @interface LoopBegin + * @brief Marks the start of the Loop region. + * Number of outputs always equals to the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd) + * @param args - vector of input values, they are passed directly to output. + * @ingroup snippets + */ class LoopBegin : public LoopBase { friend LoopEnd; public: OPENVINO_OP("LoopBegin", "SnippetsOpset"); - /// \brief Construct an Loop - /// \param region The vector of pairs: emitters and the corresponding registers - /// \param increment Loop size - count of elements to load and store. 
- /// Vector Loop should have size of vector register and Scalar Loop should have 1 - /// \param num_inputs Count of inputs - /// \param num_outputs Count of outputs - /// \param io_dims Vector of last dimensions of inputs and outputs - /// \param io_data_sizes Vector of data type sizes of inputs and outputs - explicit LoopBegin(const std::vector>& args); + explicit LoopBegin(const OutputVector& args); LoopBegin() = delete; void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; @@ -55,15 +52,31 @@ class LoopBegin : public LoopBase { // begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters const uint8_t* begin_address; std::vector input_regs; + private: void validate_and_infer_types_except_LoopEnd(); - LoopBegin(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); + LoopBegin(const std::vector>& args, size_t work_amount, size_t increment); }; +/** + * @interface LoopEnd + * @brief Marks the end of the Loop region and defines the loop properties. + * Number of outputs always equals to the number of inputs (bypassed values) - 1 (edge to the corresponding LoopBegin) + * @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output. + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop. + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results. If apply_increment[i] == true then i-th i/o data + * pointer will be incremented by work_amount*data_size on every iteration. + * @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to + * apply_increments, which enables more flexibility. 
+ * @param finalization_offsets pointer increments that are applied to i/o pointers before exiting the loop + * @ingroup snippets + */ class LoopEnd : public LoopBase { public: OPENVINO_OP("LoopEnd", "SnippetsOpset"); - LoopEnd(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment, + LoopEnd(const std::vector>& args, size_t work_amount, size_t increment, std::vector apply_increment, std::vector finalization_offsets); LoopEnd() = delete; std::shared_ptr get_loop_begin(); diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp index 22aca3f358be4c..5971c5cc5ce744 100644 --- a/src/common/snippets/include/snippets/op/memory_access.hpp +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -12,9 +12,10 @@ namespace op { /** * @interface MemoryAccess - * @brief This is an ubre - * where number of elements to store is determined by "count" - * Default value is "1" - to store one element + * @brief This is a base class for memory access operations (like Load and Store). + * It provides universal set/get interface to manipulate the number + * of elements accessed during one operation call ("count"). 
+ * Default "count" value is "1" - it means to load/store one element * @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index dfcde2bd4fd2c6..151f4d98d4b431 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -107,7 +107,6 @@ class Subgraph : public ngraph::op::Op { snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); - ov::PartialShape get_master_shape(); std::vector reshape_body(const std::vector& input_shapes); std::vector reshape_body(const std::vector& input_shapes); diff --git a/src/common/snippets/include/snippets/op/loop_helpers.hpp b/src/common/snippets/include/snippets/pass/loop_helpers.hpp similarity index 61% rename from src/common/snippets/include/snippets/op/loop_helpers.hpp rename to src/common/snippets/include/snippets/pass/loop_helpers.hpp index 57a14e5f036cc9..12e0e9746bc8f0 100644 --- a/src/common/snippets/include/snippets/op/loop_helpers.hpp +++ b/src/common/snippets/include/snippets/pass/loop_helpers.hpp @@ -6,15 +6,27 @@ #include "ngraph/op/op.hpp" #include "ngraph/op/parameter.hpp" -#include "loop.hpp" +#include "snippets/op/loop.hpp" namespace ngraph { namespace snippets { namespace op { /* ==== LoopBegin === */ +/** + * @interface insertLoopBeginAfterOutputs + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface. 
+ * @ingroup snippets + */ std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); +/** + * @interface insertLoopBegin + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (ParameterVector, NodeVector or OutputVector). + * @ingroup snippets + */ template std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { static_assert(std::is_same() || std::is_same(), @@ -37,12 +49,32 @@ inline std::shared_ptr insertLoopBegin(const OutputVector& afterThese /* ============== */ /* ==== LoopEnd === */ +/** + * @interface insertLoopEndBeforeInputs + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface. + * @param originalInputs LoopEnd will be inserted before these inputs + * @param loopBegin pointer to the beginning of the Loop region + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results + * @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop + * @ingroup snippets + */ + std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, - const std::shared_ptr& tileBegin, - size_t dimension, size_t work_amount, size_t increment, + const std::shared_ptr& loopBegin, + size_t work_amount, size_t increment, std::vector apply_increment = {}, std::vector finalization_offsets = {}); +/** + * @interface insertLoopEnd + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (ResultVector, NodeVector or OutputVector). 
+ * @ingroup snippets + */ template std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { static_assert(std::is_same() || std::is_same(), diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index d34b93392a09c8..00171a27a707b7 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -105,8 +105,10 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrget_ordered_ops(); for (auto op = ops.begin(); op < ops.end(); op++) { const auto& loop_begin = ov::as_type_ptr(*op); - // ignore outer loops and possible manual tail loops + + // ignore outer loops and possible manual scalar loops if (loop_begin && loop_begin->get_increment() != 1) { + OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop") NodeVector vector_loop, tail_loop; std::shared_ptr vector_loop_end, tail_loop_end; vector_loop_end = loop_begin->get_loop_end(); diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp index c36b713b8f0496..3e1c7cf9a7f1fe 100644 --- a/src/common/snippets/src/op/loop.cpp +++ b/src/common/snippets/src/op/loop.cpp @@ -10,12 +10,11 @@ namespace ngraph { namespace snippets { namespace op { -LoopBase::LoopBase(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) - : Op(args), dimension(dimension), work_amount(work_amount), increment(increment), evaluate_once(false) { +LoopBase::LoopBase(const std::vector> &args, size_t work_amount, size_t wa_increment) + : Op(args), work_amount(work_amount), increment(wa_increment), evaluate_once(false) { } bool LoopBase::visit_attributes(AttributeVisitor &visitor) { - visitor.on_attribute("dimension", dimension); visitor.on_attribute("work_amount", work_amount); visitor.on_attribute("increment", increment); return true; @@ -33,12 +32,8 @@ size_t LoopBase::get_increment() const { return increment; } -size_t LoopBase::get_dimension() const { - return dimension; -} - 
-LoopBegin::LoopBegin(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) - : LoopBase(args, dimension, work_amount, increment), +LoopBegin::LoopBegin(const std::vector> &args, size_t work_amount, size_t increment) + : LoopBase(args, work_amount, increment), begin_address(nullptr), input_regs({}) { // We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached // to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it) @@ -46,12 +41,12 @@ LoopBegin::LoopBegin(const std::vector> &args, size_t dimension, si } LoopBegin::LoopBegin(const std::vector> &args) - : LoopBase(args, 0, 0, 0), begin_address(nullptr), input_regs({}) { + : LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) { validate_and_infer_types_except_LoopEnd(); } std::shared_ptr LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const { - return std::shared_ptr(new LoopBegin(inputs, dimension, work_amount, increment)); + return std::shared_ptr(new LoopBegin(inputs, work_amount, increment)); } @@ -70,7 +65,6 @@ void LoopBegin::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output"); const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output"); - dimension = loop_end->get_dimension(); work_amount = loop_end->get_work_amount(); increment = loop_end->get_increment(); } @@ -85,15 +79,15 @@ std::shared_ptr LoopBegin::get_loop_end() { return loop_end; } -LoopEnd::LoopEnd(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment, +LoopEnd::LoopEnd(const std::vector> &args, size_t work_amount, size_t increment, std::vector apply_increment, std::vector finalization_offsets) - : LoopBase(args, 
dimension, work_amount, increment), apply_increment(std::move(apply_increment)), + : LoopBase(args, work_amount, increment), apply_increment(std::move(apply_increment)), finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true) { constructor_validate_and_infer_types(); } std::shared_ptr LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const { - return std::make_shared(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets); + return std::make_shared(inputs, work_amount, increment, apply_increment, finalization_offsets); } std::shared_ptr LoopEnd::get_loop_begin() { diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 8cdd858a90a7a1..485fac1391c24b 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -296,16 +296,6 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& return master_shape; } -PartialShape snippets::op::Subgraph::get_master_shape() { - auto results = m_body->get_results(); - PartialShape outPShape = results[0]->get_input_partial_shape(0); - for (const auto& r : results) - PartialShape::broadcast_merge_into(outPShape, r->get_input_shape(0), - ::ngraph::op::AutoBroadcastType::NUMPY); - master_shape = outPShape; - return master_shape; -} - void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { // We should insert Convert before Results to set original output element type if needed @@ -367,7 +357,7 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { manager.register_pass(count); // todo: presently dynamic pipeline is activated even if the last two dimension are static // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) - // should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional 
runtime args are required + // should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required // Presently Broadcasting is organized in the following way: // * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims) if (!inputs_has_dynamic_last_dims) { diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index 30c9a20883b8d5..0aa4ce84ba2397 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -4,7 +4,7 @@ #include #include "snippets/pass/insert_loops.hpp" -#include "snippets/op/loop_helpers.hpp" +#include "snippets/pass/loop_helpers.hpp" #include @@ -52,8 +52,8 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptrhas_outer_loop = outer_work_amount > 1; // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in @@ -78,7 +78,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& origi std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, const std::shared_ptr& loopBegin, - size_t dimension, size_t work_amount, size_t increment, + size_t work_amount, size_t increment, std::vector apply_increment, std::vector finalization_offsets) { OutputVector originalParentOutputs; @@ -34,7 +34,7 @@ std::shared_ptr insertLoopEndBeforeInputs(const std::vector originalParentOutputs.push_back(in.get_source_output()); } originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1)); - auto loop_end = std::make_shared(originalParentOutputs, dimension, work_amount, increment, + auto loop_end = std::make_shared(originalParentOutputs, work_amount, increment, std::move(apply_increment), std::move(finalization_offsets)); for (int i = 0; i < originalInputs.size(); 
i++) { diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 3dc0a1e043d2a7..b7df4a498d2b7c 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -307,7 +307,7 @@ LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::imp for (int i = 0; i < num_inputs; i++) io_data_size.push_back(loop_begin->get_input_element_type(i).size()); for (int i = 0; i < num_outputs; i++) - io_data_size.push_back(loop_end->get_input_element_type(i).size()); + io_data_size.push_back(loop_end->get_output_element_type(i).size()); in_out_type_ = emitter_in_out_map::gpr_to_gpr; } diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index 7fa1b8f1aa958d..735af9ec58dbf1 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -54,7 +54,7 @@ class jit_container_emitter: public jit_emitter { }; /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register -/// mapping and creates pools of available gpr and vec registers. Kernel usually to contains (at least one) +/// mapping and creates pools of available gpr and vec registers. Kernel usually contains (at least one) /// LoopBeginEmitter and LoopEndEmitter pair. 
In general the enclosed emitters should be organized in the following way: /// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ /// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */ diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 7ad5ebf1636d1f..e4d99a0468c441 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -429,9 +429,6 @@ std::vector Snippet::shapeInfer() const { } void Snippet::prepareParams() { - // here must be all the stuff that could only be done for static shapes, e.g. offset calculation - // Here it must be all the stuff that could be done once for both static and dynamic shapes - masterShape = getNormalizedDimsBySize(masterShape, tensorRank); for (auto& pshape : normInputShapes) pshape = getNormalizedDimsBySize(pshape, tensorRank); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index d04db522a54881..afea8266be0e04 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -6,7 +6,7 @@ #include "common_test_utils/data_utils.hpp" #include #include "ngraph_functions/builders.hpp" -#include +#include "snippets/pass/loop_helpers.hpp" namespace ov { namespace test { @@ -38,14 +38,14 @@ std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { ResultVector results({model->get_results()[0]}); const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); std::vector apply_increments(input_params.size() + results.size(), true); - insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); auto outer_WA = std::accumulate(input_shapes.begin(), 
input_shapes.end(), 0, [](int64_t max_val, const PartialShape& ps) { return std::max(ps[ps.size() - 2].get_length(), max_val); }); if (outer_WA > 1) { const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); - insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); } return model; } @@ -94,14 +94,14 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons ResultVector results({model->get_results()[0]}); const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); std::vector apply_increments(input_params.size() + results.size(), true); - const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, [](int64_t max_val, const PartialShape& ps) { return std::max(ps[ps.size() - 2].get_length(), max_val); }); if (outer_WA > 1) { const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); - insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); } return model; }