diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp
new file mode 100644
index 00000000000000..874fc688e404a5
--- /dev/null
+++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/pass.hpp>
+#include <ngraph/op/parameter.hpp>
+
+namespace ngraph {
+namespace snippets {
+namespace pass {
+
+/**
+ * @interface InsertLoops
+ * @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution
+ * @ingroup snippets
+ */
+class InsertLoops: public ngraph::pass::FunctionPass {
+public:
+    OPENVINO_RTTI("InsertLoops", "0");
+    InsertLoops(ov::PartialShape master_shape, size_t vector_size);
+    bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
+
+private:
+    ov::PartialShape master_shape;
+    size_t vector_size;
+};
+
+} // namespace pass
+} // namespace snippets
+} // namespace ngraph
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 6f3dba1118a4ce..3d12fe0275125f 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -15,6 +15,7 @@
 #include "snippets/pass/convert_constants.hpp"
 #include "snippets/pass/convert_power_to_powerstatic.hpp"
 #include "snippets/pass/vector_to_scalar.hpp"
+#include "snippets/pass/insert_loops.hpp"
 #include "snippets/pass/transform_convert.hpp"
 #include "snippets/pass/align_element_type.hpp"
 #include "snippets/utils.hpp"
@@ -26,7 +27,6 @@
 #include "ngraph/pass/constant_folding.hpp"
 #include "ngraph_ops/type_relaxed.hpp"
 #include <openvino/pass/serialize.hpp>
-#include "snippets/op/loop_helpers.hpp"
 
 #include <algorithm>
 #include <memory>
@@ -403,6 +403,10 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
             manager.get_pass_config()->
                     set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
         }
+        // todo: get_lanes() assumes fp32. Could there be any int8 issues?
+        // Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if
+        // automatic validation is disabled in the pass manager
+        manager.register_pass<snippets::pass::InsertLoops>(master_shape, m_generator->get_target_machine()->get_lanes());
     }
     manager.run_passes(m_body);
 }
@@ -436,77 +440,6 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt,
     convert_to_snippet_dialect();
     opt.run_passes(m_body);
 
-    if (master_shape.is_static()) {
-        const auto inner_dim = master_shape.size() - 1;
-        // Note: outer_dim could overflow if master_shape.size() < 2
-        const auto outer_dim = master_shape.size() - 2;
-        const auto inner_WA = master_shape[inner_dim].get_length();
-        const auto outer_WA = master_shape.size() >= 2 ? master_shape[outer_dim].get_length() : 1;
-        // todo: get_lanes() assumes fp32. Could there be any int8 issues?
-        const auto vector_size = m_generator->get_target_machine()->get_lanes();
-
-        ParameterVector commonParams = m_body->get_parameters();
-        // Note that topological sort parses node arguments in reversed order, but results are added in direct order,
-        // so we need to pass the reversed results to LoopEnd to keep the original traversal order in the topological sorter
-        const auto& orig_results = m_body->get_results();
-        ResultVector commonResults(orig_results.rbegin(), orig_results.rend());
-        std::vector<PartialShape> ioShapes;
-        ioShapes.reserve(commonParams.size() + commonResults.size());
-        std::transform(commonParams.begin(), commonParams.end(), std::back_inserter(ioShapes),
-                       [](const std::shared_ptr<Node>& n) { return n->get_output_partial_shape(0); });
-        std::transform(commonResults.begin(), commonResults.end(), std::back_inserter(ioShapes),
-                       [](const std::shared_ptr<Node>& n) { return n->get_input_partial_shape(0); });
-
-        if (inner_WA > 0) {
-            std::vector<bool> apply_increments;
-            apply_increments.reserve(ioShapes.size());
-            // Inner Loop applies increments if a dimension is not broadcasted
-            std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments),
-                           [=](const PartialShape& ps) {
-                               return ps[inner_dim] != 1 && master_shape[inner_dim] != 1;
-                           });
-            std::vector<int64_t> inner_finalization_offsets(ioShapes.size(), 0);
-            if (outer_WA > 1) {
-                // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not
-                std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(),
-                               [=](const PartialShape& ps) {
-                                   return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_WA : 0;
-                               });
-            }
-            const auto& inner_loop_begin = insertLoopBegin(commonParams);
-            const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_WA, vector_size, apply_increments,
-                                                       inner_finalization_offsets);
-            // set internal flag to enable scalar vs vector loop optimizations
-            inner_loop_end->has_outer_loop = outer_WA > 1;
-            // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
-            // sorted ops (so they're between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
-            // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
-            // assigned to the scalar will get corrupted inside the loop body). To avoid such cases, we add a control
-            // dependency on LoopBegin to guarantee that the constants are executed inside the Loop.
-            for (const auto& n : m_body->get_ordered_ops()) {
-                if (auto c = std::dynamic_pointer_cast<ngraph::op::Constant>(n))
-                    c->add_control_dependency(inner_loop_begin);
-                else if (n == inner_loop_begin)
-                    break;
-            }
-        }
-
-        if (outer_WA > 1) {
-            std::vector<bool> apply_increments;
-            apply_increments.reserve(ioShapes.size());
-            // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1)
-            std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments),
-                           [=](const PartialShape& ps) {
-                               return ps[outer_dim] != 1 && ps[inner_dim] == 1;
-                           });
-            const auto& outer_loop_begin = insertLoopBegin(commonParams);
-            insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_WA, 1, apply_increments);
-        }
-        m_body->validate_nodes_and_infer_types();
-    } else {
-        throw ngraph_error("Dynamic case is not supported yet");
-    }
-
     snippets::pass::AssignRegisters().run_on_model(m_body);
 
     // schedule generation should go here and be target agnostic
diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp
new file mode 100644
index 00000000000000..f4192087c2dc41
--- /dev/null
+++ b/src/common/snippets/src/pass/insert_loops.cpp
@@ -0,0 +1,85 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <snippets/itt.hpp>
+#include "snippets/pass/insert_loops.hpp"
+#include "snippets/op/loop_helpers.hpp"
+
+#include <ngraph/rt_info.hpp>
+
+ngraph::snippets::pass::InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t vector_size)
+: master_shape(std::move(master_shape)), vector_size(vector_size) {
+}
+
+bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
+    RUN_ON_FUNCTION_SCOPE(InsertLoops);
+    if (master_shape.is_dynamic())
+        throw ngraph_error("InsertLoops doesn't support dynamic shapes yet");
+
+    const auto inner_dim = master_shape.size() - 1;
+    // Note: outer_dim could overflow if master_shape.size() < 2
+    const auto outer_dim = master_shape.size() - 2;
+    const auto inner_WA = master_shape[inner_dim].get_length();
+    const auto outer_WA = master_shape.size() >= 2 ? master_shape[outer_dim].get_length() : 1;
+
+    ParameterVector commonParams = model->get_parameters();
+    // Note that topological sort parses node arguments in reversed order, but results are added in direct order,
+    // so we need to pass the reversed results to LoopEnd to keep the original traversal order in the topological sorter
+    const auto& orig_results = model->get_results();
+    ResultVector commonResults(orig_results.rbegin(), orig_results.rend());
+    std::vector<PartialShape> ioShapes;
+    ioShapes.reserve(commonParams.size() + commonResults.size());
+    std::transform(commonParams.begin(), commonParams.end(), std::back_inserter(ioShapes),
+                   [](const std::shared_ptr<Node>& n) { return n->get_output_partial_shape(0); });
+    std::transform(commonResults.begin(), commonResults.end(), std::back_inserter(ioShapes),
+                   [](const std::shared_ptr<Node>& n) { return n->get_input_partial_shape(0); });
+
+    if (inner_WA > 0) {
+        std::vector<bool> apply_increments;
+        apply_increments.reserve(ioShapes.size());
+        // Inner Loop applies increments if a dimension is not broadcasted
+        std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments),
+                       [=](const PartialShape& ps) {
+                           return ps[inner_dim] != 1 && master_shape[inner_dim] != 1;
+                       });
+        std::vector<int64_t> inner_finalization_offsets(ioShapes.size(), 0);
+        if (outer_WA > 1) {
+            // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not
+            std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(),
+                           [=](const PartialShape& ps) {
+                               return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_WA : 0;
+                           });
+        }
+        const auto& inner_loop_begin = op::insertLoopBegin(commonParams);
+        const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_WA, vector_size, apply_increments,
+                                                   inner_finalization_offsets);
+        // set internal flag to enable scalar vs vector loop optimizations
+        inner_loop_end->has_outer_loop = outer_WA > 1;
+        // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
+        // sorted ops (so they're between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
+        // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
+        // assigned to the scalar will get corrupted inside the loop body). To avoid such cases, we add a control
+        // dependency on LoopBegin to guarantee that the constants are executed inside the Loop.
+        for (const auto& n : model->get_ordered_ops()) {
+            if (auto c = std::dynamic_pointer_cast<ngraph::op::Constant>(n))
+                c->add_control_dependency(inner_loop_begin);
+            else if (n == inner_loop_begin)
+                break;
+        }
+    }
+
+    if (outer_WA > 1) {
+        std::vector<bool> apply_increments;
+        apply_increments.reserve(ioShapes.size());
+        // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1)
+        std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments),
+                       [=](const PartialShape& ps) {
+                           return ps[outer_dim] != 1 && ps[inner_dim] == 1;
+                       });
+        const auto& outer_loop_begin = op::insertLoopBegin(commonParams);
+        insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_WA, 1, apply_increments);
+    }
+
+    return true;
+}
\ No newline at end of file
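
For reviewers who want to exercise the extracted pass in isolation, here is a minimal sketch (not part of the patch). The driver function name, the master shape {1, 3, 16, 64}, and the vector size of 8 fp32 lanes (e.g. AVX2) are illustrative assumptions; in the real flow both values come from Subgraph::convert_to_snippet_dialect() and the target machine's get_lanes().

#include <memory>
#include <ngraph/pass/manager.hpp>
#include <openvino/core/model.hpp>
#include "snippets/pass/insert_loops.hpp"

// Hypothetical standalone driver: runs InsertLoops on a prepared snippet body.
void run_insert_loops_example(const std::shared_ptr<ov::Model>& body) {
    ngraph::pass::Manager manager;
    // Illustrative values: the common (broadcasted) shape of the subgraph body
    // and the number of fp32 lanes; normally target_machine->get_lanes().
    const ov::PartialShape master_shape{1, 3, 16, 64};
    const size_t vector_size = 8;
    manager.register_pass<ngraph::snippets::pass::InsertLoops>(master_shape, vector_size);
    manager.run_passes(body);
    // Per the note in subgraph.cpp: InsertLoops requires revalidation afterwards
    // if the pass manager's automatic validation is disabled.
    body->validate_nodes_and_infer_types();
}

With these assumptions, the pass would wrap the body in an inner loop with work amount 64 and increment 8 over the last dimension, plus an outer loop with work amount 16 and increment 1 over the second-to-last one, with per-port increments and finalization offsets derived from the broadcast pattern as in the code above.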