From 5f8a55e4d0e01929c9f2e0c1e156b26224832a5c Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Fri, 4 Nov 2022 20:08:07 +0100 Subject: [PATCH] [Snippets] Explicit Tiles review comments No. 1 --- .../include/snippets/op/loop_helpers.hpp | 18 +++++++++++------ src/common/snippets/src/pass/insert_loops.cpp | 20 +++++++++---------- .../tests/src/pass/canonicalization.cpp | 2 +- .../src/emitters/jit_snippets_emitters.cpp | 6 +++--- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 17 ++++------------ src/plugins/intel_cpu/src/nodes/subgraph.h | 1 - .../shared_tests_instances/snippets/add.cpp | 3 +++ .../plugin/shared/include/snippets/add.hpp | 9 +-------- .../plugin/shared/src/snippets/add.cpp | 8 ++++++-- .../src/base/snippets_test_utils.cpp | 1 - .../include/subgraph_simple.hpp | 3 ++- .../src/subgraph_simple.cpp | 7 ++++--- 12 files changed, 46 insertions(+), 49 deletions(-) diff --git a/src/common/snippets/include/snippets/op/loop_helpers.hpp b/src/common/snippets/include/snippets/op/loop_helpers.hpp index f1c4164c316059..57a14e5f036cc9 100644 --- a/src/common/snippets/include/snippets/op/loop_helpers.hpp +++ b/src/common/snippets/include/snippets/op/loop_helpers.hpp @@ -12,13 +12,9 @@ namespace ngraph { namespace snippets { namespace op { +/* ==== LoopBegin === */ std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); -std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, - const std::shared_ptr& tileBegin, - size_t dimension, size_t work_amount, size_t increment, - std::vector apply_increment = {}, - std::vector finalization_offsets = {}); template std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { static_assert(std::is_same() || std::is_same(), @@ -36,8 +32,16 @@ std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { template<> inline std::shared_ptr insertLoopBegin(const OutputVector& afterTheseNodes) { - return insertLoopBeginAfterOutputs(afterTheseNodes); + return insertLoopBeginAfterOutputs(afterTheseNodes); } +/* ============== */ + +/* ==== LoopEnd === */ +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& tileBegin, + size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment = {}, + std::vector finalization_offsets = {}); template std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { @@ -51,10 +55,12 @@ std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) } return insertLoopEndBeforeInputs(originalInputs, args...); } + template std::shared_ptr insertLoopEnd(const std::vector>& beforeTheseNodes, Args ...args) { return insertLoopEndBeforeInputs(beforeTheseNodes, args...); } +/* ============== */ } // namespace op } // namespace snippets diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp index f4192087c2dc41..30c9a20883b8d5 100644 --- a/src/common/snippets/src/pass/insert_loops.cpp +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -20,8 +20,8 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr= 2 ? master_shape[outer_dim].get_length() : 1; + const auto inner_work_amount = master_shape[inner_dim].get_length(); + const auto outer_work_amount = master_shape.size() >= 2 ? master_shape[outer_dim].get_length() : 1; ParameterVector commonParams = model->get_parameters(); // Note that topological sort parses node arguments in reversed order, but results are added - in direct order @@ -35,7 +35,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr& n) { return n->get_input_partial_shape(0); }); - if (inner_WA > 0) { + if (inner_work_amount > 0) { std::vector apply_increments; apply_increments.reserve(ioShapes.size()); // Inner Loop applies increments if a dimension is not broadcasted @@ -44,18 +44,18 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr inner_finalization_offsets(ioShapes.size(), 0); - if (outer_WA > 1) { + if (outer_work_amount > 1) { // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(), [=](const PartialShape& ps) { - return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_WA : 0; + return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? -inner_work_amount : 0; }); } const auto& inner_loop_begin = op::insertLoopBegin(commonParams); - const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_WA, vector_size, apply_increments, - inner_finalization_offsets); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount, + vector_size, apply_increments, inner_finalization_offsets); // set internal flag to enable scalar vs vector loop optimizations - inner_loop_end->has_outer_loop = outer_WA > 1; + inner_loop_end->has_outer_loop = outer_work_amount > 1; // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg @@ -69,7 +69,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr 1) { + if (outer_work_amount > 1) { std::vector apply_increments; apply_increments.reserve(ioShapes.size()); // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) @@ -78,7 +78,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptrset_generator(std::make_shared()); auto canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); - ASSERT_TRUE(!canonical_output_shape.is_dynamic()); + ASSERT_TRUE(canonical_output_shape.is_static()); ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index f566cd1452a312..3dc0a1e043d2a7 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -246,7 +246,7 @@ LoopBeginEmitter::LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl: IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must be LoopEnd"; work_amount = loop_begin->get_work_amount(); evaluate_once = loop_begin->get_evaluate_once(); - num_inputs = loop_begin->get_output_size() - 1; + num_inputs = loop_begin->get_input_size(); in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -297,8 +297,8 @@ LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::imp if (!loop_begin) IE_THROW() << "LoopEndEmitter invoked with invalid configuration: the last arg must be LoopBegin"; // Note that 1 edge connects LoopBegin and LoopEnd - num_inputs = loop_begin->get_output_size() - 1; - num_outputs = loop_end->get_input_size() - 1; + num_inputs = loop_begin->get_input_size(); + num_outputs = loop_end->get_output_size(); increment = loop_end->get_increment(); work_amount = loop_end->get_work_amount(); apply_increments = loop_end->get_apply_increment(); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 29975e0b3781aa..7ad5ebf1636d1f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "utils/cpu_utils.hpp" #include "snippets_transformations/fuse_load_store_and_convert.hpp" #include "ngraph_transformations/convert_to_swish_cpu.hpp" @@ -70,14 +71,6 @@ void Snippet::copy_snippet() { isa_num_lanes = snippet->get_generator()->get_target_machine()->get_lanes(); } -VectorDims Snippet::prependWithOnes(const VectorDims& dims, size_t rank) { - if (rank <= dims.size()) - return dims; - VectorDims result(rank, 1); - std::copy(dims.begin(), dims.end(), &result[rank - dims.size()]); - return result; -} - void Snippet::initSupportedPrimitiveDescriptors() { copy_snippet(); if (!supportedPrimitiveDescriptors.empty()) @@ -439,11 +432,11 @@ void Snippet::prepareParams() { // here must be all the stuff that could only be done for static shapes, e.g. offset calculation // Here it must be all the stuff that could be done once for both static and dynamic shapes - masterShape = prependWithOnes(masterShape, tensorRank); + masterShape = getNormalizedDimsBySize(masterShape, tensorRank); for (auto& pshape : normInputShapes) - pshape = prependWithOnes(pshape, tensorRank); + pshape = getNormalizedDimsBySize(pshape, tensorRank); for (auto& pshape : normOutputShapes) - pshape = prependWithOnes(pshape, tensorRank); + pshape = getNormalizedDimsBySize(pshape, tensorRank); tileRank = 1; fullWorkAmount = std::accumulate(masterShape.begin(), masterShape.end(), 1, std::multiplies()); @@ -486,8 +479,6 @@ void Snippet::prepareParams() { dim = 1; } - // ov::pass::Serialize("tile_initial.xml", "tile_initial.bin").run_on_model(snippet->get_body()); - // std::vector new_shapes; for (const auto& s : normInputShapes) { ov::Shape ns(tileRank, 0); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 8b657e42805131..0dc2354f02cd53 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -58,7 +58,6 @@ class Snippet : public Node { // NOTE: Before call mutex should be initialized void copy_snippet(); - static VectorDims prependWithOnes(const VectorDims& dims, size_t rank); ov::PartialShape canonicalizeBody(); void optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; void calcJITParams(std::vector& offsets) const; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index 7f1bfc7f328a8b..9c5c2a9904a48d 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -43,10 +43,13 @@ std::vector> inShapesStatic{ {{1, 128, 1, 17}, {1, 128, 1, 17}}, {{1, 128, 1, 29}, {1, 128, 1, 29}}, {{1, 128, 1, 33}, {1, 128, 1, 33}}, + {{1, 128, 9, 30}, {1, 128, 1, 30}}, + {{1, 128, 9, 1}, {1, 128, 1, 30}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhPair, ::testing::Combine( ::testing::ValuesIn(inShapesStatic), + ::testing::Values(ov::element::f32), ::testing::Values(3), // Add + 2 converts after inputs ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts ::testing::Values(CommonTestUtils::DEVICE_CPU)), diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index 4ba7541489671f..3f19a02737980c 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -21,6 +21,7 @@ typedef std::tuple< typedef std::tuple< std::vector, // Input 0, Input 1 Shape + ov::element::Type, // Element type size_t, // Expected num nodes size_t, // Expected num subgraphs std::string // Target Device @@ -34,14 +35,6 @@ typedef std::tuple< std::string // Target Device > AddConstParams; -typedef std::tuple< - InputShape, // Input 0 Shape - InputShape, // Input 1 Shape - size_t, // Expected num nodes - size_t, // Expected num subgraphs - std::string // Target Device - > AddDynamicParams; - class Add : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index 2a79d1112c5c6e..f0a1908ef2a9e8 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -91,14 +91,16 @@ void AddRollConst::SetUp() { std::string AddSinhPair::getTestCaseName(testing::TestParamInfo obj) { std::vector input_shapes; + ov::element::Type type; std::string targetDevice; size_t num_nodes, num_subgraphs; - std::tie(input_shapes, num_nodes, num_subgraphs, targetDevice) = obj.param; + std::tie(input_shapes, type, num_nodes, num_subgraphs, targetDevice) = obj.param; if (input_shapes.size() != 2) IE_THROW() << "Invalid input shapes vector size"; std::ostringstream result; result << "IS[0]=" << CommonTestUtils::vec2str(input_shapes[0]) << "_"; result << "IS[1]=" << CommonTestUtils::vec2str(input_shapes[1]) << "_"; + result << "T=" << type << "_"; result << "#N=" << num_nodes << "_"; result << "#S=" << num_subgraphs << "_"; result << "targetDevice=" << targetDevice; @@ -107,7 +109,8 @@ std::string AddSinhPair::getTestCaseName(testing::TestParamInfo input_shapes; - std::tie(input_shapes, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + ov::element::Type type; + std::tie(input_shapes, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); std::vector is; for (const auto& s : input_shapes) { is.emplace_back(InputShape {{}, {s, }}); @@ -115,6 +118,7 @@ void AddSinhPair::SetUp() { init_input_shapes(is); auto f = ov::test::snippets::AddSinhFunction({input_shapes[0], input_shapes[1]}); function = f.getOriginal(); + setInferenceType(type); } TEST_P(Add, CompareWithRefImpl) { diff --git a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp index fc376251006e49..30560a943cfe31 100644 --- a/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp +++ b/src/tests/functional/shared_test_classes/src/base/snippets_test_utils.cpp @@ -5,7 +5,6 @@ #include "shared_test_classes/base/snippets_test_utils.hpp" #include "functional_test_utils/skip_tests_config.hpp" #include "exec_graph_info.hpp" -#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" namespace ov { namespace test { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index b11ea9a513a790..a1254dfaa80521 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -72,8 +72,9 @@ class AddSinhConstFunction : public SnippetsFunctionBase { // The function is needed to check different input element types (model precision change) class AddRollConstFunction : public SnippetsFunctionBase { public: - explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "Only static shapes are supported"); } protected: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index 652172c8a66eff..6fa4648a5548a9 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -56,9 +56,10 @@ std::shared_ptr AddSinhConstFunction::initOriginal() const { return std::make_shared(NodeVector{add}, ParameterVector{data0}); } std::shared_ptr AddRollConstFunction::initOriginal() const { - auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const auto input_shape = input_shapes[0].get_shape(); + auto data0 = std::make_shared(precision, input_shape); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shape, const_values); auto shift = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); auto axes = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto roll0 = std::make_shared(data0, shift, axes);