From 3bd3fd15ddf46a12c0ab02a05e7446475abfbff3 Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Sun, 31 Jul 2022 16:18:12 +0100 Subject: [PATCH] Snippets increase subgraph size (#3) - Implement static TileScheduler to handle compile params processing. Now compile params are accessed only here - TileScheduler should emit code only for necessary scalar/vector Tiles - Perform abstract-to-physical register mapping in one place (currently KernelEmitter constructor) - Implement more precise register mapping, so larger subgraphs could be created (now up to 12 i/o regs instead of 7) --- .../snippets/include/snippets/emitter.hpp | 2 + .../snippets/include/snippets/op/tile.hpp | 5 +- .../include/snippets/op/tile_scheduler.hpp | 39 ++ .../snippets/pass/assign_registers.hpp | 2 +- .../include/snippets/snippets_isa.hpp | 1 + src/common/snippets/src/generator.cpp | 44 +- src/common/snippets/src/op/tile.cpp | 2 +- src/common/snippets/src/op/tile_scheduler.cpp | 10 + .../snippets/src/pass/assign_registers.cpp | 69 +- .../snippets/src/pass/collapse_subgraph.cpp | 2 +- .../snippets/tests/src/lowering_utils.cpp | 1 + src/common/snippets/tests/src/registers.cpp | 43 +- .../intel_cpu/src/emitters/cpu_generator.cpp | 1 + .../intel_cpu/src/emitters/jit_emitter.cpp | 4 + .../intel_cpu/src/emitters/jit_emitter.hpp | 1 + .../src/emitters/jit_snippets_emitters.cpp | 605 ++++++++++++++++++ .../src/emitters/jit_snippets_emitters.hpp | 543 ++++------------ .../shared_tests_instances/snippets/add.cpp | 42 +- .../snippets/max_num_params_eltwise.cpp | 26 + .../snippets/three_inputs_eltwise.cpp | 36 +- .../plugin/shared/include/snippets/add.hpp | 15 + .../snippets/max_num_params_eltwise.hpp | 31 + .../plugin/shared/src/snippets/add.cpp | 92 ++- .../src/snippets/max_num_params_eltwise.cpp | 49 ++ .../src/snippets/three_inputs_eltwise.cpp | 72 +-- .../include/subgraph_simple.hpp | 39 +- .../src/subgraph_simple.cpp | 30 + 27 files changed, 1189 insertions(+), 617 deletions(-) create mode 100644 
src/common/snippets/include/snippets/op/tile_scheduler.hpp create mode 100644 src/common/snippets/src/op/tile_scheduler.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp create mode 100644 src/tests/functional/plugin/cpu/shared_tests_instances/snippets/max_num_params_eltwise.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp diff --git a/src/common/snippets/include/snippets/emitter.hpp b/src/common/snippets/include/snippets/emitter.hpp index 2ba0f85c5deda8..99c09d9d61d1bf 100644 --- a/src/common/snippets/include/snippets/emitter.hpp +++ b/src/common/snippets/include/snippets/emitter.hpp @@ -51,5 +51,7 @@ class Emitter { virtual ~Emitter() = default; }; +using AllocatedEmitter = std::pair, ngraph::snippets::RegInfo>; + } // namespace snippets } // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp index 9620b81421fdff..41f06eb9e01cf9 100644 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ b/src/common/snippets/include/snippets/op/tile.hpp @@ -20,14 +20,13 @@ class Tile : public ngraph::op::Op { public: OPENVINO_OP("Tile", "SnippetsOpset"); - Tile(const std::vector, ngraph::snippets::RegInfo>>& region); + Tile(const std::vector& region); Tile() = default; - std::vector, ngraph::snippets::RegInfo>> region; + std::vector region; std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { return std::make_shared(region); } - const void *compile_params; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp new file mode 100644 index 00000000000000..9d6010f77978b0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/tile_scheduler.hpp @@ -0,0 +1,39 @@ 
+// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "tile.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface TileScheduler + * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations + * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data + * have to be read several times (broadcasting). + * @ingroup snippets + */ +class TileScheduler : public ngraph::op::Op { +public: + OPENVINO_OP("TileScheduler", "SnippetsOpset"); + + TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); + TileScheduler() = default; + AllocatedEmitter vector_region; + AllocatedEmitter scalar_region; + // todo: this clone_with_new_inputs is irrelevant + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { + return std::make_shared(vector_region, scalar_region); + } + const void *compile_params; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/assign_registers.hpp b/src/common/snippets/include/snippets/pass/assign_registers.hpp index fb3672fe389536..0eff4bcc7d7033 100644 --- a/src/common/snippets/include/snippets/pass/assign_registers.hpp +++ b/src/common/snippets/include/snippets/pass/assign_registers.hpp @@ -18,7 +18,7 @@ namespace pass { */ class AssignRegisters : public ngraph::pass::FunctionPass { public: - AssignRegisters() { + explicit AssignRegisters() { set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true); } bool run_on_model(const std::shared_ptr& m) override; diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index da94fec0980d3a..7277137e49de2c 100644 --- 
a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -20,6 +20,7 @@ #include "op/powerstatic.hpp" #include "op/store.hpp" #include "op/tile.hpp" +#include "op/tile_scheduler.hpp" #include "op/vectorload.hpp" #include "op/vectorstore.hpp" diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 44a69470134279..b7f81a1e4da017 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -17,7 +17,8 @@ auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph: auto rt = n->get_rt_info(); // ToDo: change to reg_t - std::vector rout; + std::vector rin, rout; + auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { for (auto reg : it_rt->second.as>()) { @@ -25,12 +26,11 @@ auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph: } } - std::vector rin; - for (auto input : n->inputs()) { + for (const auto& input : n->inputs()) { auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { + for (auto& reg : it_rt->second.as>()) { rin.push_back(reg); } } @@ -48,13 +48,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptrget_results(); auto in = params.size(); auto out = results.size(); - auto nptrs = in + out; OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") // vector tile - std::vector, ngraph::snippets::RegInfo>> lowered; + std::vector lowered; for (auto n : m->get_ordered_ops()) { - lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); + lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); } OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") @@ -65,34 +64,29 @@ ngraph::snippets::code 
ngraph::snippets::Generator::generate(std::shared_ptr(); mng.run_passes(m_scalar); OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector, RegInfo>> scalar_lowered; + std::vector scalar_lowered; for (auto n : m_scalar->get_ordered_ops()) { - scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); + scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); } OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D") - // wrapping into tiles1D - std::vector, RegInfo>> tiles1D; - auto tile = std::make_shared(lowered); - tile->compile_params = compile_params; - tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector({target->get_lanes(), 0, nptrs, 1}), std::vector{}))); - tile = std::make_shared(scalar_lowered); - tile->compile_params = compile_params; - tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector{{1, target->get_lanes(), nptrs, 1}}, std::vector{}))); + const auto& vector_tile = std::make_shared(lowered); + const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), + std::make_pair(std::vector{target->get_lanes()}, std::vector{})); + const auto& scalar_tile = std::make_shared(scalar_lowered); + const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), + std::make_pair(std::vector{1}, std::vector{})); OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") // wrapping into tiles2D - std::vector, RegInfo>> tiles2D; - tile = std::make_shared(tiles1D); - tile->compile_params = compile_params; - tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile), - std::make_pair(std::vector({1, 0, nptrs, 0}), std::vector{}))); + auto tile_scheduler = 
std::make_shared(vector_region, scalar_region); + tile_scheduler->compile_params = compile_params; + const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), + std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") // emission - auto tiles2DKernel = std::make_shared(tiles2D); + auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); tiles2DKernel->compile_params = compile_params; std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); kernel->emit_code({in, out}, {}); diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp index c17b0b0c8163c5..a57c0a9687c4e5 100644 --- a/src/common/snippets/src/op/tile.cpp +++ b/src/common/snippets/src/op/tile.cpp @@ -8,5 +8,5 @@ using namespace std; using namespace ngraph; -snippets::op::Tile::Tile(const std::vector, snippets::RegInfo>>& nested) : Op(), region(nested) { +snippets::op::Tile::Tile(const std::vector& nested) : Op(), region(nested) { } diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp new file mode 100644 index 00000000000000..fd0ba9e6a23223 --- /dev/null +++ b/src/common/snippets/src/op/tile_scheduler.cpp @@ -0,0 +1,10 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/tile_scheduler.hpp" +#include "snippets/generator.hpp" + +ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region) + : Op(), vector_region{vector_region}, scalar_region{scalar_region} { +} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 8f229b137bbdfb..cb72e1bdb52b0a 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ 
b/src/common/snippets/src/pass/assign_registers.cpp @@ -16,7 +16,6 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_FUNCTION_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") - int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 using Reg = size_t; auto ops = f->get_ordered_ops(); decltype(ops) stmts; @@ -26,8 +25,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr size_t rdx = 0; std::map, Reg> regs; - for (auto op : stmts) { - for (auto output : op->outputs()) { + for (const auto& op : stmts) { + for (const auto& output : op->outputs()) { regs[output.get_tensor_ptr()] = rdx++; } } @@ -35,9 +34,9 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::vector> used; std::vector> def; - for (auto op : stmts) { + for (const auto& op : stmts) { std::set u; - for (auto input : op->inputs()) { + for (const auto& input : op->inputs()) { if (regs.count(input.get_tensor_ptr())) { u.insert(regs[input.get_tensor_ptr()]); } @@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::set d; if (!std::dynamic_pointer_cast(op)) { - for (auto output : op->outputs()) { + for (const auto& output : op->outputs()) { d.insert(regs[output.get_tensor_ptr()]); } } @@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr for (size_t n = 0; n < stmts.size(); n++) { auto node = stmts[n]; if (!std::dynamic_pointer_cast(node)) { - for (auto out : node->outputs()) { - for (auto port : out.get_target_inputs()) { + for (const auto& out : node->outputs()) { + for (const auto& port : out.get_target_inputs()) { auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); if (pos != stmts.end()) { auto k = pos-stmts.begin(); @@ -136,46 +135,32 @@ bool 
ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr std::map, Reg> physical_regs; - for (auto reg : regs) { + for (const auto& reg : regs) { physical_regs[reg.first] = register_map[reg.second]; } - - size_t constantID = 0; - - for (auto n : f->get_ordered_ops()) { + const auto num_parameters = f->get_parameters().size(); + for (const auto& n : f->get_ordered_ops()) { auto& rt = n->get_rt_info(); - // nothing to do for model signature - if (std::dynamic_pointer_cast(n) || std::dynamic_pointer_cast(n)) { - continue; - } - - // store only effective address - if (auto result = std::dynamic_pointer_cast(n)) { - auto ea = reg64_tmp_start+static_cast(f->get_result_index(result) + f->get_parameters().size()); - rt["effectiveAddress"] = ea; + std::vector regs; + regs.reserve(n->outputs().size()); + /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are + * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. + * Note also that Parameter and Result store general-purpose register index, because they work with memory + * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are + * performed on registers. 
+ */ + if (is_type(n)) { continue; - } - // store effective address and procced with vector registers - if (ov::as_type_ptr(n) || ov::as_type_ptr(n)) { - auto source = n->get_input_source_output(0).get_node_shared_ptr(); - - if (auto param = ov::as_type_ptr(source)) { - auto ea = reg64_tmp_start+static_cast(f->get_parameter_index(param)); - rt["effectiveAddress"] = ea; - } else if (auto constant = ov::as_type_ptr(source)) { - auto ea = reg64_tmp_start+static_cast(f->get_parameters().size() + f->get_results().size() + 1 + constantID); - rt["effectiveAddress"] = ea; - constantID++; - } else { - throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant"); + } else if (const auto& param = ov::as_type_ptr(n)) { + regs.push_back(f->get_parameter_index(param)); + } else if (const auto& store = ov::as_type_ptr(n)) { + regs.push_back(f->get_result_index(store) + num_parameters); + } else { + for (const auto& output : n->outputs()) { + auto allocated = physical_regs[output.get_tensor_ptr()]; + regs.push_back(allocated); } } - - std::vector regs; regs.reserve(n->outputs().size()); - for (auto output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); - } rt["reginfo"] = regs; } diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 49cb66b610ee8f..bc3eea81861d89 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -477,7 +477,7 @@ TokenizeSnippets::TokenizeSnippets() { throw ngraph_error("body results and node results size mismatch during subgraph collaps"); } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() > 7) { + if (body_parameters.size() + body_results.size() > 12) { const std::string message_reset = "new subgraph is created. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index bdbfe41d6dd45c..ac388390a6d5ad 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -35,6 +35,7 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; } std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 89e4e4768ff60e..2eb5cddd84fb9f 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ b/src/common/snippets/tests/src/registers.cpp @@ -25,12 +25,14 @@ TEST(TransformationTests, AssignRegisters) { { auto p0 = std::make_shared(element::f32, Shape(1)); auto p1 = std::make_shared(element::f32, Shape(1)); + p0->set_friendly_name("p00"); + p1->set_friendly_name("p01"); auto y00 = std::make_shared(p0); y00->set_friendly_name("y00"); auto y01 = std::make_shared(p1); y01->set_friendly_name("y01"); auto y02 = std::make_shared(y00, y01); y02->set_friendly_name("y02"); - auto y03 = std::make_shared(y02); y03->set_friendly_name("y03"); - - f = std::make_shared(NodeVector{y03}, ParameterVector{p0, p1}); + auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); + s00->set_friendly_name("s00"); + f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); pass::Manager m; 
m.register_pass(); @@ -39,13 +41,17 @@ TEST(TransformationTests, AssignRegisters) { ASSERT_NO_THROW(check_rt_info(f)); } - // instead of comparing to a reference function check that registers are correctly assigned - // and stored to runtime info + /* Instead of comparing to a reference function check that registers are correctly assigned and stored to runtime + * info. Note that Parameters and Store rt_info contains gpr indexes, while general op's rt_info contain vector + * indexes */ { std::map ref_registers { + {"p00", 0}, // gpr + {"p01", 1}, // gpr {"y00", 0}, {"y01", 1}, - {"y02", 2} + {"y02", 2}, + {"s00", 2}, // gpr }; auto total_ops = 0; @@ -75,6 +81,14 @@ TEST(TransformationTests, AssignRegisters2) { auto p5 = std::make_shared(ngraph::element::f32, Shape()); auto p6 = std::make_shared(ngraph::element::f32, Shape()); auto p7 = std::make_shared(ngraph::element::f32, Shape()); + p0->set_friendly_name("p00"); + p1->set_friendly_name("p01"); + p2->set_friendly_name("p02"); + p3->set_friendly_name("p03"); + p4->set_friendly_name("p04"); + p5->set_friendly_name("p05"); + p6->set_friendly_name("p06"); + p7->set_friendly_name("p07"); auto c0 = std::make_shared(ngraph::element::f32, Shape(), 3.14f); c0->set_friendly_name("r00"); auto c1 = std::make_shared(ngraph::element::f32, Shape(), 6.6260701e-34f); c1->set_friendly_name("r01"); @@ -102,9 +116,10 @@ TEST(TransformationTests, AssignRegisters2) { auto y20 = std::make_shared(y17, y18); y20->set_friendly_name("r22"); auto y21 = std::make_shared(y15, y19); y21->set_friendly_name("r23"); auto y22 = std::make_shared(y20, y21); y22->set_friendly_name("r24"); - auto y23 = std::make_shared(y22); + auto s00 = std::make_shared(y22); + s00->set_friendly_name("s00"); - f = std::make_shared(NodeVector{y23}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); pass::Manager m; m.register_pass(); @@ -117,10 +132,14 @@ TEST(TransformationTests, 
AssignRegisters2) { // and stored to runtime info { std::map ref_registers { - {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, {"r06", 6}, {"r07", 6}, - {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, - {"r16", 0}, {"r17", 4}, {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, - {"r24", 1} + {"p00", 0}, {"p01", 1}, {"p02", 2}, {"p03", 3}, {"p04", 4}, {"p05", 5}, + {"p06", 6}, {"p07", 7}, + {"r00", 1}, {"r01", 3}, {"r02", 5}, {"r03", 5}, {"r04", 2}, {"r05", 6}, + {"r06", 6}, {"r07", 6}, {"r08", 5}, {"r09", 2}, {"r10", 1}, {"r11", 4}, + {"r12", 4}, {"r13", 6}, {"r14", 2}, {"r15", 5}, {"r16", 0}, {"r17", 4}, + {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, + {"r24", 1}, + {"s00", 8}, }; auto total_ops = 0; diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index b6e5fb3b2ec6cd..b5975c245abfdc 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -118,6 +118,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); + jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp index 50f2674fb111b4..91079b55da46c8 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.cpp @@ -46,6 +46,10 @@ size_t jit_emitter::aux_vecs_count() const { return 0; } +emitter_in_out_map jit_emitter::get_in_out_type() const { + return 
in_out_type_; +} + size_t jit_emitter::aux_gprs_count() const { // We need one gpr to load table address return entry_map_.empty() ? 0 : 1; diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp index f0f460d51713a5..74fe712ddd6f9f 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp @@ -55,6 +55,7 @@ class jit_emitter : public ngraph::snippets::Emitter { const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); virtual size_t get_inputs_num() const = 0; virtual size_t aux_vecs_count() const; + emitter_in_out_map get_in_out_type() const; static std::set get_supported_precisions(); protected: diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp new file mode 100644 index 00000000000000..a93cb3653d0e25 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -0,0 +1,605 @@ +// Copyright (C) 2020-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "jit_snippets_emitters.hpp" + +using namespace Xbyak; + +namespace ov { +namespace intel_cpu { +jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, + std::set& vecs_used, std::set& gprs_used) { + if (body.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; + auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { + std::vector physical_regs(abstract_regs.size()); + for (size_t i = 0; i < abstract_regs.size(); i++) + physical_regs[i] = 
regs_pool.at(abstract_regs[i]); + return physical_regs; + }; + for (auto& code : body) { + const auto& emitter = code.first; + std::vector in_abstract_regs, out_abstract_regs; + std::tie(in_abstract_regs, out_abstract_regs) = code.second; + std::vector in_physical_regs, out_physical_regs; + switch (std::dynamic_pointer_cast(emitter)->get_in_out_type()) { + case gpr_to_gpr: + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile. + // Input registers are not mapped in this case, since they contain utility info + // (num_params, tile increment, etc.), but not reg indexes. + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); + gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case gpr_to_vec: + // Load Emitters + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); + gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case vec_to_gpr: + // Store Emitters + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); + vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + case vec_to_vec: + // Regular operations + in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); + out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); + vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); + vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + break; + default: + IE_THROW() << "Unhandled in_out type"; + } + code.second = 
std::make_pair(in_physical_regs, out_physical_regs); + if (auto container = std::dynamic_pointer_cast(code.first)) + container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + } +} + +KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto kernel = ov::as_type_ptr(n); + if (!kernel) + IE_THROW() << "KernelEmitter invoked with invalid op argument"; + if (kernel->region.empty()) + IE_THROW() << "KernelEmitter invoked with empty body"; + body = kernel->region; + if (!kernel->compile_params) + IE_THROW() << "KernelEmitter invoked without compile_params"; + jcp = *reinterpret_cast(kernel->compile_params); + // Initialize pools of gp and vec registers + gp_regs_pool.resize(16); + vec_regs_pool.resize(16); + std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); + std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { + // It's important to keep the order of other elements + pool.erase(std::remove_if(pool.begin(), pool.end(), + [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + }; + // Reserve stack base and pointer for push(...) and pop(...) 
operations + // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel + remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, + static_cast(dnnl::impl::cpu::x64::abi_param1.getIdx()), + static_cast(dnnl::impl::cpu::x64::abi_param2.getIdx())}); + std::set vecs_used, gprs_used; + map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); + remove_regs_from_pool(gp_regs_pool, gprs_used); + remove_regs_from_pool(vec_regs_pool, vecs_used); + // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs + gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); +} + +void KernelEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} + +void KernelEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != 2) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (!out.empty()) + IE_THROW() << "KKernelEmitter got invalid number of outputs. 
Expected 0, got " << out.size(); +} + +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + const int64_t harness_num_dims = jcp.output_dims.size() - 1; + auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { + for (int j = 0; j < harness_num_dims; j++) { + if (jcp.output_dims[j] != 1 && offsets[j] != 0) { + h->mov(reg_tmp, offsets[j]); + h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); + h->add(pointer, reg_tmp); + } + } + }; + for (auto i = 0; i < num_params; i++) { + if (i < num_inputs) + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); + else + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then + Reg64 reg_tmp = i < num_params-1 ? 
data_ptr_regs.back() : reg_const_params; + init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + } +} +void KernelEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& allocated_vec_regs, + const std::vector& allocated_gp_regs, + const ov::intel_cpu::emitter_context *emit_context) const { + h->preamble(); + + const size_t num_inputs = in[0]; + const size_t num_outputs = in[1]; + + Reg64 reg_indexes = Reg64(dnnl::impl::cpu::x64::abi_param1.getIdx()); + Reg64 reg_const_params = Reg64(dnnl::impl::cpu::x64::abi_param2.getIdx()); + std::vector data_ptr_regs(gp_regs_used.size()); + std::transform(gp_regs_used.begin(), gp_regs_used.end(), data_ptr_regs.begin(), [](size_t idx){return Reg64(static_cast(idx));}); + + init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); + // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. + // we need a more elegant approach to avoid a full copy here + auto local_gpr_pool = gp_regs_pool; + local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); + local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); + for (const auto& c : body) { + const auto& emitter = c.first; + std::vector in_regs, out_regs; + std::tie(in_regs, out_regs) = c.second; + if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) + out_regs = gp_regs_used; + emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + } + h->postamble(); +} + +TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto tile_scheduler = ov::as_type_ptr(n); + if (!tile_scheduler) + IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; + if (!tile_scheduler->compile_params) + IE_THROW() << "TileEmitter invoked without compile_params"; + body = 
{tile_scheduler->vector_region, tile_scheduler->scalar_region}; + jcp = *reinterpret_cast(tile_scheduler->compile_params); +} +void TileSchedulerEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} +void TileSchedulerEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != 3) + IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); + if (out.size() != in[0] + in[1]) + IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. Expected " << in[0] + in[1] << " , got " << out.size(); + if (body.size() != 2) + IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); + if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) + IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; +} + +void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, size_t vector_size, + const std::vector& vec_pool, const std::vector& gpr_pool) const { + const auto& vector_tile = body[0]; + const auto& scalar_tile = body[1]; + const auto& vector_tile_body = std::dynamic_pointer_cast(vector_tile.first)->get_nested_code(); + const auto& scalar_tile_body = std::dynamic_pointer_cast(scalar_tile.first)->get_nested_code(); + const size_t inner_work_amount = jcp.scheduler_dims[1]; + + auto process_tile = + [&](const bool evaluate_once, const std::vector& body, const AllocatedEmitter& tile) { + // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks + if (evaluate_once) { + for (auto& code : body) + code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); + } else { + 
std::vector in_regs, out_regs; + std::tie(in_regs, out_regs) = tile.second; + // pass work_amount reg to Tile + in_regs.push_back(static_cast(reg_inner_amount.getIdx())); + tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); + } + }; + bool vector_evaluate_once = false; + if (inner_work_amount >= vector_size) { + vector_evaluate_once = inner_work_amount < 2 * vector_size; + // Need to set proper work amount for inner tiles if evaluated multiple times + if (!vector_evaluate_once) + h->mov(reg_inner_amount, inner_work_amount); + process_tile(vector_evaluate_once, vector_tile_body, vector_tile); + } + if (inner_work_amount % vector_size >= 1) { + bool scalar_evaluate_once = inner_work_amount % vector_size < 2; + if (!scalar_evaluate_once) { + // vector_tile is not executed, work_amount is not set + if (inner_work_amount < vector_size) + h->mov(reg_inner_amount, inner_work_amount); + // vector_tile is executed, but work_amount is neither set nor decremented appropriately. + else if (vector_evaluate_once) + h->mov(reg_inner_amount, inner_work_amount - vector_size); + // else: vector_tile is executed multiple times, so work_amount is already set + } + process_tile(scalar_evaluate_once, scalar_tile_body, scalar_tile); + } +} + +void TileSchedulerEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& vec_pool, + const std::vector& gpr_pool, + const ov::intel_cpu::emitter_context *emit_context) const { + const size_t num_inputs = in[0]; + const size_t num_outputs = in[1]; + const size_t vector_size = in[2]; + const size_t num_params = num_inputs + num_outputs; + const auto& data_ptr_reg_idxs(out); + std::vector data_ptr_regs(data_ptr_reg_idxs.size()); + std::transform(data_ptr_reg_idxs.begin(), data_ptr_reg_idxs.end(), data_ptr_regs.begin(), [](size_t idx){return Reg64(static_cast(idx));}); + + // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. 
+ // we need a more elegant approach to avoid a full copy here. Similar problem is demonstrated in KernelEmitter + auto local_gpr_pool = gpr_pool; + Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); + local_gpr_pool.pop_back(); + Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); + local_gpr_pool.pop_back(); + Label for_body; + const size_t outer_work_amount = jcp.scheduler_dims[0]; + if (outer_work_amount == 1) { + // emit code directly without looping over external dim + emit_tiles(reg_inner_amount, vector_size, vec_pool, local_gpr_pool); + } else if (outer_work_amount > 1) { + // We need to create a Loop in this case + h->mov(reg_outer_amount, outer_work_amount); + h->L(for_body); + { + emit_tiles(reg_inner_amount, vector_size, vec_pool, local_gpr_pool); + + // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers + // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). + // To overcome this limitation, we add appropriate negative offsets if necessary. 
+ for (auto i = 0; i < num_params; i++) { + if (jcp.scheduler_offsets[i] != 0) { + h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); + } + } + // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) + h->sub(reg_outer_amount, 1); + h->cmp(reg_outer_amount, 1); + h->jge(for_body, CodeGenerator::T_NEAR); + } + } +} + +std::vector& TileEmitter::get_nested_code() { + return body; +} + +TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { + const auto tile = ov::as_type_ptr(n); + if (!tile) + IE_THROW() << "TileEmitter invoked with invalid op argument"; + body = tile->region; +} + +void TileEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, pool, gpr, nullptr); +} + +void TileEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != 2) + IE_THROW() << "TileEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (!out.empty()) + IE_THROW() << "TileEmitter got invalid number of outputs. 
Expected 0" << " , got " << out.size(); +} + +void TileEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& vec_pool, + const std::vector& gpr_pool, + const ov::intel_cpu::emitter_context *emit_context) const { + const size_t inc = in[0]; + Reg64 work_amount = Reg64(static_cast(in[1])); + Label for_body; + + // Note that: + // * Work amount must be set by TileScheduler that executes Tiles + // * TileScheduler executes Tile only if it has to perform >= 1 iterations + h->L(for_body); + { + for (auto& code : body) + code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); + h->sub(work_amount, inc); + h->cmp(work_amount, inc); + h->jge(for_body, CodeGenerator::T_NEAR); + } +} + +FakeBroadcastEmitter::FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + if (n->get_input_shape(0).empty()) + use_broadcast = true; + else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin()) + use_broadcast = true; + else + use_broadcast = false; +} + +void FakeBroadcastEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void FakeBroadcastEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Vmm vmm_src0 = Vmm(in[0]); + Vmm vmm_dst = Vmm(out[0]); + + if (use_broadcast) { + h->uni_vbroadcastss(vmm_dst, Xmm(in[0])); + } else { + h->uni_vmovups(vmm_dst, vmm_src0); + } +} + 
+ScalarEmitter::ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr(n)->cast_vector()[0]); + push_arg_entry_of("scalar", value, true); + prepare_table(); +} + +void ScalarEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void ScalarEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Vmm vmm_dst = Vmm(out[0]); + h->uni_vbroadcastss(vmm_dst, table_val("scalar")); +} + + +MemoryEmitter::MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { +} + +StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + in_out_type_ = emitter_in_out_map::vec_to_gpr; +} + +void StoreEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void StoreEmitter::emit_isa(const 
std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 out_reg(static_cast(out[0])); + Vmm vmm_src0 = Vmm(in[0]); + h->uni_vmovups(h->ptr[out_reg], vmm_src0); + h->add(out_reg, dnnl::impl::cpu::x64::cpu_isa_traits::vlen); +} + +ScalarStoreEmitter::ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + in_out_type_ = emitter_in_out_map::vec_to_gpr; +} + +void ScalarStoreEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void ScalarStoreEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 out_reg(static_cast(out[0])); + Xmm vmm_src0 = Xmm(in[0]); + h->uni_vmovss(h->ptr[out_reg], vmm_src0); + h->add(out_reg, sizeof(float)); +} + +LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) + : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { + in_out_type_ = emitter_in_out_map::gpr_to_vec; +} + +void LoadEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == 
dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void LoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(static_cast(in[0])); + Vmm vmm_src0 = Vmm(out[0]); + h->uni_vmovups(vmm_src0, h->ptr[in_reg]); + + if (shouldPostIncrement) { + h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits::vlen); + } +} + +BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { + in_out_type_ = emitter_in_out_map::gpr_to_vec; +} + +void BroadcastLoadEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(in[0]); + Vmm vmm_src0 = Vmm(out[0]); + + // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, + // key point here is not to add post-increment, it might be fixed by some other approach in future + h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]); +} + + +ScalarLoadEmitter::ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) + : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { + in_out_type_ = 
emitter_in_out_map::gpr_to_vec; +} + +void ScalarLoadEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_common) { + emit_isa(in, out); + } else { + IE_THROW() << host_isa_; + assert(!"unsupported isa"); + } +} + +template +void ScalarLoadEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(static_cast(in[0])); + Xmm vmm_src0 = Xmm(out[0]); + h->uni_vmovss(vmm_src0, h->ptr[in_reg]); + + // Doesn't work if the same pointer comes with multiple load operations + if (shouldPostIncrement) { + h->add(in_reg, sizeof(float)); + } +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index c078fa68003cd7..79f862decfdebc 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -10,11 +10,13 @@ #include "jit_emitter.hpp" using namespace Xbyak; +using ngraph::snippets::AllocatedEmitter; namespace ov { namespace intel_cpu { -#define SNIPPETS_MAX_SNIPPETS_DIMS 7 + +#define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) @@ -30,11 +32,27 @@ struct jit_snippets_compile_args { std::vector output_dims = {}; }; /// -/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel calculates appropriate data offsets, -/// and invokes enclosed outer Tiles. 
Only 2d Tiles are currently supported, so the emitters should -/// be organized in the following way: -/// KernelEmitter { /* entry point */ -/// TileEmitter { /* outer tile */ +/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter, +/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping +/// (abstract to physical) and nested code access. +/// +class jit_container_emitter: public jit_emitter { +public: + jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n); +protected: + // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools + // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args). + void map_abstract_registers(const std::vector&, const std::vector&, + std::set&, std::set&); + std::vector body; +}; +/// +/// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register +/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) +/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ /// TileEmitter { /* inner vector tile */ /// ... /* All the necessary Load/Strore/elementwise emitters */ /// } @@ -43,255 +61,100 @@ struct jit_snippets_compile_args { /// } /// } /// } -/// Note that Kernel params are passed directly to the emit_code(). The vector of inputs should contain 2 arguments, the -/// output vector should be empty. 
Input parameters -/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs +/// Note that Kernel doesn't accept any input arguments. /// -// Todo: Scheduler dims and offsets are currently calculated in Subgraph node and passed to the KernelEmitter. -// However, it seems more natural to calculate all the offsets right in the Kernel op, because the calculation is -// not device-specific. It is based only on input/output dims (which we already know) and harness num dims -// (which we should pass from the plugin). It seems also better to wrap the enclosed emitters in tiles in the Kernel op -// and avoid creating empty tiles. -class KernelEmitter : public jit_emitter { +class KernelEmitter : public jit_container_emitter { public: KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - const auto kernel = ov::as_type_ptr(n); - if (!kernel) - IE_THROW() << "KernelEmitter invoked with invalid op argument"; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; - code = kernel->region; - jcp = *reinterpret_cast(kernel->compile_params); - } + const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} - - void emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); - } + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; private: - void validate_arguments(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. 
Expected 2, got " << in.size(); - if (out.size() != 0) - IE_THROW() << "KernelEmitter got unexpected output arguments."; - const size_t num_params = in[0] + in[1]; - if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS) - IE_THROW() << "KernelEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS << - " parameters, got " << num_params; - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) - IE_THROW() << "KernelEmitter supports harness with up to " << SNIPPETS_MAX_HARNESS_DIMS << - " dims, got " << harness_num_dims; - } - + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t num_params = num_inputs + num_outputs; - int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - - Reg64 reg_indexes { dnnl::impl::cpu::x64::abi_param_regs[0] }; - Reg64 reg_const_params { dnnl::impl::cpu::x64::abi_param_regs[1] }; - Xbyak::Reg64 reg_tmp_64 { dnnl::impl::cpu::x64::abi_not_param_reg }; - - h->preamble(); - - std::vector regs(num_params); - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { - h->mov(reg_tmp_64, offsets[j]); - h->imul(reg_tmp_64, h->ptr[reg_indexes + j * sizeof(size_t)]); - h->add(pointer, reg_tmp_64); - } - } - }; - for (auto i = 0; i < num_params; i++) { - regs[i] = Reg64(reg64_tmp_start + i); - if (i < num_inputs) - h->mov(regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); - else - h->mov(regs[i], 
h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - init_ptrs_with_offsets(regs[i], &jcp.data_offsets[i * harness_num_dims]); - } - - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, gpr); - } - - h->postamble(); - } + const ov::intel_cpu::emitter_context *emit_context) const override; + void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; - std::vector, ngraph::snippets::RegInfo>> code; + std::vector gp_regs_pool; + std::vector gp_regs_used; + std::vector vec_regs_pool; }; /// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it calculates the total number of iterations, performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. +/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets +/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector +/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. /// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. -/// \param in[1] Increment of the previous Tile in current dimension. Must be 0 if this is the first Tile. -/// So previous_inc is zero for outer and vector tiles (the are the first in dim) and vlen for scalar tiles (they usually go after vector Tiles). -/// \param in[2] sum number inputs and number of outputs of the node. -/// \param in[3] dimension of the tile. Note that only 2d Tile are currently supported, so dim is 0 for outer tiles, 1 for inner tiles. 
+/// \param in[0] The number of the node inputs +/// \param in[1] The number of the node outputs +/// \param in[2] The number of elements that fits into vector register /// -// Todo: Inner and outer tiles have different semantics. For example, outer tile always has the increment == 1, and it can contain only -// tile emitters (one outer or two inner). So it seems better to create different classes for inner and outer tiles. -// Todo: Currently data pointers incremented after each read/write in Load/Store emitters, so we have to decrement them here -// if the same data needs to be read twice. Better to move all the pointer increments to TileEmitter and avoid the increments if necessary. -class TileEmitter : public jit_emitter { + +class TileSchedulerEmitter : public jit_container_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - if (!tile->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - code = tile->region; - jcp = *reinterpret_cast(tile->compile_params); - } + TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} - - void emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); - } + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; private: - void validate_arguments(const std::vector &in, const std::vector &out, - const std::vector &pool = {}, const std::vector &gpr = {}) const override { - if (in.size() != 4) - IE_THROW() << "TileEmitter 
got invalid number of inputs. Expected 4, got " << in.size(); - if (out.size() != 0) - IE_THROW() << "TileEmitter got unexpected output arguments."; - const size_t num_params = in[2]; - if (num_params > SNIPPETS_MAX_SNIPPETS_DIMS) - IE_THROW() << "TileEmitter supports only up to " << SNIPPETS_MAX_SNIPPETS_DIMS << - " parameters, got " << num_params; - const size_t dim = in[3]; - if (dim >= SNIPPETS_MAX_TILE_RANK) - IE_THROW() << "TileEmitter supports tile ranks up to " << SNIPPETS_MAX_TILE_RANK << - " got " << dim; - } - + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - const size_t inc = in[0]; - const size_t previous_inc = in[1]; // increment of a previous tile in the same dim (0 if the first tile in the dim) - const size_t num_params = in[2]; - const size_t dim = in[3]; // tile dimension: 0 - outer, 1 - inner - const int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1 - Reg64 amount = Reg64(reg64_tmp_start + num_params); // amount - std::array for_body; - - // If R15 is not used, reserve it for use in scalar to avoid redundant push-pop's. - // todo: Do we need explicitly check that code contains ScalarEmitter? - std::vector local_gpr = reg64_tmp_start + num_params < 15 ? 
std::vector{15} : std::vector{}; - std::vector regs(num_params); - for (auto i = 0; dim == 0 && i < num_params; i++) - regs[i] = Reg64(reg64_tmp_start + i); - // Loop processing could be simplified in some cases - if (inc > jcp.scheduler_dims[dim]) { - return; - } else if (inc == jcp.scheduler_dims[dim]) { - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, local_gpr); - } - } else { - // The previous tile has done nothing, all the work is ours - if (previous_inc == 0 || previous_inc > jcp.scheduler_dims[dim]) { - h->mov(amount, jcp.scheduler_dims[dim]); - // The previous tile has done all the work - } else if (jcp.scheduler_dims[dim] % previous_inc == 0) { - return; - }// else: the previous tile has already set a proper work amount - h->cmp(amount, inc); - h->jl(for_body[0], CodeGenerator::T_NEAR); - - h->L(for_body[1]); - { - h->push(amount); - for (auto& c : code) { - c.first->emit_code(c.second.first, c.second.second, pool, local_gpr); - } - h->pop(amount); - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; dim == 0 && i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(regs[i], jcp.scheduler_offsets[i]); - } - } - h->sub(amount, inc); - h->cmp(amount, inc); - h->jge(for_body[1], CodeGenerator::T_NEAR); - } - - h->L(for_body[0]); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; + + void emit_tiles(const Reg64&, size_t, const std::vector& , const std::vector&) const; - // A = <42, 17> - // B = < 1, 17> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; ptr1 += vlan; // vector/scalar load - // } - // ptr0 -= 0*dom_1; - // ptr1 -= 1*dom_1; - // } - - // broadcast by MVD is extra case - // A = <42, 17> - // B = <42, 1> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; // broadcast load - // } - // ptr0 -= 0*dom_1; - // ptr1 += sizeof(ptr1[0]); //ptr1 -= -sizeof(ptr1[0]); - // } - - // A = <42, 17, 31> - // B = < 1, 17, 31> - // for (auto k = 0; k < dom_0; k++) { // 42 - // for (auto n = 0; n < dom_1; n++) { // 17 - // for (auto m = 0; m < dom_2; m++) { // 31 - // auto a = *ptr0; ptr0 += vlan; // vector/scalar load - // auto b = *ptr1; ptr1 += vlan; // vector/scalar load - // } - // } - // ptr0 -= 0*dom_1*dom2; - // ptr1 -= 1*dom_1*dom2; - // } jit_snippets_compile_args jcp; - std::vector, ngraph::snippets::RegInfo>> code; +}; + +/// +/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: +/// it performs operations specified by enclosed emitters, advances iteration counters +/// and breaks when necessary. +/// +/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. +/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. 
+class TileEmitter : public jit_container_emitter { +public: + TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + std::vector& get_nested_code(); + void emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; +private: + void validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const override; + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; }; class NopEmitter : public jit_emitter { @@ -313,15 +176,8 @@ class NopEmitter : public jit_emitter { class FakeBroadcastEmitter : public jit_emitter { public: - FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - if (n->get_input_shape(0).empty()) - use_broadcast = true; - else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin()) - use_broadcast = true; - else - use_broadcast = false; - } + FakeBroadcastEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + size_t get_inputs_num() const override {return 1;} private: @@ -329,32 +185,10 @@ class FakeBroadcastEmitter : public jit_emitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const 
ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_src0 = Vmm(in[0]); - Vmm vmm_dst = Vmm(out[0]); - - if (use_broadcast) { - h->uni_vbroadcastss(vmm_dst, Xmm(in[0])); - } else { - h->uni_vmovups(vmm_dst, vmm_src0); - } - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: bool use_broadcast; @@ -362,12 +196,7 @@ class FakeBroadcastEmitter : public jit_emitter { class ScalarEmitter : public jit_emitter { public: - ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n) { - value = dnnl::impl::cpu::x64::float2int(ov::as_type_ptr(n)->cast_vector()[0]); - push_arg_entry_of("scalar", value, true); - prepare_table(); - } + ScalarEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} @@ -379,26 +208,10 @@ class ScalarEmitter : public jit_emitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_dst = Vmm(out[0]); - h->uni_vbroadcastss(vmm_dst, table_val("scalar")); - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: int32_t value; @@ -415,34 
+228,17 @@ class ScalarEmitter : public jit_emitter { /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. class MemoryEmitter : public jit_emitter { public: - MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : jit_emitter(h, isa, n), ea(getEA(n)) { - } - + MemoryEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 1;} protected: - static auto getEA(const std::shared_ptr& n) -> size_t { - auto& rt = n->get_rt_info(); - size_t ea = 0; - auto it = rt.find("effectiveAddress"); - if (it != rt.end()) { - ea = it->second.as(); - } else { - throw ov::Exception("effective address for Load generation cannot be determined"); - } - return ea; - } - - size_t ea; +// static size_t getEA(const std::shared_ptr& n); +// size_t ea; }; class StoreEmitter : public MemoryEmitter { public: - StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } - + StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 1;} private: @@ -450,35 +246,15 @@ class StoreEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const 
std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 out_reg(ea); - Vmm vmm_src0 = Vmm(in[0]); - h->uni_vmovups(h->ptr[out_reg], vmm_src0); - h->add(out_reg, dnnl::impl::cpu::x64::cpu_isa_traits::vlen); - } + void emit_isa(const std::vector &in, const std::vector &out) const; }; class ScalarStoreEmitter : public MemoryEmitter { public: - ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } + ScalarStoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 1;} @@ -487,35 +263,15 @@ class ScalarStoreEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 out_reg(ea); - Xmm vmm_src0 = Xmm(in[0]); - h->uni_vmovss(h->ptr[out_reg], vmm_src0); - h->add(out_reg, sizeof(float)); - } + void emit_isa(const std::vector &in, const std::vector &out) const; }; class LoadEmitter : public MemoryEmitter { public: - LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { - } + LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, 
dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} @@ -524,31 +280,10 @@ class LoadEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Vmm vmm_src0 = Vmm(out[0]); - h->uni_vmovups(vmm_src0, h->ptr[in_reg]); - - if (shouldPostIncrement) { - h->add(in_reg, dnnl::impl::cpu::x64::cpu_isa_traits::vlen); - } - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: bool shouldPostIncrement; @@ -556,9 +291,7 @@ class LoadEmitter : public MemoryEmitter { class BroadcastLoadEmitter : public MemoryEmitter { public: - BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n) { - } + BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); size_t get_inputs_num() const override {return 0;} private: @@ -566,37 +299,16 @@ class BroadcastLoadEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if 
(host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Vmm vmm_src0 = Vmm(out[0]); - - // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future - h->uni_vbroadcastss(vmm_src0, h->ptr[in_reg]); - } + void emit_isa(const std::vector &in, const std::vector &out) const; }; class ScalarLoadEmitter : public MemoryEmitter { public: - ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) - : MemoryEmitter(h, isa, n), shouldPostIncrement(*n->get_input_shape(0).rbegin() != 1) { - } + ScalarLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + size_t get_inputs_num() const override {return 0;} private: @@ -604,36 +316,13 @@ class ScalarLoadEmitter : public MemoryEmitter { const std::vector& out, const std::vector& pool, const std::vector& gpr, - const ov::intel_cpu::emitter_context *emit_context) const override { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - IE_THROW() << host_isa_; - assert(!"unsupported isa"); - } - } + const ov::intel_cpu::emitter_context *emit_context) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(ea); - Xmm vmm_src0 = 
Xmm(out[0]); - h->uni_vmovss(vmm_src0, h->ptr[in_reg]); - - // Doesn't work if the same pointer comes with multiple load operations - if (shouldPostIncrement) { - h->add(in_reg, sizeof(float)); - } - } + void emit_isa(const std::vector &in, const std::vector &out) const; private: bool shouldPostIncrement; }; - } // namespace intel_cpu } // namespace ov diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/add.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/add.cpp index 56ba1a51c2b651..d7bc5d0de7e12e 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/add.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/add.cpp @@ -12,23 +12,31 @@ namespace snippets { namespace { - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(1), // one node - Add - ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - Add::getTestCaseName); - - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(3), // Add + 2 converts after inputs - ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - AddSinh::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(ov::Shape {1, 42, 16, 1}), + ::testing::Values(1), // one node - Add + ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Add::getTestCaseName); + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(ov::Shape {1, 42, 16, 1}), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinh::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 42, 16, 64}), + ::testing::Values(2), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinhConst::getTestCaseName); } // namespace } // namespace snippets diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/max_num_params_eltwise.cpp new file mode 100644 index 00000000000000..20c01c02be8fd3 --- /dev/null +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/max_num_params_eltwise.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/max_num_params_eltwise.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { +namespace { +// Note that we need these shapes to cover all cases of code emission (none/one/multiple of scalar/vector tiles) +std::vector input_shapes {{1, 64, 10, 10}, {1, 1, 17, 37}, {1, 1, 1, 1}, {1, 1, 1, 7}, + {1, 1, 1, 128}, {1, 1, 1, 14}, {1, 1, 1, 16}, {1, 1, 1, 30}}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, MaxNumParamsEltwiseSinh, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::Values(12), // 10 Sinh after inputs + Subgraph + Concat + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + 
MaxNumParamsEltwiseSinh::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/three_inputs_eltwise.cpp index c0c833268898fb..779db741cd258b 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/snippets/three_inputs_eltwise.cpp @@ -10,25 +10,25 @@ namespace test { namespace snippets { namespace { - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 64, 10, 10}), - ::testing::Values(ov::Shape {1, 64, 10, 1}), - ::testing::Values(ov::Shape {1, 1, 1, 10}), - ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes - ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ThreeInputsEltwise::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwise, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 64, 10, 10}), + ::testing::Values(ov::Shape {1, 64, 10, 1}), + ::testing::Values(ov::Shape {1, 1, 1, 10}), + ::testing::Values(2), // eltwises fuse only for non-broadcasted shapes + ::testing::Values(0), // SnippetsMarkSkipped disables tokenization for eltwise chains after inputs + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ThreeInputsEltwise::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 64, 10, 10}), - ::testing::Values(ov::Shape {1, 64, 10, 1}), - ::testing::Values(ov::Shape {1, 1, 1, 10}), - ::testing::Values(4), // Subgraph + 3 converts after inputs - ::testing::Values(1), // Subgraph is created, 
since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ThreeInputsEltwiseSinh::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, ThreeInputsEltwiseSinh, + ::testing::Combine( + ::testing::Values(ov::Shape {1, 64, 10, 10}), + ::testing::Values(ov::Shape {1, 64, 10, 1}), + ::testing::Values(ov::Shape {1, 1, 1, 10}), + ::testing::Values(4), // Subgraph + 3 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ThreeInputsEltwiseSinh::getTestCaseName); } // namespace } // namespace snippets diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index a3dbe852cde592..7f7001de94bf5d 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -18,6 +18,13 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + ov::Shape, // Input 0 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddConstParams; + class Add : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -32,6 +39,14 @@ class AddSinh : public Add { void SetUp() override; }; +class AddSinhConst : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp new file mode 100644 index 00000000000000..26640e58910512 --- /dev/null +++ 
b/src/tests/functional/plugin/shared/include/snippets/max_num_params_eltwise.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input Shape All shapes are replicated + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MaxNumParamsEltwiseParams; + +class MaxNumParamsEltwiseSinh : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index 896f03e78d05a1..1b8d1f8ecdfc8d 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -10,38 +10,61 @@ namespace ov { namespace test { namespace snippets { - std::string Add::getTestCaseName(testing::TestParamInfo obj) { - ov::Shape inputShapes0, inputShapes1, newInputShapes; - std::string targetDevice; - size_t num_nodes, num_subgraphs; - std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param; - - std::ostringstream result; - result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; - result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; - result << "#N=" << num_nodes << "_"; - result << "#S=" << num_subgraphs << "_"; - result << "targetDevice=" << targetDevice; - return result.str(); - } - - void Add::SetUp() { - ov::Shape inputShape0, inputShape1; - std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, 
{inputShape0, }}, {{}, {inputShape1, }}}); - - auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1}); - function = f.getOriginal(); - } - - void AddSinh::SetUp() { - ov::Shape inputShape0, inputShape1; - std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); - - auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1}); - function = f.getOriginal(); - } +std::string Add::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, newInputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Add::SetUp() { + ov::Shape inputShape0, inputShape1; + std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); + + auto f = ov::test::snippets::AddFunction({inputShape0, inputShape1}); + function = f.getOriginal(); +} + +void AddSinh::SetUp() { + ov::Shape inputShape0, inputShape1; + std::tie(inputShape0, inputShape1, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}}); + + auto f = ov::test::snippets::AddSinhFunction({inputShape0, inputShape1}); + function = f.getOriginal(); +} + +std::string AddSinhConst::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes, newInputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, 
num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSinhConst::SetUp() { + ov::Shape inputShape; + std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::AddSinhConstFunction({inputShape}); + function = f.getOriginal(); +} TEST_P(Add, CompareWithRefImpl) { run(); @@ -53,6 +76,11 @@ TEST_P(AddSinh, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddSinhConst, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp new file mode 100644 index 00000000000000..1140937be63359 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/max_num_params_eltwise.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void 
MaxNumParamsEltwiseSinh::SetUp() { + ov::Shape inputShape; + std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector expandedShapes(10, inputShape); + std::vector input_shapes; + for (const auto& s : expandedShapes) { + input_shapes.emplace_back(InputShape {{}, {s, }}); + } + + init_input_shapes(input_shapes); + + auto f = ov::test::snippets::EltwiseMaxNumParamsSinhFunction(expandedShapes); + function = f.getOriginal(); +} + +TEST_P(MaxNumParamsEltwiseSinh, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index ad1c3e74255938..276218e6150c57 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -10,42 +10,42 @@ namespace ov { namespace test { namespace snippets { - std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo obj) { - ov::Shape inputShapes0, inputShapes1, inputShapes2; - std::string targetDevice; - size_t num_nodes, num_subgraphs; - std::tie(inputShapes0, inputShapes1, inputShapes2, - num_nodes, num_subgraphs, targetDevice) = obj.param; - - std::ostringstream result; - result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; - result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; - result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; - result << "#N=" << num_nodes << "_"; - result << "#S=" << num_subgraphs << "_"; - result << "targetDevice=" << targetDevice; - return result.str(); - } - - void ThreeInputsEltwise::SetUp() { - ov::Shape inputShape0, inputShape1, inputShape2; - std::tie(inputShape0, inputShape1, inputShape2, - ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - 
init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); - - auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2}); - function = f.getOriginal(); - } - - void ThreeInputsEltwiseSinh::SetUp() { - ov::Shape inputShape0, inputShape1, inputShape2; - std::tie(inputShape0, inputShape1, inputShape2, - ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); - - auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2}); - function = f.getOriginal(); - } +std::string ThreeInputsEltwise::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, + num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void ThreeInputsEltwise::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + std::tie(inputShape0, inputShape1, inputShape2, + ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); + + auto f = ov::test::snippets::EltwiseThreeInputsFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); +} + +void ThreeInputsEltwiseSinh::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + std::tie(inputShape0, inputShape1, inputShape2, + ref_num_nodes, ref_num_subgraphs, 
targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape0, }}, {{}, {inputShape1, }}, {{}, {inputShape2, }}}); + + auto f = ov::test::snippets::EltwiseThreeInputsSinhFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); +} TEST_P(ThreeInputsEltwise, CompareWithRefImpl) { run(); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index f67a86966a4bd8..1ba86caa00f87e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -29,13 +29,14 @@ class AddFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; }; -/// Add separated from inputs by Sin to WA CPU-specific disabling after inputs. +/// Add separated from inputs by Sinh to WA CPU-specific disabling after inputs. /// Works because Sinh is not supported by tokenization yet. /// Tokenized simply by starting subgraph. 
// in1 in2 -// Sin Sinh +// Sinh Sinh // Add // Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhFunction : public SnippetsFunctionBase { public: explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { @@ -45,6 +46,21 @@ class AddSinhFunction : public SnippetsFunctionBase { std::shared_ptr initOriginal() const override; std::shared_ptr initReference() const override; }; +/// Like AddSinh but with a constant second input (and no Sinh on the constant) +// in1 const +// Sinh +// Add +// Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed +class AddSinhConstFunction : public SnippetsFunctionBase { +public: + explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +// std::shared_ptr initReference() const override; +}; /// Simple Eltwise graph fully convertible to Subgraph. /// Tokenized simply by attaching eltwises. // in1 in2 @@ -77,6 +93,7 @@ class EltwiseThreeInputsFunction : public SnippetsFunctionBase { }; /// EltwiseFunctionThreeInputs with Sinh after inputs to to WA CPU-specific disabling after inputs /// See AddSinh for details. +// todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { public: explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : @@ -86,6 +103,24 @@ class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { protected: std::shared_ptr initOriginal() const override; }; +/// Eltwise graph with 10 inputs and 2 outputs. +/// Needed to test for a max number of inputs+outputs allowed. +// in1 in2 in3 ... in10 +// Sinh Sinh Sinh ...Sinh +// ........................
+// Subtract Power +// \ Sinh +// Result +// todo: remove Sinh once "no subgraph after input" limitation is relaxed +class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { +public: + explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : + SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; /// MatMul with two eltwise branches joined with Add just before the Result. /// Tokenized by attaching eltwises to separate subgraphs, and then joining them together. // in1 in2 diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index 81c267f3745828..d20285b3de36ba 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -46,6 +46,14 @@ std::shared_ptr AddSinhFunction::initReference() const { ParameterVector{indata0, indata1})); return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); } +std::shared_ptr AddSinhConstFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + auto sin0 = std::make_shared(data0); + auto add = std::make_shared(sin0, const_data1); + return std::make_shared(NodeVector{add}, ParameterVector{data0}); +} std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); @@ -98,6 +106,28 @@ std::shared_ptr EltwiseThreeInputsSinhFunction::initOriginal() const auto mul = std::make_shared(add, sub); return 
std::make_shared(NodeVector{mul}, ParameterVector{data0, data1, data2}); } +std::shared_ptr EltwiseMaxNumParamsSinhFunction::initOriginal() const { + ParameterVector params; + std::vector> sinh; // 10 + for (const auto& shape : input_shapes) { + auto param = std::make_shared(precision, shape); + params.push_back(param); + sinh.push_back(std::make_shared(param)); + } + std::vector> add; // 5 + for (size_t i = 0; i < input_shapes.size() / 2; i++) { + add.push_back(std::make_shared(sinh[i * 2], sinh[i * 2 + 1])); + } + std::vector> mul; // 2 + for (size_t i = 0; i < add.size() / 2; i++) { + auto mul_node = std::make_shared(add[i * 2], add[i * 2 + 1]); + mul.push_back(mul_node); + } + auto sub = std::make_shared(mul[0], mul[1]); + auto power = std::make_shared(add.back(), sub); + auto exit_sinh = std::make_shared(power); + return std::make_shared(NodeVector{sub, exit_sinh}, params); +} std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { auto data_1 = std::make_shared(precision, input_shapes[0]);