Snippets increase subgraph size (#3)
- Implement static TileScheduler to handle compile-params processing; compile params are now accessed only there
- TileScheduler emits code only for the scalar/vector Tiles that are actually needed
- Perform abstract-to-physical register mapping in one place (currently the KernelEmitter constructor)
- Implement more precise register mapping so that larger subgraphs can be created (now up to 12 i/o registers instead of 7); see the sketch below
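A rough sketch of the "map abstract registers to physical ones in one place" idea from the last two bullets. The function below is an illustrative stand-in, not the actual KernelEmitter code; the pool layout and the helper name are assumptions:

#include <cstddef>
#include <map>
#include <stdexcept>
#include <vector>

// Abstract indices are whatever AssignRegisters stored in rt["reginfo"]; a single
// mapping step turns them into physical register numbers, so emitters no longer
// need a hard-coded gpr range (the old R8..R15 scheme).
std::map<size_t, size_t> map_abstract_to_physical(const std::vector<size_t>& abstract_regs,
                                                  const std::vector<size_t>& physical_pool) {
    std::map<size_t, size_t> mapping;
    size_t next_free = 0;
    for (const auto abstract : abstract_regs) {
        if (mapping.count(abstract))
            continue;                       // the same abstract reg always maps to the same physical one
        if (next_free == physical_pool.size())
            throw std::runtime_error("subgraph needs more registers than the pool provides");
        mapping[abstract] = physical_pool[next_free++];
    }
    return mapping;
}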
IvanNovoselov authored and a-sidorova committed Aug 16, 2022
1 parent b33f22c commit e23ada1
Showing 27 changed files with 1,189 additions and 617 deletions.
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/emitter.hpp
@@ -51,5 +51,7 @@ class Emitter {
virtual ~Emitter() = default;
};

using AllocatedEmitter = std::pair<std::shared_ptr<Emitter>, ngraph::snippets::RegInfo>;

} // namespace snippets
} // namespace ngraph
5 changes: 2 additions & 3 deletions src/common/snippets/include/snippets/op/tile.hpp
@@ -20,14 +20,13 @@ class Tile : public ngraph::op::Op {
public:
OPENVINO_OP("Tile", "SnippetsOpset");

Tile(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
Tile(const std::vector<AllocatedEmitter>& region);
Tile() = default;
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
std::vector<AllocatedEmitter> region;

std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
return std::make_shared<Tile>(region);
}
const void *compile_params;
};

} // namespace op
39 changes: 39 additions & 0 deletions src/common/snippets/include/snippets/op/tile_scheduler.hpp
@@ -0,0 +1,39 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/op.hpp"
#include "snippets/emitter.hpp"
#include "tile.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/**
* @interface TileScheduler
* @brief Contains a set of Tiles (currently one vector and one scalar) and performs the necessary preparations
* before the Tiles can be executed: calculates offsets, sets proper work amounts, and decrements pointers when the same data
* has to be read several times (broadcasting).
* @ingroup snippets
*/
class TileScheduler : public ngraph::op::Op {
public:
OPENVINO_OP("TileScheduler", "SnippetsOpset");

TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region);
TileScheduler() = default;
AllocatedEmitter vector_region;
AllocatedEmitter scalar_region;
// todo: this clone_with_new_inputs is irrelevant
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
return std::make_shared<TileScheduler>(vector_region, scalar_region);
}
const void *compile_params;
};

} // namespace op
} // namespace snippets
} // namespace ngraph
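As a mental model of what the doc comment above describes, the scheduler's core job is to split the innermost dimension between the vector and scalar Tiles. The scalar C++ sketch below only illustrates that split; it is not the jit code the real emitter produces, and schedule_tiles is a hypothetical helper name:

#include <cstddef>
#include <functional>

// Hypothetical scalar model of the vector/scalar work split arranged by the TileScheduler.
void schedule_tiles(size_t work_amount, size_t lanes,
                    const std::function<void(size_t)>& vector_body,
                    const std::function<void(size_t)>& scalar_body) {
    size_t i = 0;
    for (; i + lanes <= work_amount; i += lanes)
        vector_body(i);   // vector Tile: one iteration covers `lanes` elements
    for (; i < work_amount; ++i)
        scalar_body(i);   // scalar Tile: handles the remaining tail elements
}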
src/common/snippets/include/snippets/pass/assign_registers.hpp
@@ -18,7 +18,7 @@ namespace pass {
*/
class AssignRegisters : public ngraph::pass::FunctionPass {
public:
AssignRegisters() {
explicit AssignRegisters() {
set_property(ngraph::pass::PassProperty::REQUIRE_STATIC_SHAPE, true);
}
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
@@ -20,6 +20,7 @@
#include "op/powerstatic.hpp"
#include "op/store.hpp"
#include "op/tile.hpp"
#include "op/tile_scheduler.hpp"
#include "op/vectorload.hpp"
#include "op/vectorstore.hpp"

44 changes: 19 additions & 25 deletions src/common/snippets/src/generator.cpp
@@ -17,20 +17,20 @@ auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph:
auto rt = n->get_rt_info();

// ToDo: change to reg_t
std::vector<size_t> rout;
std::vector<size_t> rin, rout;

auto it_rt = rt.find("reginfo");
if (it_rt != rt.end()) {
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
rout.push_back(reg);
}
}

std::vector<size_t> rin;
for (auto input : n->inputs()) {
for (const auto& input : n->inputs()) {
auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info();
auto it_rt = rt.find("reginfo");
if (it_rt != rt.end()) {
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
for (auto& reg : it_rt->second.as<std::vector<size_t>>()) {
rin.push_back(reg);
}
}
@@ -48,13 +48,12 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
auto results = m->get_results();
auto in = params.size();
auto out = results.size();
auto nptrs = in + out;

OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile")
// vector tile
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> lowered;
std::vector<AllocatedEmitter> lowered;
for (auto n : m->get_ordered_ops()) {
lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
}
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile")

@@ -65,34 +64,29 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
mng.register_pass<ngraph::snippets::pass::ReplaceStoresWithScalarStores>();
mng.run_passes(m_scalar);
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get")
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> scalar_lowered;
std::vector<AllocatedEmitter> scalar_lowered;
for (auto n : m_scalar->get_ordered_ops()) {
scalar_lowered.push_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
}
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D")

// wrapping into tiles1D
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles1D;
auto tile = std::make_shared<ngraph::snippets::op::Tile>(lowered);
tile->compile_params = compile_params;
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>({target->get_lanes(), 0, nptrs, 1}), std::vector<size_t>{})));
tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered);
tile->compile_params = compile_params;
tiles1D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>{{1, target->get_lanes(), nptrs, 1}}, std::vector<size_t>{})));
const auto& vector_tile = std::make_shared<ngraph::snippets::op::Tile>(lowered);
const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile),
std::make_pair(std::vector<size_t>{target->get_lanes()}, std::vector<size_t>{}));
const auto& scalar_tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered);
const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile),
std::make_pair(std::vector<size_t>{1}, std::vector<size_t>{}));

OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D")
// wrapping into tiles2D
std::vector<std::pair<std::shared_ptr<Emitter>, RegInfo>> tiles2D;
tile = std::make_shared<ngraph::snippets::op::Tile>(tiles1D);
tile->compile_params = compile_params;
tiles2D.push_back(std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(tile),
std::make_pair(std::vector<size_t>({1, 0, nptrs, 0}), std::vector<size_t>{})));
auto tile_scheduler = std::make_shared<ngraph::snippets::op::TileScheduler>(vector_region, scalar_region);
tile_scheduler->compile_params = compile_params;
const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler),
std::make_pair(std::vector<size_t>({in, out, target->get_lanes()}), std::vector<size_t>{}));

OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")
// emission
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(tiles2D);
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(std::vector<AllocatedEmitter> {tile_scheduler_region});
tiles2DKernel->compile_params = compile_params;
std::shared_ptr<Emitter> kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel);
kernel->emit_code({in, out}, {});
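To recap the new wiring in Generator::generate: the Kernel now wraps a single TileScheduler region whose RegInfo carries {in, out, lanes}, while each Tile region only carries its own increment. Below is a standalone sketch with simplified stand-ins for Emitter and RegInfo (the real types hold jit emitters, not strings):

#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Emitter { std::string name; };   // stand-in for ngraph::snippets::Emitter
using RegInfo = std::pair<std::vector<size_t>, std::vector<size_t>>;
using AllocatedEmitter = std::pair<std::shared_ptr<Emitter>, RegInfo>;

int main() {
    const size_t in = 2, out = 1, lanes = 8;   // hypothetical subgraph signature

    // Per-Tile regions carry only the increment: `lanes` for the vector Tile, 1 for the scalar tail.
    AllocatedEmitter vector_region{std::make_shared<Emitter>(Emitter{"vector_tile"}), {{lanes}, {}}};
    AllocatedEmitter scalar_region{std::make_shared<Emitter>(Emitter{"scalar_tile"}), {{1}, {}}};

    // The TileScheduler region replaces the old tiles1D/tiles2D nesting; its RegInfo
    // holds {in, out, lanes} instead of the former {lanes, 0, nptrs, 1} payloads.
    AllocatedEmitter scheduler_region{std::make_shared<Emitter>(Emitter{"tile_scheduler"}),
                                      {{in, out, lanes}, {}}};

    const std::vector<AllocatedEmitter> kernel_body{scheduler_region};   // what Kernel now wraps
    return kernel_body.size() == 1 ? 0 : 1;
}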
2 changes: 1 addition & 1 deletion src/common/snippets/src/op/tile.cpp
@@ -8,5 +8,5 @@
using namespace std;
using namespace ngraph;

snippets::op::Tile::Tile(const std::vector<std::pair<std::shared_ptr<snippets::Emitter>, snippets::RegInfo>>& nested) : Op(), region(nested) {
snippets::op::Tile::Tile(const std::vector<AllocatedEmitter>& nested) : Op(), region(nested) {
}
10 changes: 10 additions & 0 deletions src/common/snippets/src/op/tile_scheduler.cpp
@@ -0,0 +1,10 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/op/tile_scheduler.hpp"
#include "snippets/generator.hpp"

ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region)
: Op(), vector_region{vector_region}, scalar_region{scalar_region} {
}
69 changes: 27 additions & 42 deletions src/common/snippets/src/pass/assign_registers.cpp
@@ -16,7 +16,6 @@
bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
RUN_ON_FUNCTION_SCOPE(AssignRegisters);
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters")
int reg64_tmp_start { 8 }; // R8, R9, R10, R11, R12, R13, R14, R15 inputs+outputs+1
using Reg = size_t;
auto ops = f->get_ordered_ops();
decltype(ops) stmts;
@@ -26,18 +25,18 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr

size_t rdx = 0;
std::map<std::shared_ptr<descriptor::Tensor>, Reg> regs;
for (auto op : stmts) {
for (auto output : op->outputs()) {
for (const auto& op : stmts) {
for (const auto& output : op->outputs()) {
regs[output.get_tensor_ptr()] = rdx++;
}
}

std::vector<std::set<Reg>> used;
std::vector<std::set<Reg>> def;

for (auto op : stmts) {
for (const auto& op : stmts) {
std::set<Reg> u;
for (auto input : op->inputs()) {
for (const auto& input : op->inputs()) {
if (regs.count(input.get_tensor_ptr())) {
u.insert(regs[input.get_tensor_ptr()]);
}
@@ -46,7 +45,7 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr

std::set<Reg> d;
if (!std::dynamic_pointer_cast<snippets::op::Store>(op)) {
for (auto output : op->outputs()) {
for (const auto& output : op->outputs()) {
d.insert(regs[output.get_tensor_ptr()]);
}
}
@@ -65,8 +64,8 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
for (size_t n = 0; n < stmts.size(); n++) {
auto node = stmts[n];
if (!std::dynamic_pointer_cast<snippets::op::Store>(node)) {
for (auto out : node->outputs()) {
for (auto port : out.get_target_inputs()) {
for (const auto& out : node->outputs()) {
for (const auto& port : out.get_target_inputs()) {
auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this());
if (pos != stmts.end()) {
auto k = pos-stmts.begin();
@@ -136,46 +135,32 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr

std::map<std::shared_ptr<descriptor::Tensor>, Reg> physical_regs;

for (auto reg : regs) {
for (const auto& reg : regs) {
physical_regs[reg.first] = register_map[reg.second];
}

size_t constantID = 0;

for (auto n : f->get_ordered_ops()) {
const auto num_parameters = f->get_parameters().size();
for (const auto& n : f->get_ordered_ops()) {
auto& rt = n->get_rt_info();
// nothing to do for model signature
if (std::dynamic_pointer_cast<opset1::Parameter>(n) || std::dynamic_pointer_cast<opset1::Result>(n)) {
continue;
}

// store only effective address
if (auto result = std::dynamic_pointer_cast<snippets::op::Store>(n)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_result_index(result) + f->get_parameters().size());
rt["effectiveAddress"] = ea;
std::vector<size_t> regs;
regs.reserve(n->outputs().size());
/* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are
* then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details.
* Note also that Parameter and Result store general-purpose register index, because they work with memory
* (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are
* performed on registers.
*/
if (is_type<ov::op::v0::Result>(n)) {
continue;
}
// store effective address and procced with vector registers
if (ov::as_type_ptr<ngraph::snippets::op::Load>(n) || ov::as_type_ptr<ngraph::snippets::op::BroadcastLoad>(n)) {
auto source = n->get_input_source_output(0).get_node_shared_ptr();

if (auto param = ov::as_type_ptr<opset1::Parameter>(source)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameter_index(param));
rt["effectiveAddress"] = ea;
} else if (auto constant = ov::as_type_ptr<opset1::Constant>(source)) {
auto ea = reg64_tmp_start+static_cast<int64_t>(f->get_parameters().size() + f->get_results().size() + 1 + constantID);
rt["effectiveAddress"] = ea;
constantID++;
} else {
throw ngraph_error("load/broadcast should follow only Parameter or non-Scalar constant");
} else if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(n)) {
regs.push_back(f->get_parameter_index(param));
} else if (const auto& store = ov::as_type_ptr<ngraph::snippets::op::Store>(n)) {
regs.push_back(f->get_result_index(store) + num_parameters);
} else {
for (const auto& output : n->outputs()) {
auto allocated = physical_regs[output.get_tensor_ptr()];
regs.push_back(allocated);
}
}

std::vector<size_t> regs; regs.reserve(n->outputs().size());
for (auto output : n->outputs()) {
auto allocated = physical_regs[output.get_tensor_ptr()];
regs.push_back(allocated);
}
rt["reginfo"] = regs;
}

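The reginfo convention documented in the comment above can be modeled in isolation. The toy types below are illustrative, not the ngraph classes: each op keeps only its output register indices in rt["reginfo"] (gpr indices for Parameter/Store, vector-register indices otherwise), and input registers are recovered from the producers, which is what getRegisters in generator.cpp does:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

struct ToyNode {
    std::string type;                    // "Parameter", "Store" or a regular computational op
    std::vector<size_t> reginfo;         // output regs only: gpr index for memory ops, vector reg otherwise
    std::vector<const ToyNode*> inputs;  // producers of this node's inputs
};

using RegInfo = std::pair<std::vector<size_t>, std::vector<size_t>>;

// Mirrors the getRegisters idea: input regs come from the producers' reginfo,
// output regs from the node's own reginfo.
RegInfo get_registers(const ToyNode& n) {
    std::vector<size_t> rin;
    for (const auto* producer : n.inputs)
        rin.insert(rin.end(), producer->reginfo.begin(), producer->reginfo.end());
    return {rin, n.reginfo};
}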
2 changes: 1 addition & 1 deletion src/common/snippets/src/pass/collapse_subgraph.cpp
@@ -477,7 +477,7 @@ TokenizeSnippets::TokenizeSnippets() {
throw ngraph_error("body results and node results size mismatch during subgraph collaps");
}
// todo: move this plugin-specific constraint to the plugin callback
if (body_parameters.size() + body_results.size() > 7) {
if (body_parameters.size() + body_results.size() > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
1 change: 1 addition & 0 deletions src/common/snippets/tests/src/lowering_utils.cpp
@@ -35,6 +35,7 @@ DummyTargetMachine::DummyTargetMachine() {
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor;
}

std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const std::shared_ptr<Model>& f) {