Skip to content

Commit

Permalink
Sns explicit tiles leftovers (openvinotoolkit#60)
Browse files Browse the repository at this point in the history
  • Loading branch information
IvanNovoselov authored Nov 30, 2022
1 parent 0075a5f commit a748dac
Show file tree
Hide file tree
Showing 13 changed files with 94 additions and 66 deletions.
43 changes: 28 additions & 15 deletions src/common/snippets/include/snippets/op/loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,69 @@ namespace op {

/**
* @interface LoopBase
* @brief Inserted during scheduling generation and represents Loop in affine notation
* @brief Base class for LoopBegin and LoopEnd
* @ingroup snippets
*/
class LoopBase : public ngraph::op::Op {
public:
OPENVINO_OP("LoopBase", "SnippetsOpset");
LoopBase(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment);
LoopBase(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment);
LoopBase() = delete;
bool visit_attributes(AttributeVisitor& visitor) override;
size_t get_work_amount() const;
size_t get_increment() const;
size_t get_dimension() const;
bool get_evaluate_once() const;

protected:
size_t dimension;
size_t work_amount;
size_t increment;
bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter
};
class LoopEnd;
/**
* @interface LoopBegin
* @brief Marks the start of the Loop region.
* Number of outputs always equals the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd)
* @param args - vector of input values, they are passed directly to output.
* @ingroup snippets
*/
class LoopBegin : public LoopBase {
friend LoopEnd;
public:
OPENVINO_OP("LoopBegin", "SnippetsOpset");
/// \brief Construct an Loop
/// \param region The vector of pairs: emitters and the corresponding registers
/// \param increment Loop size - count of elements to load and store.
/// Vector Loop should have size of vector register and Scalar Loop should have 1
/// \param num_inputs Count of inputs
/// \param num_outputs Count of outputs
/// \param io_dims Vector of last dimensions of inputs and outputs
/// \param io_data_sizes Vector of data type sizes of inputs and outputs
explicit LoopBegin(const std::vector<Output<Node>>& args);
explicit LoopBegin(const OutputVector& args);
LoopBegin() = delete;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
std::shared_ptr<LoopEnd> get_loop_end();
// begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters
const uint8_t* begin_address;
std::vector<size_t> input_regs;

private:
void validate_and_infer_types_except_LoopEnd();
LoopBegin(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment);
LoopBegin(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment);
};

/**
* @interface LoopEnd
* @brief Marks the end of the Loop region and defines the loop properties.
* Number of outputs always equals the number of inputs (bypassed values) - 1 (edge from the corresponding LoopBegin)
* @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output.
* @param work_amount total number of evaluations to be processed by the loop
* @param increment number of evaluations processed in one iteration of the loop.
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
* should be used when Loop is connected to Parameters and/or Results. If apply_increment[i] == true then i-th i/o data
* pointer will be incremented by work_amount*data_size on every iteration.
* @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to
* apply_increments, which enables more flexibility.
* @param finalization_offsets pointer increments that are applied to i/o pointers before exiting the loop
* @ingroup snippets
*/
class LoopEnd : public LoopBase {
public:
OPENVINO_OP("LoopEnd", "SnippetsOpset");
LoopEnd(const std::vector<Output<Node>>& args, size_t dimension, size_t work_amount, size_t increment,
LoopEnd(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment,
std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets);
LoopEnd() = delete;
std::shared_ptr<LoopBegin> get_loop_begin();
Expand Down
7 changes: 4 additions & 3 deletions src/common/snippets/include/snippets/op/memory_access.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ namespace op {

/**
* @interface MemoryAccess
* @brief This is an ubre
* where number of elements to store is determined by "count"
* Default value is "1" - to store one element
* @brief This is a base class for memory access operations (like Load and Store).
* It provides universal set/get interface to manipulate the number
* of elements accessed during one operation call ("count").
* Default "count" value is "1" - it means to load/store one element
* @ingroup snippets
*/

Expand Down
1 change: 0 additions & 1 deletion src/common/snippets/include/snippets/op/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ class Subgraph : public ngraph::op::Op {
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
snippets::Schedule generate(const void* compile_params = nullptr);
ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
ov::PartialShape get_master_shape();
std::vector<PartialShape> reshape_body(const std::vector<PartialShape>& input_shapes);
std::vector<Shape> reshape_body(const std::vector<Shape>& input_shapes);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,27 @@

#include "ngraph/op/op.hpp"
#include "ngraph/op/parameter.hpp"
#include "loop.hpp"
#include "snippets/op/loop.hpp"

namespace ngraph {
namespace snippets {
namespace op {

/* ==== LoopBegin === */
/**
* @interface insertLoopBeginAfterOutputs
* @brief Inserts LoopBegin operation after the group of operations described
* by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface.
* @ingroup snippets
*/
std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& originalOutputs);

/**
* @interface insertLoopBegin
* @brief Inserts LoopBegin operation after the group of operations described
* by the input argument (ParameterVector, NodeVector or OutputVector).
* @ingroup snippets
*/
template<typename T>
std::shared_ptr<LoopBegin> insertLoopBegin(const T& afterTheseNodes) {
static_assert(std::is_same<T, ParameterVector>() || std::is_same<T, NodeVector>(),
Expand All @@ -37,12 +49,32 @@ inline std::shared_ptr<LoopBegin> insertLoopBegin(const OutputVector& afterThese
/* ============== */

/* ==== LoopEnd === */
/**
* @interface insertLoopEndBeforeInputs
* @brief Inserts LoopEnd operation before the group of operations described
* by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface.
* @param originalInputs LoopEnd will be inserted before these inputs
* @param loopBegin pointer to the beginning of the Loop region
* @param work_amount total number of evaluations to be processed by the loop
* @param increment number of evaluations processed in one iteration of the loop
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
* should be used when Loop is connected to Parameters and/or Results
* @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop
* @ingroup snippets
*/

std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
const std::shared_ptr<LoopBegin>& tileBegin,
size_t dimension, size_t work_amount, size_t increment,
const std::shared_ptr<LoopBegin>& loopBegin,
size_t work_amount, size_t increment,
std::vector<bool> apply_increment = {},
std::vector<int64_t> finalization_offsets = {});

/**
* @interface insertLoopEnd
* @brief Inserts LoopEnd operation before the group of operations described
* by the input argument (ResultVector, NodeVector or OutputVector).
* @ingroup snippets
*/
template<typename T, typename ...Args>
std::shared_ptr<LoopEnd> insertLoopEnd(const T& beforeTheseNodes, Args ...args) {
static_assert(std::is_same<T, ResultVector>() || std::is_same<T, NodeVector>(),
Expand Down
4 changes: 3 additions & 1 deletion src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov:
const auto& ops = m->get_ordered_ops();
for (auto op = ops.begin(); op < ops.end(); op++) {
const auto& loop_begin = ov::as_type_ptr<ngraph::snippets::op::LoopBegin>(*op);
// ignore outer loops and possible manual tail loops

// ignore outer loops and possible manual scalar loops
if (loop_begin && loop_begin->get_increment() != 1) {
OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop")
NodeVector vector_loop, tail_loop;
std::shared_ptr<op::LoopEnd> vector_loop_end, tail_loop_end;
vector_loop_end = loop_begin->get_loop_end();
Expand Down
24 changes: 9 additions & 15 deletions src/common/snippets/src/op/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,11 @@ namespace ngraph {
namespace snippets {
namespace op {

LoopBase::LoopBase(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment)
: Op(args), dimension(dimension), work_amount(work_amount), increment(increment), evaluate_once(false) {
LoopBase::LoopBase(const std::vector<Output<Node>> &args, size_t work_amount, size_t wa_increment)
: Op(args), work_amount(work_amount), increment(wa_increment), evaluate_once(false) {
}

bool LoopBase::visit_attributes(AttributeVisitor &visitor) {
visitor.on_attribute("dimension", dimension);
visitor.on_attribute("work_amount", work_amount);
visitor.on_attribute("increment", increment);
return true;
Expand All @@ -33,25 +32,21 @@ size_t LoopBase::get_increment() const {
return increment;
}

size_t LoopBase::get_dimension() const {
return dimension;
}

LoopBegin::LoopBegin(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment)
: LoopBase(args, dimension, work_amount, increment),
LoopBegin::LoopBegin(const std::vector<Output<Node>> &args, size_t work_amount, size_t increment)
: LoopBase(args, work_amount, increment),
begin_address(nullptr), input_regs({}) {
// We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached
// to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it)
validate_and_infer_types_except_LoopEnd();
}

LoopBegin::LoopBegin(const std::vector<Output<Node>> &args)
: LoopBase(args, 0, 0, 0), begin_address(nullptr), input_regs({}) {
: LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) {
validate_and_infer_types_except_LoopEnd();
}

std::shared_ptr<Node> LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::shared_ptr<LoopBegin>(new LoopBegin(inputs, dimension, work_amount, increment));
return std::shared_ptr<LoopBegin>(new LoopBegin(inputs, work_amount, increment));
}


Expand All @@ -70,7 +65,6 @@ void LoopBegin::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output");
const auto& loop_end = ov::as_type_ptr<LoopEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output");
dimension = loop_end->get_dimension();
work_amount = loop_end->get_work_amount();
increment = loop_end->get_increment();
}
Expand All @@ -85,15 +79,15 @@ std::shared_ptr<LoopEnd> LoopBegin::get_loop_end() {
return loop_end;
}

LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t dimension, size_t work_amount, size_t increment,
LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t work_amount, size_t increment,
std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets)
: LoopBase(args, dimension, work_amount, increment), apply_increment(std::move(apply_increment)),
: LoopBase(args, work_amount, increment), apply_increment(std::move(apply_increment)),
finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true) {
constructor_validate_and_infer_types();
}

std::shared_ptr<Node> LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<LoopEnd>(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets);
return std::make_shared<LoopEnd>(inputs, work_amount, increment, apply_increment, finalization_offsets);
}

std::shared_ptr<LoopBegin> LoopEnd::get_loop_begin() {
Expand Down
12 changes: 1 addition & 11 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,16 +296,6 @@ ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector&
return master_shape;
}

PartialShape snippets::op::Subgraph::get_master_shape() {
auto results = m_body->get_results();
PartialShape outPShape = results[0]->get_input_partial_shape(0);
for (const auto& r : results)
PartialShape::broadcast_merge_into(outPShape, r->get_input_shape(0),
::ngraph::op::AutoBroadcastType::NUMPY);
master_shape = outPShape;
return master_shape;
}

void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// We should insert Convert before Results to set original output element type if needed
Expand Down Expand Up @@ -367,7 +357,7 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() {
manager.register_pass<snippets::pass::InsertStore>(count);
// todo: presently dynamic pipeline is activated even if the last two dimension are static
// In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example)
// should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional runtime args are required
// should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required
// Presently Broadcasting is organized in the following way:
// * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims)
if (!inputs_has_dynamic_last_dims) {
Expand Down
8 changes: 4 additions & 4 deletions src/common/snippets/src/pass/insert_loops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#include <snippets/itt.hpp>
#include "snippets/pass/insert_loops.hpp"
#include "snippets/op/loop_helpers.hpp"
#include "snippets/pass/loop_helpers.hpp"

#include <ngraph/rt_info.hpp>

Expand Down Expand Up @@ -52,8 +52,8 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov:
});
}
const auto& inner_loop_begin = op::insertLoopBegin(commonParams);
const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount,
vector_size, apply_increments, inner_finalization_offsets);
const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount,
vector_size, apply_increments, inner_finalization_offsets);
// set internal flag to enable scalar vs vector loop optimizations
inner_loop_end->has_outer_loop = outer_work_amount > 1;
// Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
Expand All @@ -78,7 +78,7 @@ bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr<ov:
return ps[outer_dim] != 1 && ps[inner_dim] == 1;
});
const auto& outer_loop_begin = op::insertLoopBegin(commonParams);
insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_work_amount, 1, apply_increments);
insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1, apply_increments);
}

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//

#include "ngraph/op/op.hpp"
#include "snippets/op/loop_helpers.hpp"
#include "snippets/pass/loop_helpers.hpp"

namespace ngraph {
namespace snippets {
Expand All @@ -26,15 +26,15 @@ std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& origi

std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
const std::shared_ptr<LoopBegin>& loopBegin,
size_t dimension, size_t work_amount, size_t increment,
size_t work_amount, size_t increment,
std::vector<bool> apply_increment,
std::vector<int64_t> finalization_offsets) {
OutputVector originalParentOutputs;
for (const auto& in : originalInputs) {
originalParentOutputs.push_back(in.get_source_output());
}
originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1));
auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, dimension, work_amount, increment,
auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, work_amount, increment,
std::move(apply_increment), std::move(finalization_offsets));

for (int i = 0; i < originalInputs.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::imp
for (int i = 0; i < num_inputs; i++)
io_data_size.push_back(loop_begin->get_input_element_type(i).size());
for (int i = 0; i < num_outputs; i++)
io_data_size.push_back(loop_end->get_input_element_type(i).size());
io_data_size.push_back(loop_end->get_output_element_type(i).size());
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class jit_container_emitter: public jit_emitter {
};
///
/// \brief Kernel is the only entry point to Codegen Jit compilation. Kernel performs abstract-to-physical register
/// mapping and creates pools of available gpr and vec registers. Kernel usually to contains (at least one)
/// mapping and creates pools of available gpr and vec registers. Kernel usually contains (at least one)
/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way:
/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */
/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */
Expand Down
3 changes: 0 additions & 3 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,6 @@ std::vector<VectorDims> Snippet::shapeInfer() const {
}

void Snippet::prepareParams() {
// here must be all the stuff that could only be done for static shapes, e.g. offset calculation
// Here it must be all the stuff that could be done once for both static and dynamic shapes

masterShape = getNormalizedDimsBySize(masterShape, tensorRank);
for (auto& pshape : normInputShapes)
pshape = getNormalizedDimsBySize(pshape, tensorRank);
Expand Down
Loading

0 comments on commit a748dac

Please sign in to comment.