Skip to content

Commit

Permalink
[Snippets] Brgemm blocking by KN dims at the LIR level (#19335)
Browse files Browse the repository at this point in the history
  • Loading branch information
v-Golubev authored Dec 8, 2023
1 parent 488abd0 commit 261cf4e
Show file tree
Hide file tree
Showing 62 changed files with 1,285 additions and 838 deletions.
2 changes: 1 addition & 1 deletion src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class LinearIR {
LinearIR() = default;
LinearIR(const std::shared_ptr<ov::Model>& m, const std::shared_ptr<IShapeInferSnippetsFactory>& factory, Config config = {});

ExpressionPtr create_expression(const std::shared_ptr<Node>& n, const std::vector<PortConnectorPtr>& inputs);
ExpressionPtr create_expression(const std::shared_ptr<Node>& n, const std::vector<PortConnectorPtr>& inputs) const;

std::shared_ptr<LinearIR> clone() const;
static LinearIR::container deep_copy_range(LinearIR::container::const_iterator begin,
Expand Down
100 changes: 80 additions & 20 deletions src/common/snippets/include/snippets/lowered/loop_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ class LinearIR::LoopManager {

struct LoopPort {
LoopPort() = default;
LoopPort(const ExpressionPort& port, bool is_scheduled = true)
: expr_port(std::make_shared<ExpressionPort>(port)), is_incremented(is_scheduled) {}

LoopPort(const ExpressionPort& port, bool is_incremented = true, size_t dim_idx = 0);
std::shared_ptr<LoopPort> clone_with_new_expr(const ExpressionPtr& new_expr) const;

friend bool operator==(const LoopPort& lhs, const LoopPort& rhs);
Expand All @@ -37,33 +35,68 @@ class LinearIR::LoopManager {
int64_t ptr_increment = 0;
int64_t finalization_offset = 0;
int64_t data_size = 0;
size_t dim_idx = 0; // The numeration starts from the end (dim_idx = 0 -> is the most inner dimension)
};

class LoopInfo {
public:
enum {UNDEFINED_DIM_IDX = std::numeric_limits<size_t>::max()};
LoopInfo() = default;
LoopInfo(size_t work_amount, size_t increment, size_t dim_idx,
LoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries,
const std::vector<LoopPort>& exits)
: work_amount(work_amount), increment(increment), dim_idx(dim_idx),
entry_points(entries), exit_points(exits), outer_splited_loop(false) {}
LoopInfo(size_t work_amount, size_t increment, size_t dim_idx,
const std::vector<LoopPort>& exits,
bool outer_splited_loop = false)
: m_work_amount(work_amount), m_increment(increment),
m_entry_points(entries), m_exit_points(exits), m_outer_splited_loop(outer_splited_loop) {}
LoopInfo(size_t work_amount, size_t increment,
const std::vector<ExpressionPort>& entries,
const std::vector<ExpressionPort>& exits);
const std::vector<ExpressionPort>& exits,
bool outer_splited_loop = false);

std::shared_ptr<LoopInfo> clone_with_new_expr(const ExressionMap& expr_map) const;

size_t work_amount = 0;
size_t increment = 0;
size_t dim_idx = 0; // The numeration begins from the end (dim_idx = 0 -> is the most inner dimension)
// Returns dimension index if dimension indices for all entry and exit points are equal, and UNDEFINED_DIM_IDX otherwise
size_t get_dim_idx() const;
size_t get_work_amount() const;
size_t get_increment() const;
const std::vector<LoopPort>& get_entry_points() const;
const std::vector<LoopPort>& get_exit_points() const;
bool get_outer_splited_loop() const;

/**
* \brief Inserts a separate body for first loop iteration processing if needed.
* Can also modify both main and first iter loop bodies.
* TODO: replace this temporary solution when ticket 119851 is implemented
*
* \param linear_ir LIR which should be modified
* \param loop_end_it iterator on LoopEnd expression for which the handler is called
*
* \return bool value which indicates whether the linear_ir was changed or not.
*/
using FirstIterHandler = std::function<bool(LinearIR&, LinearIR::constExprIt)>;
const FirstIterHandler& get_first_iter_handler() const;

// Sets dim_idx to all entry and exit points
void set_dim_idx(size_t dim_idx);
void set_work_amount(size_t work_amount);
void set_increment(size_t increment);
void set_entry_points(std::vector<LoopPort> entry_points);
void set_exit_points(std::vector<LoopPort> exit_points);
void set_outer_splited_loop(bool outer_splited_loop);
void set_first_iter_handler(FirstIterHandler handler);

private:
size_t m_work_amount = 0;
size_t m_increment = 0;
// The order of entry and exit expressions is important:
// - The position before first entry expr is Loop Begin position
// - The position after last exit expr is Loop End position
// Note: Scalars aren't entry expressions but can be before first entry expr in Linear IR
std::vector<LoopPort> entry_points = {};
std::vector<LoopPort> exit_points = {};
std::vector<LoopPort> m_entry_points = {};
std::vector<LoopPort> m_exit_points = {};
// True if this Loop is outer Loop for nested Loops that splits the same dimension
bool outer_splited_loop = false;
bool m_outer_splited_loop = false;
FirstIterHandler m_first_iter_handler = nullptr;
};
using LoopInfoPtr = std::shared_ptr<LoopInfo>;

Expand All @@ -83,18 +116,45 @@ class LinearIR::LoopManager {
// Return Loop ID
template <typename T>
size_t mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t work_amount, size_t work_amount_increment, size_t dim_idx,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, work_amount_increment, dim_idx, entries, exits);
LinearIR::constExprIt loop_end_pos,
size_t work_amount,
size_t work_amount_increment,
size_t dim_idx,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, work_amount_increment, entries, exits);
loop_info->set_dim_idx(dim_idx);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
return loop_id;
}

template <typename T>
size_t mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t work_amount,
size_t increment,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, increment, entries, exits);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
return loop_id;
}

size_t replace_with_new_loop(const LinearIR& linear_ir,
LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t work_amount,
size_t increment,
const std::vector<LoopPort>& entries,
const std::vector<LoopPort>& exits,
const size_t old_id);

void fuse_loops(const LinearIR& linear_ir, size_t loop_id_upper, size_t loop_id_lower, bool fuse_into_upper = true);
void fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR::constExprIt loop_end_target,
size_t loop_id_upper, size_t loop_id_lower, bool fuse_into_upper = true);
Expand Down
11 changes: 3 additions & 8 deletions src/common/snippets/include/snippets/lowered/pass/init_loops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,9 @@ class InitLoops : public Pass {
bool run(LinearIR& linear_ir) override;

private:
static void init_ptr_increments(std::vector<LinearIR::LoopManager::LoopPort>& loop_inputs,
std::vector<LinearIR::LoopManager::LoopPort>& loop_outputs,
size_t work_amount, size_t dim_idx);
static void init_finalization_offsets(std::vector<LinearIR::LoopManager::LoopPort>& loop_inputs,
std::vector<LinearIR::LoopManager::LoopPort>& loop_outputs,
size_t work_amount);
static void init_element_type_sizes(std::vector<LinearIR::LoopManager::LoopPort>& loop_inputs,
std::vector<LinearIR::LoopManager::LoopPort>& loop_outputs);
static void init_ptr_increments(const LinearIR::LoopManager::LoopInfoPtr& loop_info);
static void init_finalization_offsets(const LinearIR::LoopManager::LoopInfoPtr& loop_info);
static void init_element_type_sizes(const LinearIR::LoopManager::LoopInfoPtr& loop_info);
};

} // namespace pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "pass.hpp"

#include "snippets/op/loop.hpp"
#include "snippets/lowered/loop_manager.hpp"

namespace ov {
namespace snippets {
Expand All @@ -23,21 +24,26 @@ class InsertTailLoop : public Pass {
public:
OPENVINO_RTTI("InsertTailLoop", "Pass")
bool run(LinearIR& linear_ir) override;
static LinearIR::container copy_loop(const LinearIR& linear_ir, const size_t loop_id);

static constexpr size_t existing_subtensor_value = SIZE_MAX;
static void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
const LinearIR::LoopManager::LoopInfoPtr& loop_info,
LinearIR::container::const_iterator begin,
LinearIR::container::const_iterator end,
const size_t new_dim_value = existing_subtensor_value);

private:
static std::shared_ptr<op::LoopEnd> create_tail_loop(LinearIR& linear_ir,
LinearIR::constExprIt vector_begin,
LinearIR::constExprIt vector_end,
LinearIR::constExprIt& tail_begin,
LinearIR::constExprIt& tail_end,
const std::shared_ptr<op::LoopEnd>& vector_loop_end,
bool need_vector_loop,
size_t tail_size, const std::vector<int64_t>& tail_finalization_offsets);
static void create_tail_loop(LinearIR& linear_ir,
LinearIR::constExprIt begin,
LinearIR::constExprIt end,
const std::shared_ptr<op::LoopEnd>& loop_end,
bool need_vector_loop,
size_t tail_size);
static void tail_transformations(LinearIR& linear_ir,
LinearIR::constExprIt tail_begin,
LinearIR::constExprIt tail_end,
size_t tail_size);
static bool optimize_single_evaluation(const std::shared_ptr<op::LoopEnd>& loop);
};

} // namespace pass
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface OptimizeLoopSingleEvaluation
 * @brief Does the following optimizations if the Loop body can be executed only once:
 *        - sets evaluate_once parameter to true
 *        - moves all ptr arithmetic to finalization offsets
 * @ingroup snippets
 */
class OptimizeLoopSingleEvaluation : public Pass {
public:
    OPENVINO_RTTI("OptimizeLoopSingleEvaluation", "Pass")
    /**
     * @brief Applies the single-evaluation optimization to every eligible Loop in the LIR.
     * @param linear_ir LIR to be transformed in place
     * @return true if the linear_ir was changed, false otherwise
     */
    bool run(LinearIR& linear_ir) override;
};

}  // namespace pass
}  // namespace lowered
}  // namespace snippets
}  // namespace ov
7 changes: 4 additions & 3 deletions src/common/snippets/include/snippets/op/broadcastload.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ class BroadcastLoad : public MemoryAccess {
public:
OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ov::snippets::op::MemoryAccess);

BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
BroadcastLoad(const Output<Node>& x, ov::Dimension bcast_dimension, size_t offset = 0lu);
BroadcastLoad() = default;

size_t get_offset() const { return get_input_offset(0); }

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;
ov::PartialShape get_output_shape() {return output_shape;}
const ov::Dimension& get_bcast_dimension() {return bcast_dimension;}
void set_bcast_dimension(ov::Dimension new_dim) {bcast_dimension = std::move(new_dim);}

// Note:BroadcastMove and BroadcastLoad are implemented as separate classes,
// but have identical shapeInfer semantics. In order to avoid code duplication,
Expand All @@ -39,7 +40,7 @@ class BroadcastLoad : public MemoryAccess {
explicit ShapeInfer(const std::shared_ptr<Node>& n) : BroadcastShapeInfer<BroadcastLoad>(n) {}
};
private:
ov::PartialShape output_shape;
ov::Dimension bcast_dimension;
};

} // namespace op
Expand Down
7 changes: 4 additions & 3 deletions src/common/snippets/include/snippets/op/broadcastmove.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,16 @@ class BroadcastMove : public ov::op::Op {
public:
OPENVINO_OP("BroadcastMove", "SnippetsOpset");

BroadcastMove(const Output<Node>& x, ov::PartialShape output_shape);
BroadcastMove(const Output<Node>& x, ov::Dimension bcast_dimension);
BroadcastMove() = default;

bool visit_attributes(AttributeVisitor& visitor) override;

std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

void validate_and_infer_types() override;
ov::PartialShape get_output_shape() {return output_shape;}
const ov::Dimension& get_bcast_dimension() {return bcast_dimension;}
void set_bcast_dimension(ov::Dimension new_dim) {bcast_dimension = std::move(new_dim);}
// Note:BroadcastMove and BroadcastLoad are implemented as separate classes,
// but have identical shapeInfer semantics. In order to avoid code duplication,
// we created dummy ShapeInfer classes that are essentially instantiations
Expand All @@ -38,7 +39,7 @@ class BroadcastMove : public ov::op::Op {
};

protected:
ov::PartialShape output_shape;
ov::Dimension bcast_dimension;
};

} // namespace op
Expand Down
Loading

0 comments on commit 261cf4e

Please sign in to comment.