[Snippets] Added single evaluation of Brgemm in Tail Loop by dynamic M #25378

Merged
16 changes: 15 additions & 1 deletion src/common/snippets/include/snippets/lowered/loop_info.hpp
@@ -430,7 +430,8 @@ class ExpandedLoopInfo : public LoopInfo {
ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false);
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false,
bool evaluate_once = false);
/**
* @brief Clone LoopInfo with new expressions
* @param expr_map map of new and old expressions
@@ -474,7 +475,18 @@ class ExpandedLoopInfo : public LoopInfo {
* @return const ref of `m_data_sizes`
*/
const std::vector<int64_t>& get_data_sizes() const;
/**
* @brief Returns true if the current Loop should be executed only once
* Otherwise, returns false
* @return `m_evaluate_once`
*/
bool is_evaluate_once() const;

/**
* @brief Set value to `m_evaluate_once`
* @param value - new value of `m_evaluate_once`
*/
void set_evaluate_once(bool value);
/**
* @brief Update `m_ptr_increments` using copy values from `new_values`.
* The count of new values must be equal to the count of current increments.
@@ -517,6 +529,8 @@ class ExpandedLoopInfo : public LoopInfo {

const SpecificLoopIterType m_type = {};
std::shared_ptr<UnifiedLoopInfo> m_unified_loop_info = {};

bool m_evaluate_once = false;
};
using ExpandedLoopInfoPtr = std::shared_ptr<ExpandedLoopInfo>;
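
A minimal usage sketch of the new flag (hypothetical pass code: the loop_manager/loop_id context and the LoopInfo getters are assumed from the APIs visible elsewhere in this diff):

    // Mark a decomposed loop whose static work amount fits into a single increment,
    // then query the flag later, e.g. when emitting the loop.
    const auto& loop_info = loop_manager->get_loop_info<ExpandedLoopInfo>(loop_id);
    if (loop_info->get_work_amount() == loop_info->get_increment())
        loop_info->set_evaluate_once(true);
    if (loop_info->is_evaluate_once()) {
        // the loop body is emitted for exactly one evaluation: no counter increments needed
    }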

src/common/snippets/include/snippets/lowered/port_descriptor.hpp
@@ -20,12 +20,6 @@ using PortDescriptorPtr = std::shared_ptr<PortDescriptor>;
class PortDescriptor {
friend class LinearIRBuilder;
public:
// The structure with service values for scheduling parameters
struct ServiceDimensions {
// The value for the subtensor that means that scheduling should be by full dimension
static size_t FULL_DIM;
};

explicit PortDescriptor(const ov::Input<ov::Node>& node,
VectorDims subtensor_shape = {},
std::vector<size_t> layout = {});
@@ -54,6 +48,9 @@ class PortDescriptor {
void set_reg_type(RegType type) { m_reg.type = type; }
void set_reg_idx(size_t idx) { m_reg.idx = idx; }

// Indexing starts from the end (rbegin() + idx)
void set_subtensor_dim(size_t idx, VectorDims::value_type value);

std::string serialize() const;
bool empty() const { return m_layout.empty() && m_subtensor_shape.empty();}
PortDescriptorPtr clone() const;
@@ -87,6 +84,8 @@ class PortDescriptorUtils {
public:
static void set_port_descriptor_ptr(const ov::Input<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor_ptr(const ov::Output<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor(const ov::Input<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});
static void set_port_descriptor(const ov::Output<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});

static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<ov::Node>& in);
static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<const ov::Node>& out);
@@ -116,17 +115,6 @@ class PortDescriptorVectorAttribute : public ov::RuntimeAttribute {
std::vector<PortDescriptorPtr> outputs{};
};

template<typename T>
void set_port_desc(const T& port, std::vector<size_t> subtensor) {
const auto& shape = port.get_shape();
for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) {
auto& dim = subtensor[subtensor.size() - i];
if (dim != PortDescriptor::ServiceDimensions::FULL_DIM)
dim = std::min(dim, shape[shape.size() - i]);
}
PortDescriptorUtils::set_port_descriptor_ptr(port, std::make_shared<PortDescriptor>(shape, subtensor));
}

} // namespace lowered
} // namespace snippets
} // namespace ov
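
A short illustration of the reverse-indexing convention of the new set_subtensor_dim helper (hypothetical values; the (shape, subtensor) constructor is assumed from the removed set_port_desc helper above):

    const VectorDims shape{1, 3, 16, 32};
    PortDescriptor desc(shape, /*subtensor_shape*/ {8, 16});
    desc.set_subtensor_dim(0, 32);  // idx 0 = innermost dim: subtensor becomes {8, 32}
    desc.set_subtensor_dim(1, 1);   // idx 1 = next one out:  subtensor becomes {1, 32}
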
30 changes: 20 additions & 10 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -21,6 +21,26 @@ namespace ov {
namespace snippets {
namespace utils {

/* --- Special values --- */
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

// This value denotes a full dimension
// For example, for a subtensor it means that scheduling should cover the full dimension
constexpr inline size_t get_full_dim_value() {
return get_dynamic_value<size_t>() - 1;
}
constexpr inline bool is_full_dim_value(size_t value) {
return value == get_full_dim_value();
}
/* ---------------------- */
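
Both sentinels sit at the very top of the size_t range and never collide; a small assert-based illustration (requires <cassert>):

    // SIZE_MAX encodes "dynamic" (unknown until runtime),
    // SIZE_MAX - 1 encodes "schedule over the full dimension".
    const size_t dyn = utils::get_dynamic_value<size_t>();
    const size_t full = utils::get_full_dim_value();
    assert(utils::is_dynamic_value(dyn) && !utils::is_full_dim_value(dyn));
    assert(utils::is_full_dim_value(full) && !utils::is_dynamic_value(full));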

// Get non-scalar Constant count that will be created after FakeQuantize decomposition.
// This count is needed to know exact count of non-scalar Constants during tokenization.
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<ov::op::v0::FakeQuantize>& fq) -> size_t;
@@ -59,16 +79,6 @@ inline T div_up(const T a, const U b) {
return static_cast<T>((a + b - 1) / b);
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

inline bool is_dynamic_vdims(const VectorDims& shape) {
return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return is_dynamic_value(v); });
}
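
The dynamic_safe_mul/dynamic_safe_add helpers used by the passes below are assumed to propagate the dynamic sentinel instead of overflowing; a behavioural sketch only, not the actual implementation:

    // If either operand is dynamic, the result stays dynamic.
    template <typename T>
    T dynamic_safe_mul_sketch(const T lhs, const T rhs) {
        return (is_dynamic_value(lhs) || is_dynamic_value(rhs)) ? get_dynamic_value<T>()
                                                                : lhs * rhs;
    }
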
15 changes: 12 additions & 3 deletions src/common/snippets/src/lowered/loop_info.cpp
@@ -373,10 +373,10 @@ void UnifiedLoopInfo::add_loop_ports(const std::vector<ExpressionPort>& ports) {
ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const)
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const, bool evaluate_once)
: LoopInfo(work_amount, increment, entries, exits, is_wa_const),
m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)),
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) {
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)), m_evaluate_once(evaluate_once) {
validate();
}

@@ -392,7 +392,8 @@ std::shared_ptr<LoopInfo> ExpandedLoopInfo::clone_with_new_expr(const Expression
const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports);

return std::make_shared<ExpandedLoopInfo>(m_work_amount, m_increment, new_input_ports, new_output_ports,
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const);
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type,
m_unified_loop_info, m_is_work_amount_const, m_evaluate_once);
}

bool ExpandedLoopInfo::is_dynamic() const {
@@ -435,6 +436,14 @@ const std::vector<int64_t>& ExpandedLoopInfo::get_data_sizes() const {
return m_data_sizes;
}

bool ExpandedLoopInfo::is_evaluate_once() const {
return m_evaluate_once;
}

void ExpandedLoopInfo::set_evaluate_once(bool value) {
m_evaluate_once = value;
}

void ExpandedLoopInfo::update_ptr_increments(const std::vector<int64_t>& new_values) {
OPENVINO_ASSERT(new_values.size() == m_ptr_increments.size(), "Failed to update ptr_increments: incompatible counts");
m_ptr_increments.assign(new_values.cbegin(), new_values.end());
11 changes: 5 additions & 6 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -160,7 +160,6 @@ void LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t loop_depth, size_t vector_size) {
const auto FULL_DIM = PortDescriptor::ServiceDimensions::FULL_DIM;
std::vector<ExpressionPort> loop_input_ports, loop_output_ports;
LoopManager::get_io_loop_ports(loop_begin_pos, loop_end_pos, loop_input_ports, loop_output_ports);

@@ -178,8 +177,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
"Failed to broadcast work amount in marking loop");
};

auto is_outside_loop = [&FULL_DIM](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), [&FULL_DIM](size_t lhs) { return lhs == FULL_DIM; });
auto is_outside_loop = [](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value);
};

std::vector<size_t> loop_subtensor;
@@ -192,7 +191,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
subtensor[subtensor.size() - 1] = vector_size;
}

const size_t resizing_value = is_outside_loop(subtensor) ? FULL_DIM : 1;
const size_t resizing_value = is_outside_loop(subtensor) ? utils::get_full_dim_value() : 1;
while (subtensor.size() < loop_depth)
subtensor.insert(subtensor.begin(), resizing_value);
if (loop_subtensor.empty())
@@ -202,7 +201,7 @@
"Incorrect scheduling parameters for loop");

for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
if (*(subtensor.rbegin() + dim_idx) != FULL_DIM) {
if (!utils::is_full_dim_value(*(subtensor.rbegin() + dim_idx))) {
broadcast(loop_tensor, shape, dim_idx);
}
}
@@ -211,7 +210,7 @@
for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
if (subtensor_value == FULL_DIM) {
if (utils::is_full_dim_value(subtensor_value)) {
continue;
}
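
For reference, a subtensor consisting only of full-dim sentinels means the expression is scheduled outside the marked loop; a sketch of the is_outside_loop check with assumed values:

    // Both dims are "full" -> no loop dimension is created for this expression.
    std::vector<size_t> subtensor{utils::get_full_dim_value(), utils::get_full_dim_value()};
    const bool outside = std::all_of(subtensor.begin(), subtensor.end(),
                                     utils::is_full_dim_value);  // true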

src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp
@@ -60,7 +60,7 @@ size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& lo
const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size();
for (size_t i = 0; i < std::min(processing_rank, rank); ++i) {
if (processed_dim_idxs.count(i) == 0) {
if (i < subtensor.size())
if (i < subtensor.size() && !utils::is_full_dim_value(*(subtensor.rbegin() + i)))
allocation_size = utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i)));
else
allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i));
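
The added is_full_dim_value check routes full-dim subtensor values to the shape-based branch, so the huge sentinel is never min'ed against a real dimension. A worked example with assumed values:

    // planar_shape = {2, 128, 64}, subtensor = {FULL_DIM, 32}, no dims processed yet
    // (only the subtensor's two innermost dims are visited):
    // i = 0: subtensor dim = 32       -> allocation_size *= std::min(64, 32) = 32
    // i = 1: subtensor dim = FULL_DIM -> allocation_size *= 128 (full planar-shape dim)
    // => allocation_size = 32 * 128 = 4096 elements
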
src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp
@@ -167,6 +167,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
if (is_decomposed_loop_needed(unified_loop_info, iter_type, remaining_work_amount)) {
const auto work_amount = get_decomposed_loop_work_amount(unified_loop_info, iter_type, remaining_work_amount);
const auto increment = get_decomposed_loop_increment(unified_loop_info, iter_type, remaining_work_amount);
const auto evaluate_once = !utils::is_dynamic_value(work_amount) && work_amount == increment;
// Update remaining Loop work amount
// Note: if work_amount is unknown and increment = 1, the loop will iterate over the whole work amount
if (!is_wa_dynamic || increment == 1) {
@@ -199,7 +200,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
const auto decomposed_loop_info = std::make_shared<ExpandedLoopInfo>(work_amount, increment,
decomposed_loop_entry_ports, decomposed_loop_exit_ports,
decomposed_ptr_increments, decomposed_finalization_offsets,
decomposed_data_sizes, iter_type, unified_loop_info);
decomposed_data_sizes, iter_type, unified_loop_info, false, evaluate_once);
init_decomposed_loop(linear_ir, decomposed_loop_begin_it, decomposed_loop_end_it, decomposed_loop_info, loop_id, decomposed_loop_end);

decomposed = true;
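
The flag is decided statically at decomposition time: a specific iteration whose known work amount equals its increment runs exactly once. An illustrative helper mirroring the condition above:

    auto evaluates_once = [](size_t work_amount, size_t increment) {
        return !utils::is_dynamic_value(work_amount) && work_amount == increment;
    };
    // evaluates_once(16, 16) -> true   (single pass, e.g. an M-tail Brgemm loop)
    // evaluates_once(64, 16) -> false  (4 iterations)
    // evaluates_once(utils::get_dynamic_value<size_t>(), 16) -> false (resolved at runtime)
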
src/common/snippets/src/lowered/pass/optimize_loop_single_evaluation.cpp
@@ -4,6 +4,7 @@

#include "snippets/lowered/pass/optimize_loop_single_evaluation.hpp"

#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/op/loop.hpp"
#include "snippets/utils/utils.hpp"
@@ -16,30 +17,31 @@

bool OptimizeLoopSingleEvaluation::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::OptimizeLoopSingleEvaluation")
const auto& loop_manager = linear_ir.get_loop_manager();

bool is_modified = false;
for (auto expr_it = begin; expr_it != end; ++expr_it) {
const auto& expr = *expr_it;
if (auto loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node())) {
// *1* solo vector/tail loop + empty outer loop
// => skip increments (both counter & ptr) : set evaluate_once flag
// *2* solo vector/tail loop + non-empty outer loop
// => skip counter increments but perform ptr increments : set evaluate_once,
// and perform pointer increments through finalization offsets
// *3* vector loop(s) + one tail loop
// => vector as usual, tail depends on outer loop, see *1* and *2*
if (loop_end->has_dynamic_params() || loop_end->get_work_amount() >= 2 * loop_end->get_increment())
continue;

auto new_finalization_offsets = loop_end->get_finalization_offsets();
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto work_amount_incr = static_cast<int64_t>(loop_end->get_increment());
for (size_t i = 0; i < new_finalization_offsets.size(); i++) {
new_finalization_offsets[i] += ptr_increments[i] * work_amount_incr;
const auto& loop_info = loop_manager->get_loop_info<ExpandedLoopInfo>(loop_end->get_id());
if (loop_info->is_evaluate_once()) {
auto new_finalization_offsets = loop_end->get_finalization_offsets();
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto work_amount_incr = static_cast<int64_t>(loop_end->get_increment());
for (size_t i = 0; i < new_finalization_offsets.size(); i++) {
const auto ptr_shift = utils::dynamic_safe_mul(ptr_increments[i], work_amount_incr);
new_finalization_offsets[i] = utils::dynamic_safe_add(new_finalization_offsets[i], ptr_shift);
}
loop_end->set_finalization_offsets(new_finalization_offsets);
loop_end->set_ptr_increments(std::vector<int64_t>(new_finalization_offsets.size(), 0));
loop_end->set_evaluate_once(true);

// Update the corresponding ExpandedLoopInfo
loop_info->update_ptr_increments(loop_end->get_ptr_increments());
loop_info->update_finalization_offsets(loop_end->get_finalization_offsets());

is_modified = true;
}
loop_end->set_finalization_offsets(new_finalization_offsets);
loop_end->set_ptr_increments(std::vector<int64_t>(new_finalization_offsets.size(), 0));
loop_end->set_evaluate_once(true);
is_modified = true;
}
}
return is_modified;
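
For an evaluate-once loop the pointer advancement of its single iteration is folded into the finalization offsets, after which the per-iteration ptr increments can be zeroed. A standalone sketch of the fold with assumed values:

    // increment = 16, ptr_increments = {1, 64}, finalization_offsets = {-16, 0}
    std::vector<int64_t> ptr_increments{1, 64};
    std::vector<int64_t> offsets{-16, 0};
    const int64_t increment = 16;
    for (size_t i = 0; i < offsets.size(); ++i)
        offsets[i] += ptr_increments[i] * increment;             // -> {0, 1024}
    std::fill(ptr_increments.begin(), ptr_increments.end(), 0);  // no per-iteration shifts remain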