[Snippets] Added single evaluation of Brgemm in Tail Loop by dynamic M #25378

Merged
12 changes: 6 additions & 6 deletions src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -43,7 +43,7 @@ class KernelExecutorBase {
* @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary.
* This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through expression ptr
*/
virtual void update_by_expression(const lowered::ExpressionPtr& expr) = 0;
virtual void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) = 0;
/**
* @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary.
* This method should be called to restore a saved state of the executor, that was configured using update_by_expression().
@@ -70,8 +70,8 @@ class KernelExecutor : public KernelExecutorBase {
explicit KernelExecutor(Conf c) : KernelExecutorBase(), m_config{std::move(c)} {}

// Note: override when final is redundant, but needed to avoid warnings on some compilers
void update_by_expression(const lowered::ExpressionPtr& expr) override final { // NOLINT
update_config(expr, m_config);
void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) override final { // NOLINT
update_config(expr, linear_ir, m_config);
OPENVINO_ASSERT(m_config.is_completed(), "Failed to update kernel config in update_by_expression");
update_kernel(m_config, m_kernel);
OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
@@ -103,7 +103,7 @@ class KernelExecutor : public KernelExecutorBase {

protected:
/*** Updates stored kernel config based on runtime info from expression (e.g. new input shapes). */
virtual void update_config(const lowered::ExpressionPtr& expr, Conf& config) const = 0;
virtual void update_config(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir, Conf& config) const = 0;
/*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is
* performed if necessary. */
virtual void update_kernel(const Conf& c, std::shared_ptr<KernelType>& kernel) const = 0;
@@ -130,9 +130,9 @@ class KernelExecutorTable {
return m_table.at(expr);
}
/*** Updates every registered KernelExecutor in accordance with the corresponding expression */
void update_state() const {
void update_state(const lowered::LinearIRPtr& linear_ir) const {
for (const auto& record : m_table)
record.second->update_by_expression(record.first);
record.second->update_by_expression(record.first, linear_ir);
}

/*** Returns lambda function that contains current state of the table, and restores this state when called */
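The signature change above threads the owning LinearIR into every config update. A minimal sketch of a derived executor that uses the new argument is shown below; `MyConfig`, `MyKernel`, `get_runtime_m()` and `compile_my_kernel()` are hypothetical names introduced only for illustration and are not part of this PR.

```cpp
// Sketch only: a derived executor built on the updated KernelExecutor interface.
class MyKernelExecutor : public KernelExecutor<MyConfig, MyKernel> {
public:
    explicit MyKernelExecutor(MyConfig c) : KernelExecutor(std::move(c)) {}

protected:
    void update_config(const lowered::ExpressionPtr& expr,
                       const lowered::LinearIRPtr& linear_ir,
                       MyConfig& config) const override {
        // Per-expression runtime info (e.g. shapes) is still taken from expr ...
        config.set_m(get_runtime_m(expr));
        // ... while linear_ir now exposes graph-level state (e.g. loop info),
        // which a Brgemm executor needs to detect a single-iteration tail loop.
    }

    void update_kernel(const MyConfig& c, std::shared_ptr<MyKernel>& kernel) const override {
        kernel = compile_my_kernel(c);  // hypothetical compilation helper
    }
};
```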
16 changes: 15 additions & 1 deletion src/common/snippets/include/snippets/lowered/loop_info.hpp
@@ -430,7 +430,8 @@ class ExpandedLoopInfo : public LoopInfo {
ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false);
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const = false,
bool evaluate_once = false);
/**
* @brief Clone LoopInfo with new expressions
* @param expr_map map of new and old expressions
@@ -474,7 +475,18 @@
* @return const ref of `m_data_sizes`
*/
const std::vector<int64_t>& get_data_sizes() const;
/**
* @brief Returns true if the current Loop should be evaluated only once; otherwise returns false
* @return `m_evaluate_once`
*/
bool is_evaluate_once() const;

/**
* @brief Set value of `m_evaluate_once`
* @param value - new value of `m_evaluate_once`
*/
void set_evaluate_once(bool value);
/**
* @brief Update `m_ptr_increments` using copy values from `new_values`.
* The count of new values must be equal to the count of current increments.
@@ -517,6 +529,8 @@

const SpecificLoopIterType m_type = {};
std::shared_ptr<UnifiedLoopInfo> m_unified_loop_info = {};

bool m_evaluate_once = false;
};
using ExpandedLoopInfoPtr = std::shared_ptr<ExpandedLoopInfo>;
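For illustration, a small hedged snippet showing how the new flag could be driven and queried; the helper function below is hypothetical, only `set_evaluate_once()`/`is_evaluate_once()` come from this PR.

```cpp
// Sketch: mark a decomposed loop as "evaluate once" when its work amount is static
// and equal to the increment, then branch on the flag when emitting the loop.
void mark_single_iteration(const ov::snippets::lowered::ExpandedLoopInfoPtr& loop) {
    const auto work_amount = loop->get_work_amount();
    const auto increment = loop->get_increment();
    if (!ov::snippets::utils::is_dynamic_value(work_amount) && work_amount == increment)
        loop->set_evaluate_once(true);

    if (loop->is_evaluate_once()) {
        // The loop body is executed exactly once, so per-iteration pointer
        // increments can be dropped and only finalization offsets are applied.
    }
}
```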

@@ -20,12 +20,6 @@ using PortDescriptorPtr = std::shared_ptr<PortDescriptor>;
class PortDescriptor {
friend class LinearIRBuilder;
public:
// The structure with service values for scheduling parameters
struct ServiceDimensions {
// The value for the subtensor that means that scheduling should be by full dimension
static size_t FULL_DIM;
};

explicit PortDescriptor(const ov::Input<ov::Node>& node,
VectorDims subtensor_shape = {},
std::vector<size_t> layout = {});
@@ -54,6 +48,9 @@ class PortDescriptor {
void set_reg_type(RegType type) { m_reg.type = type; }
void set_reg_idx(size_t idx) { m_reg.idx = idx; }

// Indexing starts from the end (rbegin() + idx)
void set_subtensor_dim(size_t idx, VectorDims::value_type value);

std::string serialize() const;
bool empty() const { return m_layout.empty() && m_subtensor_shape.empty();}
PortDescriptorPtr clone() const;
@@ -87,6 +84,8 @@ class PortDescriptorUtils {
public:
static void set_port_descriptor_ptr(const ov::Input<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor_ptr(const ov::Output<ov::Node>& n, const PortDescriptorPtr& desc);
static void set_port_descriptor(const ov::Input<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});
static void set_port_descriptor(const ov::Output<ov::Node>& n, std::vector<size_t> subtensor, std::vector<size_t> layout = {});

static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<ov::Node>& in);
static PortDescriptorPtr get_port_descriptor_ptr(const ov::Input<const ov::Node>& out);
@@ -116,17 +115,6 @@ class PortDescriptorVectorAttribute : public ov::RuntimeAttribute {
std::vector<PortDescriptorPtr> outputs{};
};

template<typename T>
void set_port_desc(const T& port, std::vector<size_t> subtensor) {
const auto& shape = port.get_shape();
for (size_t i = 1; i <= std::min(subtensor.size(), shape.size()); i++) {
auto& dim = subtensor[subtensor.size() - i];
if (dim != PortDescriptor::ServiceDimensions::FULL_DIM)
dim = std::min(dim, shape[shape.size() - i]);
}
PortDescriptorUtils::set_port_descriptor_ptr(port, std::make_shared<PortDescriptor>(shape, subtensor));
}

} // namespace lowered
} // namespace snippets
} // namespace ov
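The removed free-standing `set_port_desc()` template is effectively replaced by `PortDescriptorUtils::set_port_descriptor()`, while `set_subtensor_dim()` adjusts a single dimension counted from the end. A hedged usage sketch follows; the `matmul` node and the block sizes are hypothetical.

```cpp
// Sketch only: configure blocked subtensors on a MatMul-like node.
void configure_ports(const std::shared_ptr<ov::Node>& matmul, size_t m_block, size_t n_block) {
    using ov::snippets::lowered::PortDescriptorUtils;
    const auto FULL_DIM = ov::snippets::utils::get_full_dim_value();

    // Block over M on the first input, keep the reduction dimension whole.
    PortDescriptorUtils::set_port_descriptor(matmul->input(0), {m_block, FULL_DIM});
    PortDescriptorUtils::set_port_descriptor(matmul->output(0), {m_block, n_block});

    // Later passes can adjust a single dimension in place; indexing starts from
    // the end, so idx = 1 addresses the second-to-last (M) dimension here.
    const auto& desc = PortDescriptorUtils::get_port_descriptor_ptr(matmul->input(0));
    desc->set_subtensor_dim(1, m_block);
}
```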
16 changes: 8 additions & 8 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -61,7 +61,7 @@ class RuntimeConfigurator {
* @param linear_ir LinearIR
* @return updated config
*/
const std::shared_ptr<RuntimeConfig>& get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir);
const std::shared_ptr<RuntimeConfig>& get_updated_config(const lowered::LinearIRPtr& linear_ir);
/*** Returns pointer to KernelExecutorTable owned by the config */
const std::shared_ptr<KernelExecutorTable>& get_kernel_executor_table() const { return m_config->kernel_executor_table; }

@@ -70,43 +70,43 @@
* @brief Update RuntimeConfig based on LinearIR
* @param linear_ir LinearIR
*/
virtual void update(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void update(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator
* @param linear_ir LinearIR
*/
virtual void initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void initialization(const lowered::LinearIRPtr& linear_ir);

/**
* @brief Initializes input and data information of LinearIR:
* descriptors (that contains shapes and layouts) and data_sizes
* @param linear_ir LinearIR
*/
void init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_data_info(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Initializes information of buffers:
* - static buffer_scratchpad_size
* - offsets of static clusters (with static buffers)
* - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for the quick access in `update()`
* @param linear_ir LinearIR
*/
void init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_buffer_info(const lowered::LinearIRPtr& linear_ir);
/**
* @brief Initializes tensor rank of config
* @param linear_ir LinearIR
*/
virtual void init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
virtual void init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo
* @param linear_ir LinearIR
*/
void update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_loop_info(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Update Buffer scratchpad size and offsets if needed
* Note: `update_loop_info` must be called before
* @param linear_ir LinearIR
*/
void update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const;
/**
* @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig
*/
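The RuntimeConfigurator changes are purely mechanical: every `std::shared_ptr<lowered::LinearIR>` in these signatures is replaced with the shorter alias. For context, the alias is assumed to be defined roughly as in the sketch below (not quoted from the PR).

```cpp
namespace ov {
namespace snippets {
namespace lowered {
// Assumed definition of the alias used in the updated signatures above.
using LinearIRPtr = std::shared_ptr<LinearIR>;
}  // namespace lowered
}  // namespace snippets
}  // namespace ov
```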
30 changes: 20 additions & 10 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -21,6 +21,26 @@ namespace ov {
namespace snippets {
namespace utils {

/* --- Special values --- */
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}
template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

// This value means full dimension
// For example, for the subtensor it means that scheduling should be by full dimension
constexpr inline size_t get_full_dim_value() {
return get_dynamic_value<size_t>() - 1;
}
constexpr inline bool is_full_dim_value(size_t value) {
return value == get_full_dim_value();
}
/* ---------------------- */

// Get non-scalar Constant count that will be created after FakeQuantize decomposition.
// This count is needed to know exact count of non-scalar Constants during tokenization.
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<ov::op::v0::FakeQuantize>& fq) -> size_t;
@@ -59,16 +79,6 @@ inline T div_up(const T a, const U b) {
return static_cast<T>((a + b - 1) / b);
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline T get_dynamic_value() {
return std::numeric_limits<T>::max();
}

template<typename T, typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
constexpr inline bool is_dynamic_value(T value) {
return value == get_dynamic_value<T>();
}

inline bool is_dynamic_vdims(const VectorDims& shape) {
return std::any_of(shape.cbegin(), shape.cend(), [](size_t v){ return is_dynamic_value(v); });
}
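A short usage sketch of the special-value helpers; the concrete values follow directly from the definitions above (max() for the dynamic marker, max() - 1 for the full-dimension marker).

```cpp
#include <cassert>
#include <cstddef>

void special_values_example() {
    namespace utils = ov::snippets::utils;

    const size_t dyn  = utils::get_dynamic_value<size_t>();  // std::numeric_limits<size_t>::max()
    const size_t full = utils::get_full_dim_value();         // max() - 1

    // The two markers never collide: a dynamic dimension is not a full dimension and vice versa.
    assert(utils::is_dynamic_value(dyn) && !utils::is_full_dim_value(dyn));
    assert(utils::is_full_dim_value(full) && !utils::is_dynamic_value(full));
}
```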
15 changes: 12 additions & 3 deletions src/common/snippets/src/lowered/loop_info.cpp
@@ -373,10 +373,10 @@ void UnifiedLoopInfo::add_loop_ports(const std::vector<ExpressionPort>& ports) {
ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment,
const std::vector<LoopPort>& entries, const std::vector<LoopPort>& exits,
std::vector<int64_t> ptr_increments, std::vector<int64_t> final_offsets, std::vector<int64_t> data_sizes,
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const)
SpecificLoopIterType type, std::shared_ptr<UnifiedLoopInfo> unified_loop_info, bool is_wa_const, bool evaluate_once)
: LoopInfo(work_amount, increment, entries, exits, is_wa_const),
m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)),
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) {
m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)), m_evaluate_once(evaluate_once) {
validate();
}

@@ -392,7 +392,8 @@ std::shared_ptr<LoopInfo> ExpandedLoopInfo::clone_with_new_expr(const Expression
const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports);

return std::make_shared<ExpandedLoopInfo>(m_work_amount, m_increment, new_input_ports, new_output_ports,
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const);
m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type,
m_unified_loop_info, m_is_work_amount_const, m_evaluate_once);
}

bool ExpandedLoopInfo::is_dynamic() const {
@@ -435,6 +436,14 @@ const std::vector<int64_t>& ExpandedLoopInfo::get_data_sizes() const {
return m_data_sizes;
}

bool ExpandedLoopInfo::is_evaluate_once() const {
return m_evaluate_once;
}

void ExpandedLoopInfo::set_evaluate_once(bool value) {
m_evaluate_once = value;
}

void ExpandedLoopInfo::update_ptr_increments(const std::vector<int64_t>& new_values) {
OPENVINO_ASSERT(new_values.size() == m_ptr_increments.size(), "Failed to update ptr_increments: incompatible counts");
m_ptr_increments.assign(new_values.cbegin(), new_values.end());
11 changes: 5 additions & 6 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -160,7 +160,6 @@ void LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t loop_depth, size_t vector_size) {
const auto FULL_DIM = PortDescriptor::ServiceDimensions::FULL_DIM;
std::vector<ExpressionPort> loop_input_ports, loop_output_ports;
LoopManager::get_io_loop_ports(loop_begin_pos, loop_end_pos, loop_input_ports, loop_output_ports);

@@ -178,8 +177,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
"Failed to broadcast work amount in marking loop");
};

auto is_outside_loop = [&FULL_DIM](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), [&FULL_DIM](size_t lhs) { return lhs == FULL_DIM; });
auto is_outside_loop = [](const std::vector<size_t>& subtensor) {
return std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value);
};

std::vector<size_t> loop_subtensor;
@@ -192,7 +191,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
subtensor[subtensor.size() - 1] = vector_size;
}

const size_t resizing_value = is_outside_loop(subtensor) ? FULL_DIM : 1;
const size_t resizing_value = is_outside_loop(subtensor) ? utils::get_full_dim_value() : 1;
while (subtensor.size() < loop_depth)
subtensor.insert(subtensor.begin(), resizing_value);
if (loop_subtensor.empty())
@@ -202,7 +201,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
"Incorrect scheduling parameters for loop");

for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
if (*(subtensor.rbegin() + dim_idx) != FULL_DIM) {
if (!utils::is_full_dim_value(*(subtensor.rbegin() + dim_idx))) {
broadcast(loop_tensor, shape, dim_idx);
}
}
@@ -211,7 +210,7 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,
for (size_t dim_idx = 0; dim_idx < loop_depth; ++dim_idx) {
OPENVINO_ASSERT(dim_idx < loop_subtensor.size(), "Incorrect indexes of Loop for markup");
const auto& subtensor_value = *(loop_subtensor.rbegin() + dim_idx);
if (subtensor_value == FULL_DIM) {
if (utils::is_full_dim_value(subtensor_value)) {
continue;
}

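To make the padding rule in this hunk concrete, here is a standalone mirror of the logic (a hypothetical helper, shown only for illustration): ports whose subtensor consists solely of full-dimension markers are treated as outside the loop and are padded with FULL_DIM; all other ports are padded with 1 up to loop_depth.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical mirror of mark_loop()'s subtensor padding (not part of the PR).
std::vector<size_t> pad_subtensor(std::vector<size_t> subtensor, size_t loop_depth) {
    namespace utils = ov::snippets::utils;
    const bool outside_loop =
        std::all_of(subtensor.begin(), subtensor.end(), utils::is_full_dim_value);
    const size_t pad = outside_loop ? utils::get_full_dim_value() : size_t(1);
    while (subtensor.size() < loop_depth)
        subtensor.insert(subtensor.begin(), pad);
    return subtensor;
}
// pad_subtensor({32}, 2)        -> {1, 32}
// pad_subtensor({FULL_DIM}, 2)  -> {FULL_DIM, FULL_DIM}
```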
@@ -60,7 +60,7 @@ size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& lo
const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size();
for (size_t i = 0; i < std::min(processing_rank, rank); ++i) {
if (processed_dim_idxs.count(i) == 0) {
if (i < subtensor.size())
if (i < subtensor.size() && !utils::is_full_dim_value(*(subtensor.rbegin() + i)))
allocation_size = utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i)));
else
allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i));
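In other words, a subtensor dimension now contributes to the allocation size only when it carries a concrete value; a full-dimension marker (or a missing subtensor entry) falls back to the planar shape. A small hedged restatement of the per-dimension rule:

```cpp
#include <algorithm>
#include <cstddef>

// Sketch (not the PR code): contribution of one non-processed dimension
// to the Buffer allocation size.
size_t allocation_dim(size_t planar_dim, size_t subtensor_dim, bool has_subtensor_dim) {
    namespace utils = ov::snippets::utils;
    if (has_subtensor_dim && !utils::is_full_dim_value(subtensor_dim))
        return std::min(planar_dim, subtensor_dim);  // bounded by the scheduled block
    return planar_dim;                               // full dimension is allocated
}
```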
@@ -167,6 +167,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
if (is_decomposed_loop_needed(unified_loop_info, iter_type, remaining_work_amount)) {
const auto work_amount = get_decomposed_loop_work_amount(unified_loop_info, iter_type, remaining_work_amount);
const auto increment = get_decomposed_loop_increment(unified_loop_info, iter_type, remaining_work_amount);
const auto evaluate_once = !utils::is_dynamic_value(work_amount) && work_amount == increment;
// Update remaining Loop work amount
// Note: if work_amount is unknown and increment = 1, it means that a loop will iterate by whole work_amount
if (!is_wa_dynamic || increment == 1) {
@@ -199,7 +200,7 @@ bool InsertSpecificIterations::decompose(LinearIR& linear_ir, LinearIR::constExp
const auto decomposed_loop_info = std::make_shared<ExpandedLoopInfo>(work_amount, increment,
decomposed_loop_entry_ports, decomposed_loop_exit_ports,
decomposed_ptr_increments, decomposed_finalization_offsets,
decomposed_data_sizes, iter_type, unified_loop_info);
decomposed_data_sizes, iter_type, unified_loop_info, false, evaluate_once);
init_decomposed_loop(linear_ir, decomposed_loop_begin_it, decomposed_loop_end_it, decomposed_loop_info, loop_id, decomposed_loop_end);

decomposed = true;
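A brief worked check of the new condition (numbers are illustrative): with work_amount = 32 and increment = 32 the tail loop body runs exactly once, so evaluate_once is true; with a dynamic work_amount the flag stays false because the equality cannot be proven statically.

```cpp
// Standalone restatement of the condition computed above (sketch only).
bool evaluate_once(size_t work_amount, size_t increment) {
    return !ov::snippets::utils::is_dynamic_value(work_amount) && work_amount == increment;
}
// evaluate_once(32, 32)                                               -> true
// evaluate_once(ov::snippets::utils::get_dynamic_value<size_t>(), 32) -> false
```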