[Snippets] Added single evaluation of Brgemm in Tail Loop by dynamic M #25378

Merged
12 changes: 6 additions & 6 deletions src/common/snippets/include/snippets/kernel_executor_table.hpp
@@ -43,7 +43,7 @@ class KernelExecutorBase {
      * @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary.
      *        This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through the expression ptr
      */
-    virtual void update_by_expression(const lowered::ExpressionPtr& expr) = 0;
+    virtual void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) = 0;
     /**
      * @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary.
      *        This method should be called to restore a saved state of the executor that was configured using update_by_expression().
@@ -70,8 +70,8 @@ class KernelExecutor : public KernelExecutorBase {
     explicit KernelExecutor(Conf c) : KernelExecutorBase(), m_config{std::move(c)} {}
 
     // Note: `override` is redundant when `final` is present, but it is needed to avoid warnings on some compilers
-    void update_by_expression(const lowered::ExpressionPtr& expr) override final { // NOLINT
-        update_config(expr, m_config);
+    void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir) override final { // NOLINT
+        update_config(expr, linear_ir, m_config);
         OPENVINO_ASSERT(m_config.is_completed(), "Failed to update kernel config in update_by_expression");
         update_kernel(m_config, m_kernel);
         OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
@@ -103,7 +103,7 @@ class KernelExecutor : public KernelExecutorBase {
 
 protected:
     /*** Updates stored kernel config based on runtime info from the expression (e.g. new input shapes). */
-    virtual void update_config(const lowered::ExpressionPtr& expr, Conf& config) const = 0;
+    virtual void update_config(const lowered::ExpressionPtr& expr, const lowered::LinearIRPtr& linear_ir, Conf& config) const = 0;
     /*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is
      *   performed if necessary. */
     virtual void update_kernel(const Conf& c, std::shared_ptr<KernelType>& kernel) const = 0;
@@ -130,9 +130,9 @@ class KernelExecutorTable {
         return m_table.at(expr);
     }
     /*** Updates every registered KernelExecutor in accordance with the corresponding expression */
-    void update_state() const {
+    void update_state(const lowered::LinearIRPtr& linear_ir) const {
         for (const auto& record : m_table)
-            record.second->update_by_expression(record.first);
+            record.second->update_by_expression(record.first, linear_ir);
     }
 
     /*** Returns lambda function that contains the current state of the table, and restores this state when called */
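
To illustrate the new contract, here is a minimal sketch of a derived executor against the updated interface. `MyConfig`, `MyKernel`, and `MyExecutor` are hypothetical placeholder names, not part of this PR; whether the config type must additionally derive from a base config class is elided here.

#include <memory>
#include <utility>

#include "snippets/kernel_executor_table.hpp"

using namespace ov::snippets;

// Hypothetical config and kernel types; real executors define their own.
struct MyConfig {
    bool is_completed() const { return true; }  // required by the assert in update_by_expression
};
struct MyKernel { /* handle to compiled code */ };

class MyExecutor : public KernelExecutor<MyConfig, MyKernel> {
public:
    explicit MyExecutor(MyConfig c) : KernelExecutor(std::move(c)) {}

protected:
    // The LinearIR is now threaded through, so the executor can consult
    // whole-IR state (e.g. the LoopManager), not only the expression itself.
    void update_config(const lowered::ExpressionPtr& expr,
                       const lowered::LinearIRPtr& linear_ir,
                       MyConfig& config) const override {
        (void)expr; (void)linear_ir;
        // ... read runtime shapes / loop info and fill `config` ...
    }

    void update_kernel(const MyConfig& c, std::shared_ptr<MyKernel>& kernel) const override {
        if (!kernel)
            kernel = std::make_shared<MyKernel>();  // recompile with `c` if needed
        (void)c;
    }
};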
16 changes: 8 additions & 8 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -61,7 +61,7 @@ class RuntimeConfigurator {
      * @param linear_ir LinearIR
      * @return updated config
      */
-    const std::shared_ptr<RuntimeConfig>& get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    const std::shared_ptr<RuntimeConfig>& get_updated_config(const lowered::LinearIRPtr& linear_ir);
     /*** Returns pointer to KernelExecutorTable owned by the config */
     const std::shared_ptr<KernelExecutorTable>& get_kernel_executor_table() const { return m_config->kernel_executor_table; }
 
@@ -70,43 +70,43 @@
      * @brief Update RuntimeConfig based on LinearIR
      * @param linear_ir LinearIR
      */
-    virtual void update(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    virtual void update(const lowered::LinearIRPtr& linear_ir);
     /**
      * @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator
      * @param linear_ir LinearIR
      */
-    virtual void initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    virtual void initialization(const lowered::LinearIRPtr& linear_ir);
 
     /**
      * @brief Initializes input and output data information of LinearIR:
      *        descriptors (that contain shapes and layouts) and data_sizes
      * @param linear_ir LinearIR
      */
-    void init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    void init_data_info(const lowered::LinearIRPtr& linear_ir);
     /**
      * @brief Initializes information of buffers:
      *        - static buffer_scratchpad_size
      *        - offsets of static clusters (with static buffers)
      *        - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for quick access in `update()`
      * @param linear_ir LinearIR
      */
-    void init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
+    void init_buffer_info(const lowered::LinearIRPtr& linear_ir);
     /**
      * @brief Initializes tensor rank of config
      * @param linear_ir LinearIR
      */
-    virtual void init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
+    virtual void init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const;
     /**
      * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo
      * @param linear_ir LinearIR
      */
-    void update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
+    void update_loop_info(const lowered::LinearIRPtr& linear_ir) const;
     /**
      * @brief Update Buffer scratchpad size and offsets if needed
      *        Note: `update_loop_info` must be called beforehand
      * @param linear_ir LinearIR
      */
-    void update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
+    void update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const;
     /**
      * @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig
      */
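
Throughout these headers the spelled-out `std::shared_ptr<lowered::LinearIR>` is replaced with the `LinearIRPtr` alias. The alias itself is not shown in this diff; presumably it is declared next to `LinearIR`, roughly as in this sketch (the exact header and location are assumptions):

#include <memory>

namespace ov {
namespace snippets {
namespace lowered {

class LinearIR;
// Shorthand used by the new signatures in this PR (assumed declaration).
using LinearIRPtr = std::shared_ptr<LinearIR>;

}  // namespace lowered
}  // namespace snippets
}  // namespace ov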
@@ -107,8 +107,8 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
     VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
     for (size_t i = 0; i < new_subtensor.size(); ++i) {
         // If the user forces a dynamic value in the subtensor, set a real dynamic dimension using `get_dynamic_value<size_t>()`
-        new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value<size_t>()
-                                                                    : std::min(new_subtensor[i], subtensor[i]);
+        new_subtensor[i] = new_subtensor[i] == FORCED_DYNAMIC_VALUE ? utils::get_dynamic_value<size_t>() :
+                           utils::is_full_dim_value(subtensor[i]) ? subtensor[i] : std::min(new_subtensor[i], subtensor[i]);
     }
     desc->set_subtensor(new_subtensor);
 }
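
The reworked ternary adds one case to the propagation rule: if the stored subtensor dimension is a full-dim marker, it must be kept as-is instead of being clamped by `std::min`. A standalone sketch of the per-dimension decision (the sentinel values here are illustrative stand-ins for the real `snippets::utils` helpers):

#include <algorithm>
#include <cstddef>
#include <limits>

// Illustrative sentinels; the real values come from snippets::utils.
constexpr size_t FORCED_DYNAMIC_VALUE = std::numeric_limits<size_t>::max();
constexpr size_t DYNAMIC_VALUE = FORCED_DYNAMIC_VALUE - 1;
constexpr size_t FULL_DIM = FORCED_DYNAMIC_VALUE - 2;

size_t propagate_dim(size_t planar_dim, size_t stored_dim) {
    if (planar_dim == FORCED_DYNAMIC_VALUE)
        return DYNAMIC_VALUE;                     // user forced a dynamic dimension
    if (stored_dim == FULL_DIM)
        return stored_dim;                        // new branch: keep the full-dim marker intact
    return std::min(planar_dim, stored_dim);      // previous behavior
}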
2 changes: 1 addition & 1 deletion src/common/snippets/src/op/subgraph.cpp
@@ -552,7 +552,7 @@ snippets::Schedule Subgraph::generate(const void* compile_params) const {
         exec_table->replace_key_expression(expression_map.at(expr.get()), expr);
     // Some kernel executors might've been registered during code emission.
     // We need to update them, so appropriate kernels will be compiled.
-    exec_table->update_state();
+    exec_table->update_state(m_linear_ir);
     return {std::move(lowering_result)};
 }
 
26 changes: 14 additions & 12 deletions src/common/snippets/src/runtime_configurator.cpp
@@ -35,7 +35,7 @@ RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr<RuntimeConfig> c) :
     OPENVINO_ASSERT(m_config, "Runtime config is nullptr!");
 }
 
-const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
+const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const lowered::LinearIRPtr& linear_ir) {
     // First initialization
     if (m_io_num == 0)
         initialization(linear_ir);
@@ -44,7 +44,7 @@ const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
     return m_config;
 }
 
-void RuntimeConfigurator::initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
+void RuntimeConfigurator::initialization(const lowered::LinearIRPtr& linear_ir) {
     init_data_info(linear_ir);
     init_tensor_rank(linear_ir);
     init_buffer_info(linear_ir);
@@ -55,7 +55,7 @@ void RuntimeConfigurator::initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
     m_config->tile_rank = linear_ir->get_config().m_loop_depth;
 }
 
-void RuntimeConfigurator::update(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
+void RuntimeConfigurator::update(const lowered::LinearIRPtr& linear_ir) {
     if (linear_ir->is_dynamic()) {
         update_loop_info(linear_ir);
         update_buffer_scratchpad_size(linear_ir);
@@ -67,11 +67,11 @@ void RuntimeConfigurator::update(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
     update_latest_shapes();
 }
 
-void RuntimeConfigurator::init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
+void RuntimeConfigurator::init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const {
     m_config->tensor_rank = linear_ir->get_master_shape().size();
 }
 
-void RuntimeConfigurator::init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
+void RuntimeConfigurator::init_data_info(const lowered::LinearIRPtr& linear_ir) {
     const auto& parameters = linear_ir->get_parameters();
     const auto& results = linear_ir->get_results();
     m_in_num = parameters.size();
@@ -113,7 +113,7 @@ void RuntimeConfigurator::init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
     }
 }
 
-void RuntimeConfigurator::init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
+void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRPtr& linear_ir) {
     std::map<size_t, std::set<lowered::ExpressionPtr>> dynamic_buffer_clusters, static_buffer_clusters;
 
     // All needed checks are in Validate pass
@@ -143,7 +143,7 @@ void RuntimeConfigurator::init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
     m_dynamic_buffer_clusters = std::move(dynamic_buffer_clusters);
 }
 
-void RuntimeConfigurator::update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
+void RuntimeConfigurator::update_loop_info(const lowered::LinearIRPtr& linear_ir) const {
     // Initialized UnifiedLoopInfo
     struct CurrentUnifiedLoopInfo {
         size_t current_work_amount = 0;
@@ -180,25 +180,27 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
             continue;
         }
 
-        expanded_loop_info->set_work_amount(
-            lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount));
+        const auto work_amount =
+            lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount);
+        expanded_loop_info->set_work_amount(work_amount);
         // Update remaining Loop work amount
-        current_work_amount -= expanded_loop_info->get_work_amount();
+        current_work_amount -= work_amount;
 
         // Update only `finalization offsets`. `Ptr increments` are always zeroed in this case
         auto updated_finalization_offsets = current_work_amount > 0 ? std::vector<int64_t>(finalization_offsets.size(), 0) : finalization_offsets;
         if (expanded_loop_info->is_evaluate_once()) {
+            expanded_loop_info->set_increment(work_amount);
             // work_amount is equal to increment in cases with `evaluate_once`
             for (size_t i = 0; i < updated_finalization_offsets.size(); ++i)
-                updated_finalization_offsets[i] += ptr_increments[i] * expanded_loop_info->get_work_amount();
+                updated_finalization_offsets[i] += ptr_increments[i] * work_amount;
         } else {
             expanded_loop_info->update_ptr_increments(ptr_increments);
         }
         expanded_loop_info->update_finalization_offsets(updated_finalization_offsets);
     }
 }
 
-void RuntimeConfigurator::update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
+void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRPtr& linear_ir) const {
     const auto& loop_manager = linear_ir->get_loop_manager();
     m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size();
 
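
To make the `evaluate_once` branch above concrete, here is a worked example with made-up numbers. Such a loop executes its body exactly once with `increment == work_amount`, so the pointer advance that the loop body would normally perform is folded into the finalization offsets:

#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    // Hypothetical tail loop: dynamic work amount of 3, evaluated once.
    const size_t work_amount = 3;
    std::vector<int64_t> ptr_increments = {1, 0, 1};
    std::vector<int64_t> finalization_offsets = {-16, 0, -16};

    // Mirrors the loop above: fold the single iteration's pointer advance
    // into the finalization offsets.
    for (size_t i = 0; i < finalization_offsets.size(); ++i)
        finalization_offsets[i] += ptr_increments[i] * static_cast<int64_t>(work_amount);

    // finalization_offsets is now {-13, 0, -13}.
    return 0;
}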
@@ -4,7 +4,6 @@
 
 #include "emitters/snippets/cpu_runtime_configurator.hpp"
 
-#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
 #include "snippets/utils/utils.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 
@@ -15,16 +14,14 @@ namespace intel_cpu {
 CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared<CPURuntimeConfig>()) {
 }
 
-void CPURuntimeConfigurator::update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) {
+void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRPtr& linear_ir) {
     if (linear_ir->is_dynamic()) {
-        const auto& loop_manager = linear_ir->get_loop_manager();
         update_loop_info(linear_ir);
-        update_loop_args(loop_manager);
-        // Update Brgemm should be before `update_buffer_scratchpad_size`
-        // because `ComputeAllocationSize` depends on subtensors which are updated in `update_brgemms`
-        update_brgemms(loop_manager);
+        update_loop_args(linear_ir);
+        // Update KernelExecutor Table should be before `update_buffer_scratchpad_size`
+        // because `ComputeAllocationSize` depends on subtensors which are updated in the table
+        get_kernel_executor_table()->update_state(linear_ir);
         update_buffer_scratchpad_size(linear_ir);
-        get_kernel_executor_table()->update_state();
     }
 
     m_config->master_shape = linear_ir->get_master_shape();
@@ -33,45 +30,15 @@ void CPURuntimeConfigurator::update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) {
     update_latest_shapes();
 }
 
-void CPURuntimeConfigurator::initialization(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) {
-    RuntimeConfigurator::initialization(linear_ir);
-
-    for (const auto& expr : *linear_ir) {
-        if (ov::is_type<ov::intel_cpu::BrgemmCPU>(expr->get_node())) {
-            const auto& in0_desc = expr->get_input_port_descriptor(0);
-            const auto& in1_desc = expr->get_input_port_descriptor(1);
-            const auto& out_desc = expr->get_output_port_descriptor(0);
-
-            const auto& in0_subtensor = in0_desc->get_subtensor();
-            const auto& in1_subtensor = in1_desc->get_subtensor();
-            const auto& out_subtensor = out_desc->get_subtensor();
-
-            // TODO [146125]: At the moment only blocking by dynamic M is supported
-            //                So we save Brgemm with only dynamic M
-            //                If there are other dynamic dimensions, throw exception for now
-            OPENVINO_ASSERT(!snippets::utils::is_dynamic_value(*in0_subtensor.crbegin()) &&
-                            !snippets::utils::is_dynamic_value(*in1_subtensor.crbegin()) &&
-                            !snippets::utils::is_dynamic_value(*(++in1_subtensor.crbegin())) &&
-                            !snippets::utils::is_dynamic_value(*out_subtensor.crbegin()),
-                            "CPURuntimeConfigurator supports only dynamic M in Brgemm subtensors");
-            OPENVINO_ASSERT(*(++in0_subtensor.crbegin()) == *(++out_subtensor.crbegin()),
-                            "Incorrect values in subtensors of BrgemmCPU");
-
-            if (snippets::utils::is_dynamic_value(*(++in0_subtensor.crbegin())))
-                m_dynamic_brgemms.insert(expr);
-        }
-    }
-}
-
-void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) const {
+void CPURuntimeConfigurator::init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const {
     m_config->tensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D);
 }
 
-void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const {
+void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const {
     const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
     OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig");
 
-    const auto& loop_map = loop_manager->get_map();
+    const auto& loop_map = linear_ir->get_loop_manager()->get_map();
     cpu_config->loop_args.resize(loop_map.size());
     for (const auto& loop : loop_map) {
         const auto& idx = loop.first;
@@ -90,18 +57,5 @@ }
     }
 }
 
-void CPURuntimeConfigurator::update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const {
-    for (const auto& brgemm_expr : m_dynamic_brgemms) {
-        const auto& loop_ids = brgemm_expr->get_loop_ids();
-        OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops");
-        // TODO [146125]: Loop by M is first one in `loop_ids`
-        const auto& expanded_loop_info = loop_manager->get_loop_info<snippets::lowered::ExpandedLoopInfo>(loop_ids.front());
-        const auto& block_size_m = expanded_loop_info->get_work_amount();
-
-        brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m);
-        brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m);
-    }
-}
-
 } // namespace intel_cpu
 } // namespace ov
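
With `update_brgemms()` and `m_dynamic_brgemms` gone, the per-Brgemm subtensor update is expected to live in the Brgemm kernel executor, which now receives the LinearIR when `update_state(linear_ir)` walks the table. A hedged sketch of such an `update_config`, modeled on the removed code: `BrgemmKernelExecutor`, `BrgemmKernelConfig`, and `update_M` are assumed names, and the M-blocking loop is assumed to be `loop_ids.front()` as in the old TODO.

// Sketch only: assumes the usual snippets/intel_cpu headers are included.
void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr,
                                         const ov::snippets::lowered::LinearIRPtr& linear_ir,
                                         BrgemmKernelConfig& config) const {
    const auto& loop_ids = expr->get_loop_ids();
    OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops");
    // As in the removed code, the M-blocking loop is assumed to come first.
    const auto& loop_info =
        linear_ir->get_loop_manager()->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_ids.front());
    const auto block_size_m = loop_info->get_work_amount();

    // Keep the subtensors in sync so `ComputeAllocationSize` sees the tail size...
    expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m);
    expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m);

    // ...and record the updated M so the kernel is recompiled for it (hypothetical setter).
    config.update_M(block_size_m);
}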
@@ -29,31 +29,19 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
      * @brief Update RuntimeConfig based on LinearIR
      * @param linear_ir LinearIR
      */
-    void update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
-    /**
-     * @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator
-     * @param linear_ir LinearIR
-     */
-    void initialization(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) override;
+    void update(const ov::snippets::lowered::LinearIRPtr& linear_ir) override;
     /**
      * @brief Initializes tensor rank of config
      * @param linear_ir LinearIR
      */
-    void init_tensor_rank(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) const override;
+    void init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const override;
     /**
      * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
-     * @param loop_manager Loop Manager
-     */
-    void update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
-    /**
-     * @brief Update subtensors of Brgemms
-     * @param loop_manager Loop Manager
+     * @param linear_ir LinearIR
      */
-    void update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const;
+    void update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const;
 
     const size_t rank6D = 6;
-    // Brgemm expressions with subtensors with dynamic values
-    std::unordered_set<ov::snippets::lowered::ExpressionPtr> m_dynamic_brgemms = {};
 };
 
 } // namespace intel_cpu
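
End to end, a caller now drives the whole refresh through a single entry point; a minimal usage sketch (the `allocate_scratchpad` helper is hypothetical):

// Hypothetical driver code: on each inference with new shapes, the plugin asks
// the configurator for an updated config; in the dynamic case this internally
// refreshes loop info, kernel executors, and scratchpad sizes, in that order.
const auto& config = cpu_runtime_configurator->get_updated_config(linear_ir);
allocate_scratchpad(config->buffer_scratchpad_size);  // hypothetical helper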