Skip to content

Commit

Permalink
[Snippets] Moved UpdateBrgemms to KernelExecutor::update_config
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Jul 25, 2024
1 parent 6b22577 commit 7b0504c
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 141 deletions.
12 changes: 6 additions & 6 deletions src/common/snippets/include/snippets/kernel_executor_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class KernelExecutorBase {
* @brief Update current kernel config in accordance with the passed expression. Corresponding kernel is recompiled if necessary.
* This method should be called to update KernelExecutor based on runtime info (e.g. shapes) available through expression ptr
*/
virtual void update_by_expression(const lowered::ExpressionPtr& expr) = 0;
virtual void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIR& linear_ir) = 0;
/**
* @brief Replace current kernel config with the provided value. Corresponding kernel is recompiled if necessary.
* This method should be called to restore a saved state of the executor, that was configured using update_by_expression().
Expand All @@ -70,8 +70,8 @@ class KernelExecutor : public KernelExecutorBase {
explicit KernelExecutor(Conf c) : KernelExecutorBase(), m_config{std::move(c)} {}

// Note: override when final is redundant, but needed to avoid warnings on some compilers
void update_by_expression(const lowered::ExpressionPtr& expr) override final { // NOLINT
update_config(expr, m_config);
void update_by_expression(const lowered::ExpressionPtr& expr, const lowered::LinearIR& linear_ir) override final { // NOLINT
update_config(expr, linear_ir, m_config);
OPENVINO_ASSERT(m_config.is_completed(), "Failed to update kernel config in update_by_expression");
update_kernel(m_config, m_kernel);
OPENVINO_ASSERT(m_kernel, "Failed to compile kernel executor");
Expand Down Expand Up @@ -103,7 +103,7 @@ class KernelExecutor : public KernelExecutorBase {

protected:
/*** Updates stored kernel config based on runtime info from expression (e.g. new input shapes). */
virtual void update_config(const lowered::ExpressionPtr& expr, Conf& config) const = 0;
virtual void update_config(const lowered::ExpressionPtr& expr, const lowered::LinearIR& linear_ir, Conf& config) const = 0;
/*** Updates stored kernel in accordance with the passed config. Recompilation of the kernel is
* performed if necessary. */
virtual void update_kernel(const Conf& c, std::shared_ptr<KernelType>& kernel) const = 0;
Expand All @@ -130,9 +130,9 @@ class KernelExecutorTable {
return m_table.at(expr);
}
/*** Updates every registered KernelExecutor in accordance with the corresponding expression */
void update_state() const {
void update_state(const lowered::LinearIR& linear_ir) const {
for (const auto& record : m_table)
record.second->update_by_expression(record.first);
record.second->update_by_expression(record.first, linear_ir);
}

/*** Returns lambda function that contains current state of the table, and restores this state when called */
Expand Down
16 changes: 8 additions & 8 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class RuntimeConfigurator {
* @param linear_ir LinearIR
* @return updated config
*/
const std::shared_ptr<RuntimeConfig>& get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir);
const std::shared_ptr<RuntimeConfig>& get_updated_config(const lowered::LinearIR& linear_ir);
/*** Returns pointer to KernelExecutorTable owned by the config */
const std::shared_ptr<KernelExecutorTable>& get_kernel_executor_table() const { return m_config->kernel_executor_table; }

Expand All @@ -70,43 +70,43 @@ class RuntimeConfigurator {
* @brief Update RuntimeConfig based on LinearIR
* @param linear_ir LinearIR
*/
virtual void update(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void update(const lowered::LinearIR& linear_ir);
/**
* @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator
* @param linear_ir LinearIR
*/
virtual void initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir);
virtual void initialization(const lowered::LinearIR& linear_ir);

/**
* @brief Initializes input and data information of LinearIR:
* descriptors (that contains shapes and layouts) and data_sizes
* @param linear_ir LinearIR
*/
void init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_data_info(const lowered::LinearIR& linear_ir);
/**
* @brief Initializes information of buffers:
* - static buffer_scratchpad_size
* - offsets of static clusters (with static buffers)
* - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for the quick access in `update()`
* @param linear_ir LinearIR
*/
void init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir);
void init_buffer_info(const lowered::LinearIR& linear_ir);
/**
* @brief Initializes tensor rank of config
* @param linear_ir LinearIR
*/
virtual void init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
virtual void init_tensor_rank(const lowered::LinearIR& linear_ir) const;
/**
* @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo
* @param linear_ir LinearIR
*/
void update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_loop_info(const lowered::LinearIR& linear_ir) const;
/**
* @brief Update Buffer scratchpad size and offsets if needed
* Note: `update_loop_info` must be called before
* @param linear_ir LinearIR
*/
void update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const;
void update_buffer_scratchpad_size(const lowered::LinearIR& linear_ir) const;
/**
* @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig
*/
Expand Down
4 changes: 2 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -552,14 +552,14 @@ snippets::Schedule Subgraph::generate(const void* compile_params) const {
exec_table->replace_key_expression(expression_map.at(expr.get()), expr);
// Some kernel executors might've been registered during code emission.
// We need to update them, so appropriate kernels will be compiled.
exec_table->update_state();
exec_table->update_state(*m_linear_ir);
return {std::move(lowering_result)};
}

const std::shared_ptr<RuntimeConfig>& Subgraph::update_runtime_config() const {
OPENVINO_ASSERT(m_generator, "Generator has not been inited!");
OPENVINO_ASSERT(m_linear_ir, "LoweredLinearIR has not been inited!");
return m_generator->get_target_machine()->get_runtime_configurator()->get_updated_config(m_linear_ir);
return m_generator->get_target_machine()->get_runtime_configurator()->get_updated_config(*m_linear_ir);
}

void Subgraph::print() const {
Expand Down
48 changes: 25 additions & 23 deletions src/common/snippets/src/runtime_configurator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr<RuntimeConfig> c) :
OPENVINO_ASSERT(m_config, "Runtime config is nullptr!");
}

const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(const lowered::LinearIR& linear_ir) {
// First initialization
if (m_io_num == 0)
initialization(linear_ir);
Expand All @@ -44,36 +44,36 @@ const std::shared_ptr<RuntimeConfig>& RuntimeConfigurator::get_updated_config(co
return m_config;
}

void RuntimeConfigurator::initialization(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
void RuntimeConfigurator::initialization(const lowered::LinearIR& linear_ir) {
init_data_info(linear_ir);
init_tensor_rank(linear_ir);
init_buffer_info(linear_ir);

OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results");
m_latest_shapes.resize(m_io_num);
m_config->io_data_offsets.resize(m_io_num);
m_config->tile_rank = linear_ir->get_config().m_loop_depth;
m_config->tile_rank = linear_ir.get_config().m_loop_depth;
}

void RuntimeConfigurator::update(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
if (linear_ir->is_dynamic()) {
void RuntimeConfigurator::update(const lowered::LinearIR& linear_ir) {
if (linear_ir.is_dynamic()) {
update_loop_info(linear_ir);
update_buffer_scratchpad_size(linear_ir);
}

m_config->master_shape = linear_ir->get_master_shape();
m_config->master_shape = linear_ir.get_master_shape();

update_data_offsets();
update_latest_shapes();
}

void RuntimeConfigurator::init_tensor_rank(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
m_config->tensor_rank = linear_ir->get_master_shape().size();
void RuntimeConfigurator::init_tensor_rank(const lowered::LinearIR& linear_ir) const {
m_config->tensor_rank = linear_ir.get_master_shape().size();
}

void RuntimeConfigurator::init_data_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
const auto& parameters = linear_ir->get_parameters();
const auto& results = linear_ir->get_results();
void RuntimeConfigurator::init_data_info(const lowered::LinearIR& linear_ir) {
const auto& parameters = linear_ir.get_parameters();
const auto& results = linear_ir.get_results();
m_in_num = parameters.size();
m_io_num = m_in_num + results.size();
m_io_descs.reserve(m_io_num);
Expand Down Expand Up @@ -113,11 +113,11 @@ void RuntimeConfigurator::init_data_info(const std::shared_ptr<lowered::LinearIR
}
}

void RuntimeConfigurator::init_buffer_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) {
void RuntimeConfigurator::init_buffer_info(const lowered::LinearIR& linear_ir) {
std::map<size_t, std::set<lowered::ExpressionPtr>> dynamic_buffer_clusters, static_buffer_clusters;

// All needed checks are in Validate pass
const auto& buffer_expressions = linear_ir->get_buffers();
const auto& buffer_expressions = linear_ir.get_buffers();
for (const auto& buffer_expr : buffer_expressions) {
const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR");
Expand All @@ -128,7 +128,7 @@ void RuntimeConfigurator::init_buffer_info(const std::shared_ptr<lowered::Linear
}

const auto cluster_count = dynamic_buffer_clusters.size() + static_buffer_clusters.size();
m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size();
m_config->buffer_scratchpad_size = linear_ir.get_static_buffer_scratchpad_size();
m_config->buffer_cluster_offsets.resize(cluster_count, utils::get_dynamic_value<size_t>());

for (const auto& p : static_buffer_clusters) {
Expand All @@ -143,7 +143,7 @@ void RuntimeConfigurator::init_buffer_info(const std::shared_ptr<lowered::Linear
m_dynamic_buffer_clusters = std::move(dynamic_buffer_clusters);
}

void RuntimeConfigurator::update_loop_info(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
void RuntimeConfigurator::update_loop_info(const lowered::LinearIR& linear_ir) const {
// Initialized UnifiedLoopInfo
struct CurrentUnifiedLoopInfo {
size_t current_work_amount = 0;
Expand All @@ -152,7 +152,7 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptr<lowered::Linear
};
std::unordered_map<lowered::UnifiedLoopInfoPtr, CurrentUnifiedLoopInfo> initializated_info_map;

const auto& loop_map = linear_ir->get_loop_manager()->get_map();
const auto& loop_map = linear_ir.get_loop_manager()->get_map();
for (const auto& p : loop_map) {
const auto& expanded_loop_info = ov::as_type_ptr<lowered::ExpandedLoopInfo>(p.second);
OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager");
Expand Down Expand Up @@ -180,27 +180,29 @@ void RuntimeConfigurator::update_loop_info(const std::shared_ptr<lowered::Linear
continue;
}

expanded_loop_info->set_work_amount(
lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount));
const auto work_amount =
lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount);
expanded_loop_info->set_work_amount(work_amount);
// Update remaining Loop work amount
current_work_amount -= expanded_loop_info->get_work_amount();
current_work_amount -= work_amount;

// Update only `finalization offsets`. `Ptr increments` are always zeroed in this case
auto updated_finalization_offsets = current_work_amount > 0 ? std::vector<int64_t>(finalization_offsets.size(), 0) : finalization_offsets;
if (expanded_loop_info->is_evaluate_once()) {
expanded_loop_info->set_increment(work_amount);
// work_amount is equal to increment in cases with `evaluate_once`
for (size_t i = 0; i < updated_finalization_offsets.size(); ++i)
updated_finalization_offsets[i] += ptr_increments[i] * expanded_loop_info->get_work_amount();
updated_finalization_offsets[i] += ptr_increments[i] * work_amount;
} else {
expanded_loop_info->update_ptr_increments(ptr_increments);
}
expanded_loop_info->update_finalization_offsets(updated_finalization_offsets);
}
}

void RuntimeConfigurator::update_buffer_scratchpad_size(const std::shared_ptr<lowered::LinearIR>& linear_ir) const {
const auto& loop_manager = linear_ir->get_loop_manager();
m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size();
void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIR& linear_ir) const {
const auto& loop_manager = linear_ir.get_loop_manager();
m_config->buffer_scratchpad_size = linear_ir.get_static_buffer_scratchpad_size();

for (const auto& p : m_dynamic_buffer_clusters) {
const auto& cluster_id = p.first;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,63 +15,24 @@ namespace intel_cpu {
// Seeds the base configurator with a CPU-specific config so CPU-only fields
// (e.g. loop_args in CPURuntimeConfig) are available to the update methods below.
CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared<CPURuntimeConfig>()) {
}

void CPURuntimeConfigurator::update(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) {
if (linear_ir->is_dynamic()) {
const auto& loop_manager = linear_ir->get_loop_manager();
update_loop_info(linear_ir);
update_loop_args(loop_manager);
// Update Brgemm should be before `update_buffer_scratchpad_size`
// because `ComputeAllocationSize` depends on subtensors which are updated in `update_brgemms`
update_brgemms(loop_manager);
update_buffer_scratchpad_size(linear_ir);
get_kernel_executor_table()->update_state();
}

m_config->master_shape = linear_ir->get_master_shape();

update_data_offsets();
update_latest_shapes();
}

void CPURuntimeConfigurator::initialization(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) {
RuntimeConfigurator::initialization(linear_ir);

for (const auto& expr : *linear_ir) {
if (ov::is_type<ov::intel_cpu::BrgemmCPU>(expr->get_node())) {
const auto& in0_desc = expr->get_input_port_descriptor(0);
const auto& in1_desc = expr->get_input_port_descriptor(1);
const auto& out_desc = expr->get_output_port_descriptor(0);

const auto& in0_subtensor = in0_desc->get_subtensor();
const auto& in1_subtensor = in1_desc->get_subtensor();
const auto& out_subtensor = out_desc->get_subtensor();
/**
 * @brief CPU-specific config update: runs the common update first, then, for dynamic IRs,
 *        refreshes all registered kernel executors and re-serializes loop arguments.
 *        Per-operation updates (e.g. Brgemm subtensors) now live in the executors'
 *        update_config, invoked via KernelExecutorTable::update_state.
 * @param linear_ir lowered LinearIR with up-to-date runtime shapes
 */
void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIR& linear_ir) {
    // Common part: loop/buffer info (dynamic case), master shape, data offsets, latest shapes
    RuntimeConfigurator::update(linear_ir);

    if (linear_ir.is_dynamic()) {
        // Executors are updated before loop args are snapshotted into the CPU config
        get_kernel_executor_table()->update_state(linear_ir);
        update_loop_args(linear_ir);
    }
}

void CPURuntimeConfigurator::init_tensor_rank(const std::shared_ptr<ov::snippets::lowered::LinearIR>& linear_ir) const {
m_config->tensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D);
void CPURuntimeConfigurator::init_tensor_rank(const ov::snippets::lowered::LinearIR& linear_ir) const {
m_config->tensor_rank = std::max(linear_ir.get_master_shape().size(), rank6D);
}

void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const {
void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LinearIR& linear_ir) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig");

const auto& loop_map = loop_manager->get_map();
const auto& loop_map = linear_ir.get_loop_manager()->get_map();
cpu_config->loop_args.resize(loop_map.size());
for (const auto& loop : loop_map) {
const auto& idx = loop.first;
Expand All @@ -90,18 +51,5 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LoopM
}
}

// Propagates the current M-block size (work amount of the outermost loop) into the
// M-dimension of each dynamic Brgemm's input/output subtensors.
// NOTE(review): this function is removed by this commit — its responsibility moved to
// KernelExecutor::update_config (see commit title).
void CPURuntimeConfigurator::update_brgemms(const ov::snippets::lowered::LoopManagerPtr& loop_manager) const {
for (const auto& brgemm_expr : m_dynamic_brgemms) {
const auto& loop_ids = brgemm_expr->get_loop_ids();
// A Brgemm with a dynamic M must be enclosed in at least one loop to know its block size
OPENVINO_ASSERT(!loop_ids.empty(), "Dynamic Brgemm must be in loops");
// TODO [146125]: Loop by M is first one in `loop_ids`
const auto& expanded_loop_info = loop_manager->get_loop_info<snippets::lowered::ExpandedLoopInfo>(loop_ids.front());
const auto& block_size_m = expanded_loop_info->get_work_amount();

// Dim index 1 counted from the innermost is the M dimension of the subtensor
brgemm_expr->get_input_port_descriptor(0)->set_subtensor_dim(1, block_size_m);
brgemm_expr->get_output_port_descriptor(0)->set_subtensor_dim(1, block_size_m);
}
}

} // namespace intel_cpu
} // namespace ov
Loading

0 comments on commit 7b0504c

Please sign in to comment.