From dbd8ad72886a3758c486ab3eef3433b55de504c7 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 2 Aug 2024 12:45:12 +0200 Subject: [PATCH] Moved all the logic in ParallelWAOptimizer class --- .../include/snippets/runtime_configurator.hpp | 19 +- .../snippets/src/runtime_configurator.cpp | 60 +++-- .../snippets/cpu_runtime_configurator.cpp | 231 ++++++++++-------- .../snippets/cpu_runtime_configurator.hpp | 33 ++- 4 files changed, 196 insertions(+), 147 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 058eca59716d1b..cb9233ef0a89ef 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -5,6 +5,7 @@ #pragma once #include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_info.hpp" #include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/pass/pass.hpp" @@ -96,11 +97,20 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ virtual void init_tensor_rank(const lowered::LinearIRPtr& linear_ir) const; + + struct UnifiedLoopInfoRtParams { + size_t work_amount = 0; + std::vector ptr_increments; + std::vector finalization_offsets; + }; + using LoopInfoRuntimeParamsMap = std::unordered_map; /** * @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo * @param linear_ir LinearIR + * @param initializated_info_map Reference on a map [LoopInfo->RuntimeParams]. + * Can be used to pass in the method loop infos which were already initialized, e.g. 
by parallel domain optimization */ - void update_loop_info(const lowered::LinearIRPtr& linear_ir) const; + void update_loop_info(const lowered::LinearIRPtr& linear_ir, LoopInfoRuntimeParamsMap& initializated_info_map) const; /** * @brief Update Buffer scratchpad size and offsets if needed * Note: `update_loop_info` must be called before @@ -110,12 +120,17 @@ class RuntimeConfigurator { /** * @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig */ - void update_data_offsets() const; + void update_data_offsets(const std::vector& shapes = {}, + const std::vector>& layouts = {}) const; /** * @brief Update latest input shapes */ void update_latest_shapes(); + static void init_data_ptr_shifts(const lowered::UnifiedLoopInfoPtr& unified_loop_info, + std::vector& ptr_increments, + std::vector& finalization_offsets); + std::shared_ptr m_config = nullptr; size_t m_io_num = 0; diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 515a8ba3a7484f..e241563b40f6a2 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -13,23 +13,6 @@ namespace ov { namespace snippets { -namespace { -void init_data_ptr_shifts(const lowered::UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, - std::vector& finalization_offsets) { - const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); - ptr_increments.resize(count); - finalization_offsets.resize(count); - - size_t idx = 0; - unified_loop_info->iterate_through_descs( - [&ptr_increments, &finalization_offsets, &idx](const lowered::UnifiedLoopInfo::LoopPortDesc& desc) { - ptr_increments[idx] = desc.ptr_increment; - finalization_offsets[idx] = desc.finalization_offset; - ++idx; - }); -} -} // namespace - RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr c) : m_config(std::move(c)) { OPENVINO_ASSERT(m_config, "Runtime config is 
nullptr!"); @@ -57,7 +40,8 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRPtr& linear_ir) void RuntimeConfigurator::update(const lowered::LinearIRPtr& linear_ir) { if (linear_ir->is_dynamic()) { - update_loop_info(linear_ir); + LoopInfoRuntimeParamsMap initialized_info; + update_loop_info(linear_ir, initialized_info); update_buffer_scratchpad_size(linear_ir); } @@ -143,15 +127,8 @@ void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRPtr& linear_ir m_dynamic_buffer_clusters = std::move(dynamic_buffer_clusters); } -void RuntimeConfigurator::update_loop_info(const lowered::LinearIRPtr& linear_ir) const { - // Initialized UnifiedLoopInfo - struct CurrentUnifiedLoopInfo { - size_t current_work_amount = 0; - std::vector ptr_increments; - std::vector finalization_offsets; - }; - std::unordered_map initializated_info_map; - +void RuntimeConfigurator::update_loop_info(const lowered::LinearIRPtr& linear_ir, + LoopInfoRuntimeParamsMap& initializated_info_map) const { const auto& loop_map = linear_ir->get_loop_manager()->get_map(); for (const auto& p : loop_map) { const auto& expanded_loop_info = ov::as_type_ptr(p.second); @@ -163,12 +140,12 @@ void RuntimeConfigurator::update_loop_info(const lowered::LinearIRPtr& linear_ir auto& current_info = initializated_info_map[current_unified_loop_info]; lowered::pass::InitLoops::update_runtime_parameters(current_unified_loop_info); - current_info.current_work_amount = current_unified_loop_info->get_work_amount(); + current_info.work_amount = current_unified_loop_info->get_work_amount(); init_data_ptr_shifts(current_unified_loop_info, current_info.ptr_increments, current_info.finalization_offsets); } auto& initializated_info = initializated_info_map.at(current_unified_loop_info); - auto& current_work_amount = initializated_info.current_work_amount; + auto& current_work_amount = initializated_info.work_amount; const auto& ptr_increments = initializated_info.ptr_increments; const auto& 
finalization_offsets = initializated_info.finalization_offsets; @@ -226,7 +203,10 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRP OPENVINO_ASSERT(!utils::is_dynamic_value(m_config->buffer_scratchpad_size), "Buffer scratchpad size must be defined!"); } -void RuntimeConfigurator::update_data_offsets() const { +void RuntimeConfigurator::update_data_offsets(const std::vector& shapes, + const std::vector>& layouts) const { + OPENVINO_ASSERT(shapes.empty() || shapes.size() == m_io_num); + OPENVINO_ASSERT(layouts.empty() || layouts.size() == m_io_num); for (size_t i = 0; i < m_io_num; ++i) { // offsets represent distance between consecutive elements of corresponding dimension. // If a dim size == 1, then the next dim starts immediately and the stride is 0 @@ -236,11 +216,11 @@ void RuntimeConfigurator::update_data_offsets() const { // case 2: // shape: s0, s1, s2 == 1, s3 // offsets: s1*s3, s3, 0, 1 - const auto& shape = m_io_descs[i]->get_shape(); + const auto& shape = shapes.empty() ? m_io_descs[i]->get_shape() : shapes[i]; if (shape == m_latest_shapes[i]) continue; - const auto& layout = m_io_descs[i]->get_layout(); + const auto& layout = layouts.empty() ? 
m_io_descs[i]->get_layout() : layouts[i]; auto& offsets = m_config->io_data_offsets[i]; offsets.resize(m_config->tensor_rank); @@ -276,5 +256,21 @@ void RuntimeConfigurator::update_latest_shapes() { } } +void RuntimeConfigurator::init_data_ptr_shifts(const lowered::UnifiedLoopInfoPtr& unified_loop_info, + std::vector& ptr_increments, + std::vector& finalization_offsets) { + const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); + ptr_increments.resize(count); + finalization_offsets.resize(count); + + size_t idx = 0; + unified_loop_info->iterate_through_descs( + [&ptr_increments, &finalization_offsets, &idx](const lowered::UnifiedLoopInfo::LoopPortDesc& desc) { + ptr_increments[idx] = desc.ptr_increment; + finalization_offsets[idx] = desc.finalization_offset; + ++idx; + }); +} + } // namespace snippets } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 4fd8755d2478aa..2c96431a99e2ac 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,22 +4,36 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/pass/init_loops.hpp" #include "snippets/pass/split_dimension_m.hpp" - +#include "snippets/utils/utils.hpp" namespace ov { namespace intel_cpu { +using namespace ov::snippets::lowered; +using namespace ov::snippets::pass; + CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared()) { } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); + + ov::snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; + std::vector 
updated_shapes; + std::vector> updated_layouts; + bool optimize_work_amount = m_optimizer.need_optimize(m_config->master_shape); + if (optimize_work_amount) { + m_optimizer.update_split_loops_info(initialized_info); + m_optimizer.update_shapes(m_io_descs, updated_shapes, m_in_num); + m_optimizer.update_layouts(m_io_descs, updated_layouts, m_in_num); + m_optimizer.update_config(m_config); + } + if (linear_ir->is_dynamic()) { - parallel_work_amount_optimization(linear_ir); - update_loop_info(linear_ir); + update_loop_info(linear_ir, initialized_info); update_loop_args(linear_ir); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table @@ -27,94 +41,62 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRPtr& li update_buffer_scratchpad_size(linear_ir); } - update_data_offsets(); - // Original shapes must be restored after offsets computation - // in order to avoid the incorrect split shape propagation through the LIR in next shape inference - if (!m_original_layouts.empty() || !m_original_shapes.empty()) { - for (size_t i = 0; i < m_io_descs.size(); ++i) { - m_io_descs[i]->set_layout(m_original_layouts[i]); - m_io_descs[i]->set_shape(m_original_shapes[i]); - } + update_data_offsets(updated_shapes, updated_layouts); + // TODO: unify this logic somehow? 
+ if (!optimize_work_amount) { + update_latest_shapes(); + } else { + m_latest_shapes = std::move(updated_shapes); } - update_latest_shapes(); } -void CPURuntimeConfigurator::parallel_work_amount_optimization(const ov::snippets::lowered::LinearIRPtr& linear_ir) { - using namespace ov::snippets::pass; - if (m_loops_to_split.empty()) - return; +void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRPtr& linear_ir) { + RuntimeConfigurator::initialization(linear_ir); + m_optimizer.init(linear_ir); +} - const auto optimal_wa = linear_ir->get_config().m_min_parallel_work_amount; - size_t m_batch_m, m_new_m; - if (!SplitDimensionM::split(m_config->master_shape, optimal_wa, m_batch_m, m_new_m)) - return; +void CPURuntimeConfigurator::init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { + m_config->tensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); +} - // WA: const work amount allows prohibit to change work amount of loops which were affected by split_dim_m logic - for (const auto& loop : m_loops_to_split) { - loop->set_work_amount_const(true); - loop->set_work_amount(m_new_m); - } +void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { + const auto& cpu_config = ov::as_type_ptr(m_config); + OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); + + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + cpu_config->loop_args.resize(loop_map.size()); + for (const auto& loop : loop_map) { + const auto& idx = loop.first; + const auto& loop_info = ov::as_type_ptr(loop.second); + OPENVINO_ASSERT(loop_info, "CPURuntimeConfigurator expects ExpandedLoopInfo in loop manager"); + + const auto& increment = loop_info->get_increment(); + const auto& data_sizes = loop_info->get_data_sizes(); - auto transformed_to_idces = [&]() { - std::set transformed_res; - size_t i = 0; - for (const auto& param : linear_ir->get_parameters()) { - 
if (m_not_m_related_params.count(param)) - transformed_res.insert(i); - i++; + auto& loop_arg = cpu_config->loop_args[idx]; + loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), loop_info->get_ptr_increments(), loop_info->get_finalization_offsets()); + for (int64_t i = 0; i < loop_arg.m_num_data_ptrs; ++i) { + loop_arg.m_ptr_increments[i] *= (increment * data_sizes[i]); + loop_arg.m_finalization_offsets[i] *= data_sizes[i]; } - return transformed_res; - }(); - - m_original_shapes.resize(m_io_descs.size()); - m_original_layouts.resize(m_io_descs.size()); - for (size_t i = 0; i < m_in_num; ++i) { - const auto& desc = m_io_descs[i]; - m_original_shapes[i] = m_io_descs[i]->get_shape(); - m_original_layouts[i] = m_io_descs[i]->get_layout(); - - const auto dim_idx = ov::snippets::utils::get_input_dim_idx(desc->get_layout(), 1); - const auto new_shape = transformed_to_idces.count(i) == 0 - ? SplitDimensionM::reshape_m_dim(desc->get_shape(), dim_idx, m_batch_m, m_new_m) - : SplitDimensionM::unsqueeze_m_dim(desc->get_shape(), dim_idx); - const auto new_layout = SplitDimensionM::get_updated_order(desc->get_layout(), dim_idx); - desc->set_shape(new_shape); - desc->set_layout(new_layout); } - for (size_t i = m_in_num; i < m_io_num; ++i) { - const auto& desc = m_io_descs[i]; - m_original_shapes[i] = m_io_descs[i]->get_shape(); - m_original_layouts[i] = m_io_descs[i]->get_layout(); - - const auto shape_dim_idx = i < m_io_descs.size() - 1 - ? 
ov::snippets::utils::get_input_dim_idx(m_original_layouts[i], 1) - : ov::snippets::utils::get_output_dim_idx(m_original_layouts[i], 1); - const auto new_shape = SplitDimensionM::reshape_m_dim(desc->get_shape(), shape_dim_idx, m_batch_m, m_new_m); - const auto new_layout = SplitDimensionM::get_updated_order(desc->get_layout(), desc->get_layout().size() - 2); - desc->set_shape(new_shape); - desc->set_layout(new_layout); - } - - *++m_config->master_shape.rbegin() = m_new_m; - m_config->master_shape.insert(m_config->master_shape.cbegin() + m_config->master_shape.size() - 2, m_batch_m); - m_config->tensor_rank = std::max(m_config->master_shape.size(), rank6D); } -namespace { -std::set find_not_m_related_params(const ov::snippets::lowered::LinearIRPtr& linear_ir) { +std::unordered_set CPURuntimeConfigurator::ParallelWAOptimizer::find_not_m_related_params( + const ov::snippets::lowered::LinearIRPtr& linear_ir) { using namespace ov::snippets::lowered; auto is_brgemm = [](const ExpressionPtr& expr) { return ov::is_type(expr->get_node()); }; - std::set brgemms; + std::unordered_set brgemms; auto brgemm_it = std::find_if(linear_ir->begin(), linear_ir->end(), is_brgemm); while (brgemm_it != linear_ir->end()) { brgemms.insert(*brgemm_it); brgemm_it = std::find_if(std::next(brgemm_it), linear_ir->end(), is_brgemm); } - std::set visited; - std::set res; + std::unordered_set visited; + std::unordered_set res; const auto& params = linear_ir->get_parameters(); for (const auto& brgemm : brgemms) { // Find all parameters which are placed on B Brgemm inputs: these params must be skipped @@ -125,7 +107,7 @@ std::set find_not_m_related_params(const o if (ov::is_type(curr_expr->get_node())) { auto found_param = std::find(params.begin(), params.end(), curr_expr); OPENVINO_ASSERT(found_param != params.end(), "find_param didn't found parameter for expr"); - res.insert(*found_param); + res.insert(std::distance(params.begin(), found_param)); continue; } @@ -140,25 +122,19 @@ std::set 
find_not_m_related_params(const o } return res; } -} // namespace - -void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRPtr& linear_ir) { - using namespace ov::snippets::lowered; - // TODO: run this code only on MHA subgraphs - RuntimeConfigurator::initialization(linear_ir); - if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) - return; +std::unordered_set CPURuntimeConfigurator::ParallelWAOptimizer::find_loops_to_split( + const ov::snippets::lowered::LinearIRPtr& linear_ir, + const std::unordered_set& params_to_skip) { + std::unordered_set loops_to_split; const auto& loop_manager = linear_ir->get_loop_manager(); - m_not_m_related_params = find_not_m_related_params(linear_ir); - OPENVINO_ASSERT(m_not_m_related_params.size() <= 2, "actual size: ", m_not_m_related_params.size()); - // The idea is to traverse LIR down from the M dimension related parameters // and find all the outermost loops: these loops will be split in runtime - std::set visited; + std::unordered_set visited; + size_t i = 0; for (const auto& param : linear_ir->get_parameters()) { // Ops after non related params mustn't be traversed - if (m_not_m_related_params.count(param)) + if (params_to_skip.count(i++)) continue; std::deque exprs{param}; @@ -169,7 +145,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI if (!loop_ids.empty()) { const auto outermost_loop_idx = loop_ids[0]; const auto loop_info_to_add = loop_manager->get_loop_info(outermost_loop_idx); - m_loops_to_split.insert(loop_info_to_add->get_unified_loop_info()); + loops_to_split.insert(loop_info_to_add->get_unified_loop_info()); } for (const auto& output_connector : curr_expr->get_output_port_connectors()) { @@ -183,35 +159,76 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI } } } - std::cout << "m_loops_to_split size = " << m_loops_to_split.size() << std::endl; + return loops_to_split; } -void 
CPURuntimeConfigurator::init_tensor_rank(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { - m_config->tensor_rank = std::max(linear_ir->get_master_shape().size(), rank6D); +bool CPURuntimeConfigurator::ParallelWAOptimizer::lir_with_brgemm(const ov::snippets::lowered::LinearIRPtr& linear_ir) { + auto found_it = std::find_if(linear_ir->begin(), linear_ir->end(), [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + return found_it != linear_ir->end(); } -void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const { - const auto& cpu_config = ov::as_type_ptr(m_config); - OPENVINO_ASSERT(cpu_config, "CPURuntimeConfigurator expects CPURuntimeConfig"); - - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); - cpu_config->loop_args.resize(loop_map.size()); - for (const auto& loop : loop_map) { - const auto& idx = loop.first; - const auto& loop_info = ov::as_type_ptr(loop.second); - OPENVINO_ASSERT(loop_info, "CPURuntimeConfigurator expects ExpandedLoopInfo in loop manager"); +void CPURuntimeConfigurator::ParallelWAOptimizer::init(const ov::snippets::lowered::LinearIRPtr& linear_ir) { + if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic() || !lir_with_brgemm(linear_ir) || + // Temporary workaround in order to avoid calling the optimization on separate matmuls in tests + linear_ir->get_parameters().size() <= 2) + return; + not_m_related_params = find_not_m_related_params(linear_ir); + loops_to_split = find_loops_to_split(linear_ir, not_m_related_params); + concurrency = linear_ir->get_config().m_min_parallel_work_amount; +} - const auto& increment = loop_info->get_increment(); - const auto& data_sizes = loop_info->get_data_sizes(); +bool CPURuntimeConfigurator::ParallelWAOptimizer::need_optimize(const ov::snippets::VectorDims& master_shape) { + return !loops_to_split.empty() && SplitDimensionM::split(master_shape, concurrency, batch_m, new_m); +} -
auto& loop_arg = cpu_config->loop_args[idx]; - loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), loop_info->get_ptr_increments(), loop_info->get_finalization_offsets()); - for (int64_t i = 0; i < loop_arg.m_num_data_ptrs; ++i) { - loop_arg.m_ptr_increments[i] *= (increment * data_sizes[i]); - loop_arg.m_finalization_offsets[i] *= data_sizes[i]; +void CPURuntimeConfigurator::ParallelWAOptimizer::update_split_loops_info( + ov::snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap& initialized_info) { + for (const auto& loop : loops_to_split) { + if (initialized_info.count(loop) == 0) { + loop->set_work_amount(new_m); + auto& current_info = initialized_info[loop]; + ov::snippets::lowered::pass::InitLoops::update_runtime_parameters(loop, false); + current_info.work_amount = loop->get_work_amount(); + init_data_ptr_shifts(loop, current_info.ptr_increments, current_info.finalization_offsets); } } } +void CPURuntimeConfigurator::ParallelWAOptimizer::update_config(const std::shared_ptr& config) { + *++config->master_shape.rbegin() = new_m; + config->master_shape.insert(config->master_shape.cbegin() + config->master_shape.size() - 2, batch_m); + config->tensor_rank = std::max(config->master_shape.size(), CPURuntimeConfigurator::rank6D); +} + +void CPURuntimeConfigurator::ParallelWAOptimizer::update_shapes( + const std::vector& io_descs, + std::vector& shapes, + size_t in_num) { + shapes.resize(io_descs.size()); + for (size_t i = 0; i < io_descs.size(); ++i) { + const auto& desc = io_descs[i]; + const auto dim_idx = i < in_num ? ov::snippets::utils::get_input_dim_idx(desc->get_layout(), 1) + : ov::snippets::utils::get_output_dim_idx(desc->get_layout(), 1); + shapes[i] = not_m_related_params.count(i) + ? 
SplitDimensionM::unsqueeze_m_dim(desc->get_shape(), dim_idx) + : SplitDimensionM::reshape_m_dim(desc->get_shape(), dim_idx, batch_m, new_m); + } +} + +void CPURuntimeConfigurator::ParallelWAOptimizer::update_layouts( + const std::vector& io_descs, + std::vector>& layouts, + size_t in_num) { + layouts.resize(io_descs.size()); + for (size_t i = 0; i < io_descs.size(); ++i) { + const auto& original_layout = io_descs[i]->get_layout(); + const auto dim_idx = + i < in_num ? ov::snippets::utils::get_input_dim_idx(original_layout, 1) : original_layout.size() - 2; + layouts[i] = SplitDimensionM::get_updated_order(original_layout, dim_idx); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 6bac9cb83b2759..3c4ff059ae2db0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -46,14 +46,35 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRPtr& linear_ir) const; - void parallel_work_amount_optimization(const ov::snippets::lowered::LinearIRPtr& linear_ir); + static const size_t rank6D = 6; - const size_t rank6D = 6; + class ParallelWAOptimizer { + public: + void init(const ov::snippets::lowered::LinearIRPtr& linear_ir); + bool need_optimize(const ov::snippets::VectorDims& master_shape); - std::set m_loops_to_split{}; - std::vector m_original_shapes{}; - std::vector m_original_layouts{}; - std::set m_not_m_related_params{}; + void update_split_loops_info(ov::snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap& map); + void update_shapes(const std::vector& io_descs, + std::vector& shapes, + size_t in_num); + void update_layouts(const std::vector& io_descs, + std::vector>& layouts, + size_t in_num); + void 
update_config(const std::shared_ptr& config); + + private: + bool lir_with_brgemm(const ov::snippets::lowered::LinearIRPtr& linear_ir); + static std::unordered_set find_not_m_related_params(const ov::snippets::lowered::LinearIRPtr& linear_ir); + static std::unordered_set find_loops_to_split( + const ov::snippets::lowered::LinearIRPtr& linear_ir, + const std::unordered_set& params_to_skip); + + std::unordered_set loops_to_split{}; + std::unordered_set not_m_related_params{}; + size_t concurrency = 0; + size_t batch_m = 0; + size_t new_m = 0; + } m_optimizer; }; } // namespace intel_cpu