diff --git a/src/common/snippets/src/utils/loop_utils.cpp b/src/common/snippets/src/utils/loop_utils.cpp
index dabd129fce451d..4402b75d3512c5 100644
--- a/src/common/snippets/src/utils/loop_utils.cpp
+++ b/src/common/snippets/src/utils/loop_utils.cpp
@@ -62,17 +62,34 @@ inline void init_work_amount(const LoopInfoPtr& loop_info) {
 } // namespace
 
 void update_data_pointer_shifts(const UnifiedLoopInfoPtr& loop_info) {
+    static size_t loop_id = 0;
     OPENVINO_ASSERT(loop_info != nullptr, "UnifiedLoopInfo is nullptr, nothing to update");
     const auto work_amount = loop_info->get_work_amount();
     const auto input_count = loop_info->get_input_count();
     const auto output_count = loop_info->get_output_count();
-
-    auto update_shifts = [&work_amount, &input_count, &output_count](LoopPort& loop_port, UnifiedLoopInfo::LoopPortDesc& ptr_shifts_params) {
+    size_t idx = 0;
+    // if (loop_id < 3)
+    std::cout << "update_shifts is called for loop_id " << loop_id << std::endl;
+    auto update_shifts = [&](LoopPort& loop_port, UnifiedLoopInfo::LoopPortDesc& ptr_shifts_params) {
         ptr_shifts_params.ptr_increment = get_ptr_increment(loop_port, work_amount,
                                                             loop_port.expr_port->get_type() == ExpressionPort::Input ? input_count : output_count);
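+        // NOTE: the overrides below are a temporary experiment and assume one
+        // fixed blocking scheme instead of deriving it from the IR: loop_id
+        // 0/4 are assumed to be the K-blocking loops and loop_id 1/5 the
+        // N-blocking loops, idx == 1 is assumed to be the weights-side port,
+        // and 32 matches the inner N block size hard-coded as LDB in
+        // BrgemmKernelExecutor::update_config() below.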
+        // Loop by K
+        if ((loop_id == 0 || loop_id == 4) && idx == 1) {
+            ptr_shifts_params.ptr_increment = 32;  // increment = inner_N_block size
+        }
+        // Loop by N
+        if ((loop_id == 1 || loop_id == 5) && idx == 1) {
+            // increment = K dimension rounded by K block
+            ptr_shifts_params.ptr_increment = *++loop_port.expr_port->get_descriptor_ptr()->get_shape().rbegin();
+        }
         ptr_shifts_params.finalization_offset = get_finalization_offset(work_amount, ptr_shifts_params.ptr_increment);
+        // if (loop_id < 3)
+        std::cout << "\t ptr_increment[" << idx << "]=" << ptr_shifts_params.ptr_increment
+                  << ", finalization_offset[" << idx << "]=" << ptr_shifts_params.finalization_offset << std::endl;
+        idx++;
     };
     loop_info->iterate_through_infos(update_shifts);
+    loop_id++;
 }
 
 void update_runtime_parameters(const UnifiedLoopInfoPtr& loop_info) {
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp
index e46de866990005..5c14f8e6dbf940 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp
@@ -287,7 +287,8 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression
     // In case of data repacking LDB is chosen in accordance with repacking buffer size
     if (with_repacking(brgemm_node->get_type()))
         LDB = brgemm_utils::repacking::compute_out_leading_dim(N, brgemm_node->get_input_element_type(1));
-
+    // hack to imitate blocking layout
+    LDB = 32;
     config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta);
 }
 
diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp
index fa273ac3d6c3ff..f65b57ed2b7626 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp
@@ -26,6 +26,9 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc,
     const auto& eng = context->getEngine();
     const auto& format = dstWeightDesc->serializeFormat();
 
+    std::cout << "[ INFO ] prepareWeightsMemory info\n";
+    std::cout << "Format: from " << srcWeightDesc->serializeFormat() << " to " << dstWeightDesc->serializeFormat() << std::endl;
+
     const auto privateWeightCache = context->getPrivateWeighCache();
     OPENVINO_ASSERT(privateWeightCache, "privateWeightCache is nullptr");
     if (privateWeightCache) {
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
index 5003deabc0bd40..f44bdcca266499 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -40,6 +40,10 @@
 #include "transformations/snippets/x64/shape_inference.hpp"
 #endif
 
+#include "memory_desc/dnnl_blocked_memory_desc.h"
+#include "memory_desc/cpu_memory_desc_utils.h"
+#include "nodes/executors/dnnl/dnnl_utils.hpp"
+
 #include "utils/cpu_utils.hpp"
 #include "utils/ngraph_utils.hpp"
 
@@ -753,8 +757,51 @@ void Subgraph::optimizeIR() {
                            control_flow_config, control_flow_passes);
 }
 
+DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc,
+                                                 const DnnlMemoryDescPtr dstDesc,
+                                                 bool weightsNonTransposed) {
+    if (!weightsNonTransposed)
+        return srcDesc;
+
+    const auto& weiDesc = srcDesc->getDnnlDesc();
+    const auto reorderedWeiDesc =
+        dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba};
+    const auto transposedWeiDesc = DnnlExtensionUtils::makeDescriptor(reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()));
+    return transposedWeiDesc;
+}
+
 void Subgraph::prepareParams() {
     const auto& cache = context->getParamsCache();
+    const auto& input_shape = getSrcMemoryAtPort(0)->getDescPtr()->getShape().getStaticDims();
+    const auto& weights_shape = getSrcMemoryAtPort(1)->getDescPtr()->getShape().getStaticDims();
+
+    const auto M = DnnlExtensionUtils::convertToDnnlDim(*++input_shape.rbegin());
+    const auto K = DnnlExtensionUtils::convertToDnnlDim(*input_shape.rbegin());
+    const auto N = DnnlExtensionUtils::convertToDnnlDim(*weights_shape.rbegin());
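+    // NOTE: the inner_product primitive_desc below is never executed; with
+    // format_tag::any for the weights it is created only to query the blocked
+    // weight layout oneDNN would choose for this shape (f32 precision and a
+    // 2D [M, K] x [K, N] problem are assumed throughout).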
+    auto get_wei_desc = [&]() {
+        const auto inputDesc = dnnl::memory::desc({M, K}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
+        const auto weightsDesc = dnnl::memory::desc({N, K}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::any);
+        const auto biasDesc = dnnl::memory::desc();
+        const auto outputDesc = dnnl::memory::desc({M, N}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
+
+        auto fc_desc = dnnl::inner_product_forward::primitive_desc(context->getEngine(),
+                                                                   dnnl::prop_kind::forward_inference,
+                                                                   inputDesc,
+                                                                   weightsDesc,
+                                                                   biasDesc,
+                                                                   outputDesc);
+        auto weiDesc = DnnlExtensionUtils::makeDescriptor(fc_desc.weights_desc());
+        return weiDesc;
+    };
+    auto prepareWeightsMemory = [&]() {
+        const auto memory = getSrcMemoryAtPort(1);
+        auto originalMemDesc = DnnlExtensionUtils::makeDescriptor(dnnl::memory::desc({N, K}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab));
+        const auto blocked_desc = get_wei_desc();
+        originalMemDesc = makeTransposedWeightDescriptor(originalMemDesc, blocked_desc, true);
+        const auto exec_context = std::make_shared<ExecutorContext>(context, std::vector<impl_desc_type>{}, privateWeightCache);
+        srcMemPtrs[1] = utils::prepareWeightsMemory(originalMemDesc, blocked_desc, memory, exec_context, true);
+    };
+    prepareWeightsMemory();
 
     auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr<SubgraphExecutor> {
         const auto& snippet = subgraph_attrs->snippet;
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 5982d784cbc4d3..c94f44abde5f0d 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -877,6 +877,9 @@ void Transformations::PostLpt() {
 }
 
 void Transformations::MainSnippets(void) {
+    if (std::getenv("REFERENCE")) {
+        return;
+    }
     auto is_supported_isa = [](){
 #if defined(OPENVINO_ARCH_X86_64)
         return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
index b0e8d58da2f0b2..240d747d88306a 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -149,6 +149,26 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulEltwiseChainCascade, MatMulEltwise
                          ::testing::Values(ov::test::utils::DEVICE_CPU)),
                          MatMul::getTestCaseName);
 
+const size_t M = std::atoi(std::getenv("M"));
+const size_t K = std::atoi(std::getenv("K"));
+const size_t N = std::atoi(std::getenv("N"));
+
+std::vector<std::vector<ov::test::InputShape>> fc_input_shapes{
+    {
+        {PartialShape{}, {{1, 1, M, K}}},
+        {{}, {{K, N}}}
+    },
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FullyConnected, MatMul,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(fc_input_shapes),
+                             ::testing::ValuesIn(precisions(false)),
+                             ::testing::Values(MatMulType::FullyConnected),
+                             ::testing::Values(1), // MatMul
+                             ::testing::Values(1), // Tokenized MatMul
+                             ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         MatMul::getTestCaseName);
 const auto& transpose_b_shapes = STATIC_SHAPES(
     {{3, 3, 64, 64}, {3, 3, 64, 64}},
     {{1, 1, 32, 128}, {1, 1, 64, 128}},
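-- 
The M/K/N test dimensions above are read with std::atoi(std::getenv(...)), so
all three environment variables must be set before the test binary starts.
Usage sketch (the binary path is an assumption; REFERENCE/M/K/N come from the
patch above, with REFERENCE=1 disabling Snippets tokenization for comparison):
    M=32 K=64 N=64 ./bin/intel64/Release/ov_cpu_func_tests --gtest_filter='*smoke_Snippets_FullyConnected*'
    REFERENCE=1 M=32 K=64 N=64 ./bin/intel64/Release/ov_cpu_func_tests --gtest_filter='*smoke_Snippets_FullyConnected*'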