diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 9174d93eea3f98..0231df8791caf3 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -291,6 +291,10 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha dim_step *= shape[i + 1]; offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0; } + // TODO: remove this hardcode + if (!std::getenv("REFERENCE") && i == 1) + offsets[3] = 2048 * 2; + std::cout << "offsets[" << i << "] = " << ov::PartialShape(offsets) << std::endl; if (!layout.empty()) { std::vector reordered_offsets(offsets.size()); const auto is_input = i < m_in_num; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index dcf7ca6eebad64..104c18ab145f96 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -6,6 +6,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "emitters/plugin/x64/utils.hpp" +#include "nodes/common/cpu_memcpy.h" #include "transformations/snippets/x64/op/brgemm_utils.hpp" #define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) @@ -233,6 +234,9 @@ void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_ spill.postamble(); } +uintptr_t base_addr_src = 0; +uintptr_t base_addr_dst = 0; + void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, const void* comp, size_t N, size_t K) { auto ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); ctx.current_N_blk = N; @@ -244,8 +248,37 @@ void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, cons ctx.current_K_start = 0; ctx.current_K_iters = K; + if (base_addr_src == 0) + base_addr_src = reinterpret_cast(src); + else + std::cout << "Stride from base_addr_src = " << reinterpret_cast(src) - base_addr_src << std::endl; + if (base_addr_dst == 0) + base_addr_dst = reinterpret_cast(dst); + else + std::cout << "Stride from base_addr_dst = " << reinterpret_cast(dst) - base_addr_dst << std::endl; + OV_CPU_JIT_EMITTER_ASSERT(kernel, "Kernel hasn't been created"); - (*kernel)(&ctx); + if (std::getenv("REFERENCE")) { + (*kernel)(&ctx); + std::cout << "Ref Repacked, KN = " << K * N << std::endl; + const auto* data = reinterpret_cast(dst); + for (size_t i = 0; i < K * N; ++i) { + std::cout << static_cast(data[i]) << "\t"; + } + std::cout << "\n"; + } else { + auto srcPtr = static_cast(src); + auto dstPtr = const_cast(static_cast(dst)); + + auto copySize = K * N * sizeof(bfloat16); + cpu_memcpy(dstPtr, srcPtr, copySize); + std::cout << "Just copy, KN = " << K * N << std::endl; + const auto* data = reinterpret_cast(dst); + for (size_t i = 0; i < K * N; ++i) { + std::cout << static_cast(data[i]) << "\t"; + } + std::cout << "\n"; + } } BrgemmCopyBKernelExecutor::BrgemmCopyBKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmCopyBKernelConfig config) diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9b521cdb3b57c7..7257e31369bd66 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -17,13 +17,7 @@ #include #include -#include "convert.h" #include "cpu/x64/cpu_isa_traits.hpp" -#include "nodes/common/cpu_convert.h" -#include "nodes/common/cpu_memcpy.h" -#include "nodes/common/reorder_prim.h" -#include "openvino/core/parallel.hpp" -#include "shape_inference/shape_inference_pass_through.hpp" #include "utils/precision_support.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/transpose_list.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 5003deabc0bd40..bee62fd4e0b548 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -3,6 +3,9 @@ // #include "subgraph.h" +#include "nodes/reorder.h" +#include "nodes/common/reorder_prim.h" +#include "memory_desc/cpu_memory_desc_utils.h" #include "common/primitive_hashing_utils.hpp" #include "dnnl_extension_utils.h" #include "onednn/dnnl.h" @@ -756,6 +759,36 @@ void Subgraph::optimizeIR() { void Subgraph::prepareParams() { const auto& cache = context->getParamsCache(); + const auto& input_shape = getSrcMemoryAtPort(0)->getDescPtr()->getShape().getStaticDims(); + const auto& b_shape = getSrcMemoryAtPort(1)->getDescPtr()->getShape().getStaticDims(); + + // Note: this code was tested only on static shapes, in case of dynamic M will most likely fail + const auto M = DnnlExtensionUtils::convertToDnnlDim(*++input_shape.rbegin()); + const auto K = DnnlExtensionUtils::convertToDnnlDim(*input_shape.rbegin()); + const auto N = DnnlExtensionUtils::convertToDnnlDim(*b_shape.rbegin()); + const auto B_2 = DnnlExtensionUtils::convertToDnnlDim(*++b_shape.begin()); + + auto get_wei_desc = [&]() { + const auto inputDesc = dnnl::memory::desc({1, M, K}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::abc); + // Notes: + // 1. "Any" layout must be set to enable weights layout selection heuristics + // 2. Shape must be in NK order (even if the original shape is in KN order) + const auto BDesc = dnnl::memory::desc({B_2, K, N}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::any); + const auto outputDesc = dnnl::memory::desc({B_2, M, N}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::abc); + + // Hack: we create inner product primitive just to know which weights layout was chosen by OneDNN heuristics + // Then, this layout is used in Snippets implementation + auto mm_desc = dnnl::matmul::primitive_desc(getEngine(), inputDesc, BDesc, outputDesc); + // Note: based on weights layout, it is necessary to set N block sizes inside Snippets. + // Example: in case of "AB16b32a" layout, N_block must be 32. K_block can be any + std::cout << "[ INFO ] matmul primitive selected the following B layout for BF16: " + << DnnlExtensionUtils::makeDescriptor(mm_desc.weights_desc())->serializeFormat() << std::endl; + return DnnlExtensionUtils::makeDescriptor(mm_desc.weights_desc()); + }; + + // auto reorder = ov::intel_cpu::getReorderPrim(context->getParamsCache(), getEngine(), originalMemDesc->getDnnlDesc(), get_wei_desc()); + requested_desc_b = get_wei_desc(); + auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { const auto& snippet = subgraph_attrs->snippet; @@ -843,6 +876,29 @@ bool Subgraph::created() const { void Subgraph::execute(dnnl::stream strm) { OPENVINO_ASSERT(execPtr, "Can't execute Subgraph node. Primitive didn't created"); + if (requested_desc_b) { + auto repacked_memory = std::make_shared(getEngine(), requested_desc_b); + repacked_memory->load(*srcMemPtrs[1]); + if (!std::getenv("REFERENCE")) + srcMemPtrs[1] = repacked_memory; + + // TODO: remove + const auto& input_shape = getSrcMemoryAtPort(0)->getDescPtr()->getShape().getStaticDims(); + const auto& b_shape = getSrcMemoryAtPort(1)->getDescPtr()->getShape().getStaticDims(); + const auto K = DnnlExtensionUtils::convertToDnnlDim(*input_shape.rbegin()); + const auto N = DnnlExtensionUtils::convertToDnnlDim(*b_shape.rbegin()); + auto* data = repacked_memory->getDataAs(); + std::cout << "Repacked, KN = " << K * N << std::endl; + auto upper_bound = repacked_memory->getSize(); + for (decltype(upper_bound) i = 0; i < upper_bound; ++i) { + std::cout << static_cast(data[i]) << "\t"; + if (static_cast(data[i]) == 5.21875f) { + // std::cout << "Stride is found: " << i << std::endl; + upper_bound = i + K * N; + } + } + std::cout << "\n"; + } execPtr->exec(srcMemPtrs, dstMemPtrs); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index ffd7944c59d48a..c6bcc2c2fb8480 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -105,6 +105,8 @@ class Subgraph : public Node { mutable std::vector in_shapes; std::shared_ptr execPtr = nullptr; + + ov::intel_cpu::MemoryDescPtr requested_desc_b; }; class Subgraph::SubgraphCodeGenerator {