Start to move repacking stage in executor
v-Golubev committed Oct 10, 2024
1 parent 7204352 commit 973ba7b
Showing 3 changed files with 44 additions and 33 deletions.
@@ -9,9 +9,20 @@
#include "snippets/lowered/port_descriptor.hpp"
#include "emitters/snippets/jit_snippets_call_args.hpp"

#include "memory_desc/cpu_memory_desc.h"

namespace ov {
namespace intel_cpu {

// class RequestedDesc {
// public:
// RequestedDesc() = default;
// RequestedDesc(DnnlMemoryDescPtr requested_desc);

// private:
// DnnlMemoryDescPtr m_requested_desc = nullptr;
// };

class CPURuntimeConfig : public ov::snippets::RuntimeConfig {
public:
OPENVINO_RTTI("CPURuntimeConfig", "0", ov::snippets::RuntimeConfig)
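The commented-out RequestedDesc stub above sketches a wrapper around the oneDNN descriptor an input should be repacked into. A plausible completion, assuming the header's existing includes; the accessor names are illustrative and not part of this commit:

// Hypothetical completion of the RequestedDesc stub; desc() and empty() are assumed names.
class RequestedDesc {
public:
    RequestedDesc() = default;
    explicit RequestedDesc(DnnlMemoryDescPtr requested_desc)
        : m_requested_desc(std::move(requested_desc)) {}

    const DnnlMemoryDescPtr& desc() const { return m_requested_desc; }
    // No repacking is requested while the descriptor is unset.
    bool empty() const { return m_requested_desc == nullptr; }

private:
    DnnlMemoryDescPtr m_requested_desc = nullptr;
};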
58 changes: 27 additions & 31 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -5,6 +5,7 @@

#include "nodes/reorder.h"
#include "nodes/common/reorder_prim.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "common/primitive_hashing_utils.hpp"
#include "dnnl_extension_utils.h"
@@ -81,7 +82,8 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor {
const BufferScratchpadAllocator& allocator)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {}

void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
void exec(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs, std::vector<MemoryPtr>& outMemPtrs) override {
repack_inputs(strm, inMemPtrs);
const auto& callable = m_schedule->get_callable<kernel>();

auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
@@ -129,7 +131,8 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor {
reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset();
}

void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
void exec(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs, std::vector<MemoryPtr>& outMemPtrs) override {
repack_inputs(strm, inMemPtrs);
const auto& callable = m_schedule->get_callable<dynamic_kernel>();

OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
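Both executors gain the dnnl::stream parameter because repacking now happens inside exec(): repack_inputs needs the stream's engine to allocate the repacked memory before the kernel is dispatched. A schematic of the shared flow, where run_kernel is a placeholder for the callable/parallel_for dispatch that follows in the real code:

// Schematic of the new exec() flow; run_kernel is a placeholder name, not a function in this diff.
void exec(dnnl::stream strm, std::vector<MemoryPtr>& in, std::vector<MemoryPtr>& out) override {
    repack_inputs(strm, in);  // may replace entries of `in` with repacked copies
    run_kernel(in, out);      // kernel dispatch, unchanged by this commit
}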
@@ -759,35 +762,15 @@ void Subgraph::optimizeIR() {
void Subgraph::prepareParams() {
const auto& cache = context->getParamsCache();

const auto& input_shape = getSrcMemoryAtPort(0)->getDescPtr()->getShape().getStaticDims();
const auto& b_shape = getSrcMemoryAtPort(1)->getDescPtr()->getShape().getStaticDims();

// Note: this code was tested only on static shapes, in case of dynamic M will most likely fail
const auto M = DnnlExtensionUtils::convertToDnnlDim(*++input_shape.rbegin());
const auto K = DnnlExtensionUtils::convertToDnnlDim(*input_shape.rbegin());
const auto N = DnnlExtensionUtils::convertToDnnlDim(*b_shape.rbegin());
const auto B_2 = DnnlExtensionUtils::convertToDnnlDim(*++b_shape.begin());

auto get_wei_desc = [&]() {
const auto inputDesc = dnnl::memory::desc({1, M, K}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::abc);
// Notes:
// 1. "Any" layout must be set to enable weights layout selection heuristics
// 2. Shape must be in NK order (even if the original shape is in KN order)
const auto BDesc = dnnl::memory::desc({B_2, K, N}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::any);
const auto outputDesc = dnnl::memory::desc({B_2, M, N}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::abc);

// Hack: we create inner product primitive just to know which weights layout was chosen by OneDNN heuristics
// Then, this layout is used in Snippets implementation
auto mm_desc = dnnl::matmul::primitive_desc(getEngine(), inputDesc, BDesc, outputDesc);
// Note: based on weights layout, it is necessary to set N block sizes inside Snippets.
// Example: in case of "AB16b32a" layout, N_block must be 32. K_block can be any
std::cout << "[ INFO ] matmul primitive selected the following B layout for BF16: "
<< DnnlExtensionUtils::makeDescriptor(mm_desc.weights_desc())->serializeFormat() << std::endl;
return DnnlExtensionUtils::makeDescriptor(mm_desc.weights_desc());
};
const auto& b_dims = getSrcMemoryAtPort(1)->getDescPtr()->getShape().getDims();
VectorDims normalized_dims(3, 1);
*normalized_dims.rbegin() = *b_dims.rbegin();
*++normalized_dims.rbegin() = *++b_dims.rbegin();
normalized_dims[0] = std::accumulate(b_dims.begin(), b_dims.end() - 2, static_cast<Dim>(1), std::multiplies<Dim>());

// auto reorder = ov::intel_cpu::getReorderPrim(context->getParamsCache(), getEngine(), originalMemDesc->getDnnlDesc(), get_wei_desc());
requested_desc_b = get_wei_desc();
requested_desc_b = std::make_shared<DnnlBlockedMemoryDesc>(Shape(normalized_dims),
dnnl::memory::data_type::bf16,
dnnl::memory::format_tag::aCB16b64c2b);

auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr<SubgraphExecutor> {
const auto& snippet = subgraph_attrs->snippet;
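The replacement drops the oneDNN layout-probing hack in favor of a fixed blocked layout: all leading (batch) dimensions of B are folded into one, and the descriptor requests the aCB16b64c2b tag (last two dims swapped and blocked 16x64, with an inner pair of b that matches bf16 VNNI packing). A standalone sketch of the dimension folding, using plain STL types instead of the plugin's VectorDims for illustration:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Fold an N-D shape into 3-D: keep the last two dims, multiply the rest into
// one batch dim; requires dims.size() >= 2. Mirrors the normalized_dims
// computation in prepareParams() above.
std::vector<int64_t> normalize_to_3d(const std::vector<int64_t>& dims) {
    std::vector<int64_t> out(3, 1);
    out[2] = dims[dims.size() - 1];
    out[1] = dims[dims.size() - 2];
    out[0] = std::accumulate(dims.begin(), dims.end() - 2, int64_t{1},
                             std::multiplies<int64_t>());
    return out;
}
// Example: normalize_to_3d({2, 4, 384, 64}) yields {8, 384, 64}.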
@@ -882,7 +865,7 @@ void Subgraph::execute(dnnl::stream strm) {
if (!std::getenv("REFERENCE"))
srcMemPtrs[1] = repacked_memory;
}
execPtr->exec(srcMemPtrs, dstMemPtrs);
execPtr->exec(strm, srcMemPtrs, dstMemPtrs);
}

void Subgraph::executeDynamicImpl(dnnl::stream strm) {
@@ -1006,6 +989,19 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function<void(jit_sni
});
}

void Subgraph::SubgraphExecutor::repack_inputs(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs) {
// TODO: remove check on empty
OPENVINO_ASSERT(m_requested_descs.empty() || inMemPtrs.size() == m_requested_descs.size());
for (size_t i = 0; i < m_requested_descs.size(); ++i) {
if (m_requested_descs[i]) {
auto repacked_memory = std::make_shared<Memory>(strm.get_engine(), m_requested_descs[i]);
repacked_memory->load(*inMemPtrs[i]);
if (!std::getenv("REFERENCE"))
inMemPtrs[i] = repacked_memory;
}
}
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
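repack_inputs allocates a Memory with the requested descriptor and fills it via Memory::load, which copies and reorders the data into the new layout; the input pointer is then swapped unless the REFERENCE environment variable is set, a temporary switch that keeps the original memory so the repacked path can be checked against a reference run. How m_requested_descs is populated is not shown in this commit; one plausible wiring in the executor constructor, purely illustrative:

// Hypothetical wiring (not part of this commit): request repacking only for
// input 1, the B matrix. `num_inputs` and the way requested_desc_b reaches
// the executor are assumptions.
m_requested_descs.assign(num_inputs, nullptr);
m_requested_descs[1] = requested_desc_b;  // descriptor built in Subgraph::prepareParams()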
8 changes: 6 additions & 2 deletions src/plugins/intel_cpu/src/nodes/subgraph.h
@@ -106,7 +106,7 @@ class Subgraph : public Node {

std::shared_ptr<SubgraphExecutor> execPtr = nullptr;

ov::intel_cpu::MemoryDescPtr requested_desc_b;
MemoryDescPtr requested_desc_b;
};

class Subgraph::SubgraphCodeGenerator {
@@ -131,7 +131,7 @@ class Subgraph::SubgraphExecutor {
const BufferScratchpadAllocator& allocator);
virtual ~SubgraphExecutor() = default;

virtual void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;
virtual void exec(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs, std::vector<MemoryPtr>& outMemPtrs) = 0;

protected:
void parallel_for6d(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
@@ -144,6 +144,10 @@
scratchpad_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + ithr * m_buffer_scratchpad_size;
}

void repack_inputs(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs);

std::vector<MemoryDescPtr> m_requested_descs = {};

std::shared_ptr<snippets::Schedule> m_schedule;
// Holds index of the output used as the execution domain;
// it should be compatible with a schedule's work size