Skip to content

Commit

Permalink
[Snippets][CPU] Fixed allocator to Subgraph node
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Aug 21, 2024
1 parent 3b746e8 commit 4e82725
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 14 deletions.
3 changes: 2 additions & 1 deletion src/common/snippets/src/pass/mha_tokenization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,8 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken
const auto data_count = io_count + uniqie_buffer_reg_group_count;
auto available_regs = config.get_data_ptr_gpr_count();
// [150148, 150149] Currently Snippets don't have a mechanism for spilling registers to the stack.
// Due to this limitation we have to not tokenize some subgraphs if we need more registers than we have on target machine.
// Due to this limitation we have to skip tokenization of some subgraphs
// if we need more registers than we have on the target machine.
// `config.get_data_ptr_gpr_count()` provides available data registers count (including parameters, results and buffers)
// after excluding 2 registers for work amounts.
// However, MHA Subgraph has `SplitLoops` optimization which adds outermost blocked Loop by M. This Loop requires
Expand Down
7 changes: 7 additions & 0 deletions src/plugins/intel_cpu/src/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,13 @@ class Node {
return scratchpadMem;
}

// Returns the scratchpad memory held in `scratchpadMem`. A new memory object is
// requested from the scratchpad (and stored in `scratchpadMem`) when nothing is
// stored yet or when the stored memory's descriptor is not compatible with `desc`.
MemoryPtr getScratchPadMem(const CpuBlockedMemoryDescPtr& desc) {
    const bool canReuse = scratchpadMem && scratchpadMem->getDesc().isCompatible(*desc);
    if (canReuse) {
        return scratchpadMem;
    }
    scratchpadMem = getScratchPad()->createScratchPadMem(desc);
    return scratchpadMem;
}

std::vector<VectorDims> lastInputDims = {};

std::shared_ptr<IShapeInfer> shapeInference;
Expand Down
25 changes: 13 additions & 12 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor {
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const DnnlScratchPadPtr& scratchpad)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, scratchpad) {}
const BufferScratchpadAllocator& allocator)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {}

void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
const auto& callable = m_schedule->get_callable<kernel>();
Expand Down Expand Up @@ -116,8 +116,8 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor {
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const DnnlScratchPadPtr& scratchpad)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, scratchpad) {
const BufferScratchpadAllocator& allocator)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {
buffer_offsets = snippet_config->buffer_cluster_offsets;
data_offsets = snippet_config->io_data_offsets;
loop_args = snippet_config->loop_args;
Expand Down Expand Up @@ -752,7 +752,11 @@ void Subgraph::prepareParams() {

auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr<SubgraphExecutor> {
const auto& snippet = subgraph_attrs->snippet;
const auto& scratchpad = getScratchPad();

// Allocator passed to the executors: for a requested size it builds a 1D `u8`
// blocked memory descriptor of that many elements and obtains the memory through
// this node's getScratchPadMem().
SubgraphExecutor::BufferScratchpadAllocator allocator = [this](size_t size) {
    const auto byte_desc = std::make_shared<CpuBlockedMemoryDesc>(ov::element::u8, intel_cpu::Shape{size});
    return getScratchPadMem(byte_desc);
};

if (is_dynamic) {
// Dynamic case:
// 1. Generate JIT code if needed
Expand All @@ -769,7 +773,7 @@ void Subgraph::prepareParams() {
snippet->get_runtime_configurator()->set_kernel_executor_table(code_gen->get()->lowering_result.kernel_executor_table);
}
const auto& snippet_config = ov::as_type_ptr<CPURuntimeConfig>(snippet->update_runtime_config());
return std::make_shared<SubgraphDynamicSpecializedExecutor>(key.attrs, code_gen, start_offset_in, start_offset_out, snippet_config, scratchpad);
return std::make_shared<SubgraphDynamicSpecializedExecutor>(key.attrs, code_gen, start_offset_in, start_offset_out, snippet_config, allocator);
} else {
// Static case:
// 1. Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be compiled in JIT code
Expand All @@ -780,7 +784,7 @@ void Subgraph::prepareParams() {
[&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr<SubgraphCodeGenerator> {
return std::make_shared<SubgraphCodeGenerator>(key.attrs, snippet_config);
});
return std::make_shared<SubgraphStaticExecutor>(key.attrs, code_gen_result.first, start_offset_in, start_offset_out, snippet_config, scratchpad);
return std::make_shared<SubgraphStaticExecutor>(key.attrs, code_gen_result.first, start_offset_in, start_offset_out, snippet_config, allocator);
}
};

Expand Down Expand Up @@ -869,7 +873,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::Sub
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const DnnlScratchPadPtr& scratchpad)
const BufferScratchpadAllocator& allocator)
: m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) {
OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
Expand All @@ -880,10 +884,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::Sub

m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!");

const auto buffer_shape = intel_cpu::Shape{static_cast<size_t>(m_nthreads), m_buffer_scratchpad_size};
const auto memory_desc = std::make_shared<CpuBlockedMemoryDesc>(ov::element::u8, buffer_shape);
m_buffer_scratchpad = scratchpad->createScratchPadMem(memory_desc);
m_buffer_scratchpad = allocator(static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size);

#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
const auto target = std::dynamic_pointer_cast<const CPUTargetMachine>(snippet_attrs->snippet->get_generator()->get_target_machine());
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/nodes/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,14 @@ class Subgraph::SubgraphCodeGenerator {

class Subgraph::SubgraphExecutor {
public:
using BufferScratchpadAllocator = std::function<MemoryPtr(size_t)>;

SubgraphExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
const std::shared_ptr<Subgraph::SubgraphCodeGenerator>& snippet,
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const DnnlScratchPadPtr& scratchpad);
const BufferScratchpadAllocator& allocator);
virtual ~SubgraphExecutor() = default;

virtual void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;
Expand Down

0 comments on commit 4e82725

Please sign in to comment.