From 3e05b00ea479b10044ba64b6356faa5ea5a68538 Mon Sep 17 00:00:00 2001
From: Egor Duplensky
Date: Fri, 5 Jul 2024 15:39:09 +0200
Subject: [PATCH] [CPU] Create dnnl stream once per graph to reduce infer overhead

Even though a dnnl stream is a very lightweight object, it is not
actually allocated on the stack: its constructor performs a dynamic
memory allocation. Creating the stream once per graph avoids paying
that cost on every inference.
---
 src/plugins/intel_cpu/src/graph.cpp | 16 ++++++----------
 src/plugins/intel_cpu/src/graph.h   |  1 +
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index c85af994d0c393..490c15fceb2ec4 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -71,6 +71,7 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) {
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     Replicate(net);
 
@@ -87,6 +88,7 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     this->_name = std::move(name);
     this->reuse_io_tensors = false;
@@ -440,8 +442,6 @@ void Graph::InitOptimalPrimitiveDescriptors() {
 void Graph::CreatePrimitivesAndExecConstants() const {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants");
 
-    dnnl::stream stream(getEngine());
-
     using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr;
 
     auto acquireSharedOutputs = [this](const NodePtr & node) {
@@ -481,13 +481,13 @@ void Graph::CreatePrimitivesAndExecConstants() const {
             auto sharedOutputs = acquireSharedOutputs(node);
 
             if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
 
                 for (auto & output : std::get<2>(sharedOutputs))
                     output->valid(true);
             }
         } else {
-            ExecuteNode(node, stream);
+            ExecuteNode(node, m_stream);
         }
     }
 }
@@ -1124,15 +1124,13 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ov::ITensor>>&
 }
 
 void Graph::InferStatic(SyncInferRequest* request) {
-    dnnl::stream stream(getEngine());
-
     for (const auto& node : m_executableGraphNodes) {
         VERBOSE(node, getConfig().debugCaps.verbose);
         PERF(node, getConfig().collectPerfCounters);
 
         if (request)
             request->throw_if_canceled();
-        ExecuteNode(node, stream);
+        ExecuteNode(node, m_stream);
     }
 }
 
@@ -1339,8 +1337,6 @@ class UpdateNodes : public UpdateNodesBase {
 
 template<typename UpdateStrategy>
 void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
-    dnnl::stream stream(getEngine());
-
     size_t inferCounter = 0;
     for (auto stopIndx : m_executableSyncNodesInds) {
         update(stopIndx);
@@ -1353,7 +1349,7 @@ void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
             if (request)
                 request->throw_if_canceled();
             try {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
             } catch (const std::exception& exp) {
                 OPENVINO_THROW(node, exp.what());
             }
diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h
index 9475b2ef98b66b..4e6d6e6f3beca6 100644
--- a/src/plugins/intel_cpu/src/graph.h
+++ b/src/plugins/intel_cpu/src/graph.h
@@ -253,6 +253,7 @@ class Graph {
     std::vector<size_t> m_executableSyncNodesInds;
 
     GraphContext::CPtr context;
+    dnnl::stream m_stream;
 
     void EnforceInferencePrecision();
     void EnforceBF16();
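
Note (illustration only, not part of the patch): a minimal, hypothetical sketch of the pattern this change applies — build the dnnl::stream once, next to the engine, and reuse it for every execution instead of constructing it on each inference call. Only dnnl::engine and dnnl::stream are real oneDNN types; MiniGraph and Infer() are made up for the example.

#include <dnnl.hpp>

// Hypothetical stand-in for Graph: the stream is created once, at
// "graph creation" time, instead of on every Infer() call (each
// dnnl::stream construction performs a heap allocation).
class MiniGraph {
public:
    explicit MiniGraph(const dnnl::engine& engine)
        : m_engine(engine),
          m_stream(engine) {}  // stream created once, reused afterwards

    void Infer() {
        // Before the patch, the equivalent of
        //     dnnl::stream stream(m_engine);
        // lived here and was paid on every inference. Nodes now execute
        // against the cached stream, e.g.
        //     primitive.execute(m_stream, args);
        m_stream.wait();  // wait for any work submitted to the stream
    }

private:
    dnnl::engine m_engine;
    dnnl::stream m_stream;
};

int main() {
    dnnl::engine engine(dnnl::engine::kind::cpu, 0);
    MiniGraph graph(engine);
    graph.Infer();
    return 0;
}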