From c27af2711af2cb74d59d2599c6f35e9c837bbfa5 Mon Sep 17 00:00:00 2001
From: Egor Duplenskii
Date: Mon, 15 Jul 2024 07:26:15 +0200
Subject: [PATCH] [CPU] Create dnnl stream once per graph (#25407)

Create the dnnl stream once per graph to reduce inference overhead. Although
a dnnl stream is a very lightweight object, it is not simply placed on the
stack: its constructor performs a dynamic memory allocation.

### Tickets:
 - *ticket-id*
---
 src/plugins/intel_cpu/src/graph.cpp | 16 ++++++----------
 src/plugins/intel_cpu/src/graph.h   |  1 +
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index c85af994d0c393..490c15fceb2ec4 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -71,6 +71,7 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) {
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     Replicate(net);
 
@@ -87,6 +88,7 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     this->_name = std::move(name);
     this->reuse_io_tensors = false;
@@ -440,8 +442,6 @@ void Graph::InitOptimalPrimitiveDescriptors() {
 
 void Graph::CreatePrimitivesAndExecConstants() const {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants");
-    dnnl::stream stream(getEngine());
-
     using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr;
 
     auto acquireSharedOutputs = [this](const NodePtr & node) {
@@ -481,13 +481,13 @@ void Graph::CreatePrimitivesAndExecConstants() const {
             auto sharedOutputs = acquireSharedOutputs(node);
 
             if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
 
                 for (auto & output : std::get<2>(sharedOutputs))
                     output->valid(true);
             }
         } else {
-            ExecuteNode(node, stream);
+            ExecuteNode(node, m_stream);
         }
     }
 }
@@ -1124,15 +1124,13 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ov::ITensor>>& output)
 }
 
 void Graph::InferStatic(SyncInferRequest* request) {
-    dnnl::stream stream(getEngine());
-
     for (const auto& node : m_executableGraphNodes) {
         VERBOSE(node, getConfig().debugCaps.verbose);
         PERF(node, getConfig().collectPerfCounters);
 
         if (request)
             request->throw_if_canceled();
 
-        ExecuteNode(node, stream);
+        ExecuteNode(node, m_stream);
     }
 }
@@ -1339,8 +1337,6 @@ class UpdateNodes : public UpdateNodesBase {
 
 template <typename UpdateStrategy>
 void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
-    dnnl::stream stream(getEngine());
-
     size_t inferCounter = 0;
     for (auto stopIndx : m_executableSyncNodesInds) {
         update(stopIndx);
@@ -1353,7 +1349,7 @@ void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
             if (request)
                 request->throw_if_canceled();
             try {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
             } catch (const std::exception& exp) {
                 OPENVINO_THROW(node, exp.what());
             }
diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h
index 9475b2ef98b66b..4e6d6e6f3beca6 100644
--- a/src/plugins/intel_cpu/src/graph.h
+++ b/src/plugins/intel_cpu/src/graph.h
@@ -253,6 +253,7 @@ class Graph {
     std::vector<size_t> m_executableSyncNodesInds;
 
     GraphContext::CPtr context;
+    dnnl::stream m_stream;
 
     void EnforceInferencePrecision();
     void EnforceBF16();
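
For illustration only (not part of the patch above): a minimal standalone C++ sketch of the pattern the patch applies, constructing the `dnnl::stream` once and reusing it for every node execution instead of building a new stream on each inference call. `GraphLike`, `Infer`, and `m_engine` are hypothetical names introduced here; only `dnnl::engine` and `dnnl::stream` are real oneDNN types.

```cpp
// Sketch: cache a dnnl::stream as a member created once per graph,
// rather than constructing it on every inference (its constructor
// performs a dynamic allocation despite the object being lightweight).
#include <oneapi/dnnl/dnnl.hpp>

class GraphLike {  // hypothetical stand-in for the plugin's Graph class
public:
    explicit GraphLike(const dnnl::engine& eng)
        : m_engine(eng),
          m_stream(eng) {}  // stream created once, alongside the graph

    void Infer() {
        // Reuse the cached m_stream for every primitive execution, e.g.:
        // primitive.execute(m_stream, args);
        // No per-call dnnl::stream construction happens here.
    }

private:
    dnnl::engine m_engine;
    dnnl::stream m_stream;
};

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    GraphLike graph(eng);
    graph.Infer();  // repeated calls pay no stream-construction cost
    return 0;
}
```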