[CPU] Create dnnl stream once per graph (openvinotoolkit#25407)
to reduce per-inference overhead.

Even though a dnnl stream is a very lightweight object, it is not purely
stack-allocated: its constructor performs a dynamic memory allocation.
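
For illustration, a minimal self-contained sketch of the pattern this patch applies, using oneDNN's public C++ API. `GraphLike`, `Infer`, and the surrounding scaffolding are hypothetical stand-ins for the plugin's `Graph`, not the actual OpenVINO code:

```cpp
#include <utility>

#include <oneapi/dnnl/dnnl.hpp>

// Hypothetical stand-in for the plugin's Graph class: the dnnl::stream is
// constructed once, alongside the graph, instead of on every inference call.
class GraphLike {
public:
    explicit GraphLike(dnnl::engine engine)
        : m_engine(std::move(engine)),
          m_stream(m_engine) {}  // the constructor's heap allocation happens once

    void Infer() {
        // Before the patch, the equivalent of
        //     dnnl::stream stream(m_engine);
        // ran here on every call, on the hot path. Now the cached member is
        // reused, e.g. when executing a primitive:
        //     primitive.execute(m_stream, args);
    }

private:
    dnnl::engine m_engine;  // declared before m_stream so it is built first
    dnnl::stream m_stream;
};

int main() {
    GraphLike graph(dnnl::engine(dnnl::engine::kind::cpu, 0));
    graph.Infer();
    return 0;
}
```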

### Tickets:
 - *ticket-id*
EgorDuplensky authored Jul 15, 2024
1 parent 5136304 commit c27af27
Showing 2 changed files with 7 additions and 10 deletions.
src/plugins/intel_cpu/src/graph.cpp — 16 changes: 6 additions & 10 deletions
@@ -71,6 +71,7 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) {
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     Replicate(net);

@@ -87,6 +88,7 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     this->_name = std::move(name);
     this->reuse_io_tensors = false;
@@ -440,8 +442,6 @@ void Graph::InitOptimalPrimitiveDescriptors() {
 
 void Graph::CreatePrimitivesAndExecConstants() const {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants");
-    dnnl::stream stream(getEngine());
-
     using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr;
 
     auto acquireSharedOutputs = [this](const NodePtr & node) {
@@ -481,13 +481,13 @@ void Graph::CreatePrimitivesAndExecConstants() const {
             auto sharedOutputs = acquireSharedOutputs(node);
 
             if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
 
                 for (auto & output : std::get<2>(sharedOutputs))
                     output->valid(true);
             }
         } else {
-            ExecuteNode(node, stream);
+            ExecuteNode(node, m_stream);
         }
     }
 }
@@ -1124,15 +1124,13 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>&
 }
 
 void Graph::InferStatic(SyncInferRequest* request) {
-    dnnl::stream stream(getEngine());
-
     for (const auto& node : m_executableGraphNodes) {
         VERBOSE(node, getConfig().debugCaps.verbose);
         PERF(node, getConfig().collectPerfCounters);
 
         if (request)
             request->throw_if_canceled();
-        ExecuteNode(node, stream);
+        ExecuteNode(node, m_stream);
     }
 }

@@ -1339,8 +1337,6 @@ class UpdateNodes : public UpdateNodesBase {
 
 template<typename UpdateStrategy>
 void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
-    dnnl::stream stream(getEngine());
-
     size_t inferCounter = 0;
     for (auto stopIndx : m_executableSyncNodesInds) {
         update(stopIndx);
@@ -1353,7 +1349,7 @@ void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) {
             if (request)
                 request->throw_if_canceled();
             try {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
             } catch (const std::exception& exp) {
                 OPENVINO_THROW(node, exp.what());
             }
src/plugins/intel_cpu/src/graph.h — 1 change: 1 addition & 0 deletions
@@ -253,6 +253,7 @@ class Graph {
     std::vector<size_t> m_executableSyncNodesInds;
 
     GraphContext::CPtr context;
+    dnnl::stream m_stream;
 
     void EnforceInferencePrecision();
     void EnforceBF16();
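
One detail worth noting, based on my reading of the oneDNN C++ API rather than anything stated in the patch: dnnl::stream is default-constructible as an empty handle, which is what lets graph.h declare the member before any engine exists, with CreateGraph() assigning the real stream later:

```cpp
#include <oneapi/dnnl/dnnl.hpp>

int main() {
    // Default construction yields an empty (null) stream handle; it cannot
    // execute anything yet, but it can live as a class member.
    dnnl::stream stream;

    // Later, once an engine is available (CreateGraph() in this patch),
    // the member is re-seated with a real stream tied to that engine.
    dnnl::engine engine(dnnl::engine::kind::cpu, 0);
    stream = dnnl::stream(engine);
    return 0;
}
```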
