[CPU] Create dnnl stream once per graph
to reduce inference overhead.

Even though a dnnl stream is a very lightweight object, it is not actually
allocated on the stack: its constructor performs a dynamic memory allocation.
EgorDuplensky committed Jul 5, 2024
1 parent dce3308 commit c5db24a
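
For illustration, here is a minimal standalone sketch of the before/after pattern this commit applies. The names infer_per_call and GraphSketch are hypothetical; dnnl::stream, dnnl::engine, and the m_stream member mirror the real changes in the diff below:

    #include <dnnl.hpp>  // oneDNN C++ API

    // Before: a fresh stream was constructed on every inference call.
    // dnnl::stream's constructor creates the underlying dnnl_stream_t
    // dynamically, so that cost was paid on each call.
    void infer_per_call(const dnnl::engine& engine) {
        dnnl::stream stream(engine);  // dynamic allocation on every call
        // ... execute each node on `stream` ...
    }

    // After: the stream is created once when the graph is created and
    // cached as a member, so every inference reuses the same object.
    struct GraphSketch {
        dnnl::stream m_stream;  // empty handle until assigned at create() time

        void create(const dnnl::engine& engine) {
            m_stream = dnnl::stream(engine);  // one-time allocation
        }
        void infer() {
            // ... execute each node on `m_stream` ...
        }
    };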
Showing 2 changed files with 7 additions and 10 deletions.
16 changes: 6 additions & 10 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -71,6 +71,7 @@ void Graph::CreateGraph(NET &net, const GraphContext::CPtr ctx) {
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     Replicate(net);
 
@@ -87,6 +88,7 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,
     ForgetGraphData();
 
     context = ctx;
+    m_stream = dnnl::stream(getEngine());
 
     this->_name = std::move(name);
     this->reuse_io_tensors = false;
@@ -439,8 +441,6 @@ void Graph::InitOptimalPrimitiveDescriptors() {
 
 void Graph::CreatePrimitivesAndExecConstants() const {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants");
-    dnnl::stream stream(getEngine());
-
     using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr;
 
     auto acquireSharedOutputs = [this](const NodePtr & node) {
@@ -480,13 +480,13 @@ void Graph::CreatePrimitivesAndExecConstants() const {
             auto sharedOutputs = acquireSharedOutputs(node);
 
             if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
 
                 for (auto & output : std::get<2>(sharedOutputs))
                     output->valid(true);
             }
         } else {
-            ExecuteNode(node, stream);
+            ExecuteNode(node, m_stream);
         }
     }
 }
@@ -1123,15 +1123,13 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>&
 }
 
 void Graph::InferStatic(SyncInferRequest* request) {
-    dnnl::stream stream(getEngine());
-
     for (const auto& node : m_executableGraphNodes) {
         VERBOSE(node, getConfig().debugCaps.verbose);
         PERF(node, getConfig().collectPerfCounters);
 
         if (request)
             request->throw_if_canceled();
-        ExecuteNode(node, stream);
+        ExecuteNode(node, m_stream);
     }
 }
 
@@ -1342,8 +1340,6 @@ class UpdateNodes : public UpdateNodesBase {
 
 
 void Graph::InferDynamic(SyncInferRequest* request) {
-    dnnl::stream stream(getEngine());
-
     std::unique_ptr<IUpdateNodes> updateNodes{};
     if (parallel_get_max_threads() > 1) {
         updateNodes.reset(new UpdateNodes(m_executableGraphNodes));
@@ -1362,7 +1358,7 @@ void Graph::InferDynamic(SyncInferRequest* request) {
             if (request)
                 request->throw_if_canceled();
             try {
-                ExecuteNode(node, stream);
+                ExecuteNode(node, m_stream);
             } catch (const std::exception& exp) {
                 OPENVINO_THROW(node, exp.what());
             }
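
For context on why every Infer path needs a stream at all: ExecuteNode ultimately submits oneDNN primitives, and a oneDNN primitive executes on a dnnl::stream. The following is a self-contained sketch against the public oneDNN 3.x API (not plugin code; the ReLU primitive is just a stand-in workload) showing one stream reused across many executions:

    #include <unordered_map>
    #include <dnnl.hpp>

    int main() {
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        dnnl::stream strm(eng);  // created once, reused for every execute() below

        // A trivial ReLU over a 1x8 f32 tensor, standing in for a graph node.
        dnnl::memory::desc md({1, 8}, dnnl::memory::data_type::f32,
                              dnnl::memory::format_tag::nc);
        dnnl::memory src(md, eng), dst(md, eng);
        float* p = static_cast<float*>(src.get_data_handle());
        for (int i = 0; i < 8; ++i)
            p[i] = static_cast<float>(i) - 4.0f;  // mix of negatives and positives

        dnnl::eltwise_forward::primitive_desc pd(
            eng, dnnl::prop_kind::forward_inference,
            dnnl::algorithm::eltwise_relu, md, md, 0.0f);
        dnnl::eltwise_forward relu(pd);

        for (int iter = 0; iter < 100; ++iter)  // many "inferences", one stream
            relu.execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
        strm.wait();  // block until all submitted work completes
        return 0;
    }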
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/graph.h
@@ -250,6 +250,7 @@ class Graph {
     std::vector<size_t> m_executableSyncNodesInds;
 
     GraphContext::CPtr context;
+    dnnl::stream m_stream;
 
     void EnforceInferencePrecision();
     void EnforceBF16();
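
One detail that makes the graph.h change work (an observation about the oneDNN C++ API, not something stated in the commit message): dnnl::stream is a reference-counted handle whose default constructor produces an empty handle, so the bare m_stream member declaration costs nothing; the underlying stream object only comes into existence when CreateGraph assigns one. A minimal sketch:

    #include <dnnl.hpp>

    int main() {
        dnnl::stream s;  // default-constructed: an empty handle, nothing allocated yet
        dnnl::engine eng(dnnl::engine::kind::cpu, 0);
        s = dnnl::stream(eng);  // the underlying dnnl_stream_t is created here, once
        return 0;
    }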
