Skip to content

Commit

Permalink
[CPU] Throw when release_memory is called during inference (#27520)
Browse files Browse the repository at this point in the history
### Details:
This PR changes the behavior of the `CompiledModel::release_memory()`
implementation in the CPU plugin for the situation when the method is
being called concurrently with the other graph state modifying methods
(e.g. graph initialization, inference, properties request). This is
necessary to ensure thread safety and to provide clearly defined behavior
when the method is called concurrently.
The PR also refactors the InferRequest implementation, decoupling it from
the compiled model internals and providing a safer interface that ensures
thread-safe access to the CPU graph structures.
  • Loading branch information
maxnick authored Dec 16, 2024
1 parent 3af0f7c commit 05a6f4f
Show file tree
Hide file tree
Showing 12 changed files with 406 additions and 156 deletions.
9 changes: 6 additions & 3 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
}

std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_sync_infer_request() const {
    // NOTE: request counting is handled by CompiledModelHolder (incremented in its
    // ctor, decremented in its dtor), which the SyncInferRequest owns; incrementing
    // m_numRequests here as well would double-count live requests.
    return std::make_shared<SyncInferRequest>(std::static_pointer_cast<const CompiledModel>(shared_from_this()));
}

Expand Down Expand Up @@ -344,8 +343,12 @@ void CompiledModel::export_model(std::ostream& modelStream) const {

void CompiledModel::release_memory() {
    for (auto&& graph : m_graphs) {
        // try to lock mutex, since it may be already locked (e.g by an infer request);
        // throwing instead of blocking gives the caller clearly defined behavior for
        // the "graph is busy" case rather than silently waiting on inference.
        std::unique_lock<std::mutex> lock(graph._mutex, std::try_to_lock);
        OPENVINO_ASSERT(lock.owns_lock(),
                        "Attempt to call release_memory() on a compiled model in a busy state. Please ensure that all "
                        "infer requests are completed before releasing memory.");
        // Holding the lock guarantees no concurrent graph-state modification while
        // the network memory is released.
        auto ctx = graph.getGraphContext();
        ctx->getNetworkMemoryControl()->releaseMemory();
    }
}
Expand Down
76 changes: 68 additions & 8 deletions src/plugins/intel_cpu/src/compiled_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ namespace ov {
namespace intel_cpu {

class CompiledModel : public ov::ICompiledModel {
public:
// Couples a Graph with its own mutex so that graph-state-modifying operations
// (e.g. inference, release_memory) can be serialized per graph.
struct GraphGuard : public Graph {
std::mutex _mutex;
// RAII lock over the guard's mutex that also carries a reference to the guarded
// graph, so a lock holder can access exactly the graph it has locked.
struct Lock : public std::unique_lock<std::mutex> {
explicit Lock(GraphGuard& graph) : std::unique_lock<std::mutex>(graph._mutex), _graph(graph) {}
GraphGuard& _graph;
};
};

public:
typedef std::shared_ptr<CompiledModel> Ptr;

Expand Down Expand Up @@ -51,9 +60,13 @@ class CompiledModel : public ov::ICompiledModel {

void release_memory() override;

// Returns the compiled model's name (m_name, set at compilation time).
std::string name() const {
return m_name;
}

private:
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
friend class SyncInferRequest;
friend class CompiledModelHolder;

const std::shared_ptr<ov::Model> m_model;
const std::shared_ptr<const ov::IPlugin> m_plugin;
Expand All @@ -66,13 +79,6 @@ class CompiledModel : public ov::ICompiledModel {
Config m_cfg;
mutable std::atomic_int m_numRequests = {0};
std::string m_name;
struct GraphGuard : public Graph {
std::mutex _mutex;
struct Lock : public std::unique_lock<std::mutex> {
explicit Lock(GraphGuard& graph) : std::unique_lock<std::mutex>(graph._mutex), _graph(graph) {}
GraphGuard& _graph;
};
};

const bool m_loaded_from_cache;
// WARNING: Do not use m_graphs directly.
Expand All @@ -94,5 +100,59 @@ class CompiledModel : public ov::ICompiledModel {
bool m_has_sub_compiled_models = false;
};

// This class provides safe access to the internal CompiledModel structures and helps to decouple SyncInferRequest and
// the CompiledModel internal structures
class CompiledModelHolder {
public:
CompiledModelHolder(std::shared_ptr<const CompiledModel> compiled_model)
: m_compiled_model(std::move(compiled_model)) {
OPENVINO_ASSERT(!m_compiled_model->m_graphs.empty(),
"No graph was found in the compiled model: ",
m_compiled_model->name());
m_graph = &(m_compiled_model->get_graph()._graph);
m_id = (m_compiled_model->m_numRequests)++;
}

~CompiledModelHolder() {
if (m_compiled_model) {
--(m_compiled_model->m_numRequests);
}
}

CompiledModelHolder(const CompiledModelHolder&) = delete;
CompiledModelHolder& operator=(const CompiledModelHolder&) = delete;

CompiledModelHolder(CompiledModelHolder&&) = default;
CompiledModelHolder& operator=(CompiledModelHolder&&) = default;

const Graph& graph() const {
return *m_graph;
}

CompiledModel::GraphGuard::Lock lock() {
auto lock = m_compiled_model->get_graph();
m_graph = &(lock._graph);
OPENVINO_ASSERT(m_graph, "Graph ptr null check failed");
return lock;
}

std::string name() const {
return m_compiled_model->name();
}

std::shared_ptr<const ov::ICompiledModel> compiled_model() const {
return m_compiled_model;
}

int id() const {
return m_id;
}

private:
std::shared_ptr<const CompiledModel> m_compiled_model;
const Graph* m_graph;
int m_id;
};

} // namespace intel_cpu
} // namespace ov
19 changes: 17 additions & 2 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1940,8 +1940,23 @@ std::shared_ptr<ov::Model> Graph::dump() const {
return dump_graph_as_ie_ngraph_net(*this);
}

const std::unordered_map<std::string, node::MemoryStateNode*>& Graph::getInternalStateNodes() const {
return m_context->getMemoryStatesRegister()->getMemoryStates();
std::vector<MemStatePtr> Graph::memoryStates() const {
std::vector<MemStatePtr> resultVector;

for (auto&& item : m_context->getMemoryStatesRegister()->getMemoryStates()) {
resultVector.emplace_back(item.second->makeState());
}
return resultVector;
}

// Binds the given memory states to the graph's state nodes, matching by state name.
// States whose name has no corresponding node in this graph are skipped silently.
void Graph::assignStates(const std::vector<MemStatePtr>& states) {
    auto&& stateNodes = m_context->getMemoryStatesRegister()->getMemoryStates();
    for (auto&& newState : states) {
        auto nodeIt = stateNodes.find(newState->get_name());
        if (nodeIt == stateNodes.end()) {
            continue;  // no node with this name in the graph — nothing to assign
        }
        nodeIt->second->assignState(newState);
    }
}

} // namespace intel_cpu
Expand Down
38 changes: 27 additions & 11 deletions src/plugins/intel_cpu/src/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "edge.h"
#include "graph_context.h"
#include "memory_control.hpp"
#include "memory_state.h"
#include "node.h"
#include "nodes/input.h"
#include "openvino/core/node_vector.hpp"
Expand Down Expand Up @@ -87,28 +88,42 @@ class Graph {
return _name;
}

std::map<std::size_t, NodePtr>& GetInputNodesMap() {
return inputNodesMap;
NodePtr getInputNodeByIndex(std::size_t index) {
auto input = inputNodesMap.find(index);
if (input == inputNodesMap.end())
return nullptr;
return input->second;
}

std::map<std::size_t, NodePtr>& GetOutputNodesMap() {
return outputNodesMap;
NodePtr getOutputNodeByIndex(std::size_t index) {
auto output = outputNodesMap.find(index);
if (output == outputNodesMap.end())
return nullptr;
return output->second;
}

NodePtr getInputNodeByIndex(const std::size_t& index) {
NodeConstPtr getInputNodeByIndex(std::size_t index) const {
auto input = inputNodesMap.find(index);
if (input == inputNodesMap.end())
OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", index);
return nullptr;
return input->second;
}

NodePtr getOutputNodeByIndex(const std::size_t& index) {
NodeConstPtr getOutputNodeByIndex(std::size_t index) const {
auto output = outputNodesMap.find(index);
if (output == outputNodesMap.end())
OPENVINO_THROW("CPU execution graph doesn't contain output node with index: ", index);
return nullptr;
return output->second;
}

// Number of input nodes registered in the graph.
size_t inputsNumber() const {
return inputNodesMap.size();
}

// Number of output nodes registered in the graph.
size_t outputsNumber() const {
return outputNodesMap.size();
}

// Returns the oneDNN engine held by the graph's context (returned by value).
dnnl::engine getEngine() const {
return m_context->getEngine();
}
Expand All @@ -117,6 +132,9 @@ class Graph {
return m_context;
}

std::vector<MemStatePtr> memoryStates() const;
void assignStates(const std::vector<MemStatePtr>& state);

void GetPerfData(std::vector<ov::ProfilingInfo>& perfMap) const;

void CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort = 0, int childPort = 0);
Expand Down Expand Up @@ -202,8 +220,6 @@ class Graph {
return graphHasDynamicInput;
}

const std::unordered_map<std::string, node::MemoryStateNode*>& getInternalStateNodes() const;

/**
* Init graph using \p model, \p context, \p inputConfigs and \p outputConfigs
*/
Expand All @@ -218,7 +234,7 @@ class Graph {
void Activate(const std::vector<MemoryPtr>& externalInputMemory = {},
const std::vector<MemoryPtr>& externalOutputMemory = {});

const std::unordered_map<std::size_t, ProxyMemoryBlockPtr>& getOutputNodesMemBlocksMap() const {
const std::unordered_map<std::size_t, ProxyMemoryBlockPtr>& getOutputNodesMemBlocksMap() {
return outputNodesMemBlocksMap;
}

Expand Down
Loading

0 comments on commit 05a6f4f

Please sign in to comment.