Serialize bank name
smirnov-alexey committed Jan 9, 2025
1 parent 19bea23 · commit 9b1913f
Showing 3 changed files with 26 additions and 10 deletions.
5 changes: 4 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -564,6 +564,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
     const auto& p_bank = m_prefill_compiled->m_weights_bank;
     NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!");
     // FIXME: support weightless flow
+    write(stream, kv_bank->get_name());
     kv_bank->serialize(stream);
 
     LOG_INFO("Done.");
@@ -628,7 +629,9 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
     read(stream, compiled->m_prefill_compiled->m_cfg);
 
     // Deserialize weights bank (if required)
-    auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core());
+    std::string bank_name;
+    read(stream, bank_name);
+    auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name);
 
     // FIXME: support weightless option
     compiled->m_kvcache_compiled->m_weights_bank = bank;
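Note (illustration, not part of the commit): the export path now writes the weights-bank name into the stream immediately before the bank payload, so the import path has to read it back in exactly that order before handing it to Bank::deserialize(). The sketch below mimics that ordering with a hand-rolled length-prefixed string codec; the real write()/read() overloads from ov::npuw::s11n are only assumed to behave similarly, and the helper names and the "npuw_bank" value are made up for the example.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical stand-ins for the s11n string overloads:
// a 64-bit length prefix followed by the raw bytes.
static void write_str(std::ostream& stream, const std::string& s) {
    const uint64_t len = s.size();
    stream.write(reinterpret_cast<const char*>(&len), sizeof(len));
    stream.write(s.data(), static_cast<std::streamsize>(len));
}

static void read_str(std::istream& stream, std::string& s) {
    uint64_t len = 0;
    stream.read(reinterpret_cast<char*>(&len), sizeof(len));
    s.resize(len);
    stream.read(&s[0], static_cast<std::streamsize>(len));
}

int main() {
    std::stringstream blob(std::ios::in | std::ios::out | std::ios::binary);

    // Export side: bank name first, bank payload afterwards.
    write_str(blob, "npuw_bank");
    // ... kv_bank->serialize(blob) would follow here ...

    // Import side: read in the same order, then pass the name to Bank::deserialize().
    std::string bank_name;
    read_str(blob, bank_name);
    assert(bank_name == "npuw_bank");
    std::cout << "restored bank name: " << bank_name << "\n";
    return 0;
}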
17 changes: 11 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -183,7 +183,9 @@ void Bank::serialize(std::ostream& stream) const {
     LOG_INFO("DONE.");
 }
 
-std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_ptr<const ov::ICore>& core) {
+std::shared_ptr<Bank> Bank::deserialize(std::istream& stream,
+                                        const std::shared_ptr<const ov::ICore>& core,
+                                        const std::string& name) {
     using namespace ov::npuw::s11n;
 
     LOG_INFO("Deserializing weights bank...");
@@ -193,9 +195,8 @@ std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_
     std::string device;
     read(stream, device);
 
-    // Bank is assumed to be shared - thus no need for a unique name.
-    // FIXME: is that right? What about multi-model pipeline or several pipelines?
-    auto bank = ov::npuw::weights::bank("shared_serialized", core, device);
+    // Note: bank is assumed to be shared
+    auto bank = ov::npuw::weights::bank(name, core, device);
     std::size_t bank_size = 0;
     read(stream, bank_size);
 
@@ -254,14 +255,18 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str
     stream.read(reinterpret_cast<char*>(allocated_tensor.data()), byte_size);
 }
 
+std::string Bank::get_name() const {
+    return m_bank_name;
+}
+
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
                                            const std::string& alloc_device) {
     std::lock_guard<std::mutex> guard(m_mutex);
 
     auto iter = m_bank_map.find(bank_name);
     if (iter == m_bank_map.end() || iter->second.expired()) {
-        auto bank = std::make_shared<Bank>(core, alloc_device);
+        auto bank = std::make_shared<Bank>(core, alloc_device, bank_name);
         m_bank_map[bank_name] = bank;
         return bank;
     }
@@ -273,7 +278,7 @@ std::shared_ptr<Bank> ov::npuw::weights::bank(const std::string& bank_name,
                                               const std::string& alloc_device) {
     if (bank_name.empty()) {
         // Don't share this bank in manager
-        return std::make_shared<Bank>(core, alloc_device);
+        return std::make_shared<Bank>(core, alloc_device, bank_name);
     }
 
     auto& instance = BankManager::getInstance();
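Note (illustration, not part of the commit): passing the restored name to ov::npuw::weights::bank() matters because BankManager caches banks by name in a weak_ptr map, so a model deserialized with its original bank name re-attaches to the same shared bank instead of always landing in a fresh "shared_serialized" one. A simplified, self-contained model of that lookup; the Toy* types and the "npuw_bank" name are stand-ins, not the plugin's classes.

#include <cassert>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <utility>

// Toy stand-in for ov::npuw::weights::Bank.
struct ToyBank {
    explicit ToyBank(std::string name) : m_name(std::move(name)) {}
    std::string m_name;
};

// Mirrors the BankManager::getBank() pattern: a name-keyed weak_ptr cache.
class ToyBankManager {
public:
    std::shared_ptr<ToyBank> getBank(const std::string& bank_name) {
        std::lock_guard<std::mutex> guard(m_mutex);
        auto iter = m_bank_map.find(bank_name);
        if (iter == m_bank_map.end() || iter->second.expired()) {
            auto bank = std::make_shared<ToyBank>(bank_name);
            m_bank_map[bank_name] = bank;
            return bank;
        }
        return iter->second.lock();
    }

private:
    std::map<std::string, std::weak_ptr<ToyBank>> m_bank_map;
    std::mutex m_mutex;
};

int main() {
    ToyBankManager manager;
    auto original = manager.getBank("npuw_bank");  // e.g. created at compile time
    auto restored = manager.getBank("npuw_bank");  // e.g. requested again on deserialize
    assert(original == restored);                  // same shared bank, not a copy
    return 0;
}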
14 changes: 11 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -25,9 +25,12 @@ namespace weights {
 
 class Bank {
 public:
-    explicit Bank(const std::shared_ptr<const ov::ICore>& core, const std::string& alloc_device)
+    explicit Bank(const std::shared_ptr<const ov::ICore>& core,
+                  const std::string& alloc_device,
+                  const std::string& bank_name)
         : m_core(core),
-          m_alloc_device(alloc_device) {}
+          m_alloc_device(alloc_device),
+          m_bank_name(bank_name) {}
 
     // Register LazyTensor in a bank if it's not there. Returns LazyTensor's unique id
     int64_t registerLT(const LazyTensor& tensor, const std::string& device);
@@ -40,6 +43,8 @@ class Bank {
 
     bool is_remote(int64_t uid) const;
 
+    std::string get_name() const;
+
 private:
     friend class ov::npuw::LLMCompiledModel;
     friend class ov::npuw::CompiledModel;
@@ -59,14 +64,17 @@ class Bank {
     ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device);
 
     void serialize(std::ostream& stream) const;
-    static std::shared_ptr<Bank> deserialize(std::istream& stream, const std::shared_ptr<const ov::ICore>& core);
+    static std::shared_ptr<Bank> deserialize(std::istream& stream,
+                                             const std::shared_ptr<const ov::ICore>& core,
+                                             const std::string& name);
     // Used during deserialization
     void read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device);
 
     mutable std::mutex m_mutex;
     std::shared_ptr<const ov::ICore> m_core = nullptr;
     std::string m_alloc_device;
     int64_t uid_count = 0;
+    std::string m_bank_name;
 };
 
 std::shared_ptr<Bank> bank(const std::string& bank_name,
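Note (illustration, not part of the commit): the header change is just the plumbing for the above: the constructor now captures the name in m_bank_name and get_name() exposes it so export_model() can persist it. A stripped-down sketch of that shape; BankLike is a made-up name, and the real Bank also takes a std::shared_ptr<const ov::ICore> as its first argument.

#include <cassert>
#include <string>

// Made-up class mimicking the updated Bank constructor/accessor shape.
class BankLike {
public:
    BankLike(const std::string& alloc_device, const std::string& bank_name)
        : m_alloc_device(alloc_device),
          m_bank_name(bank_name) {}

    // What the export side uses to write the name into the blob.
    std::string get_name() const { return m_bank_name; }

private:
    std::string m_alloc_device;
    std::string m_bank_name;
};

int main() {
    BankLike bank("NPU", "npuw_bank");
    assert(bank.get_name() == "npuw_bank");
    return 0;
}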
