diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 88b99a1aaa1f70..9f0709a22c843b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -564,6 +564,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { const auto& p_bank = m_prefill_compiled->m_weights_bank; NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!"); // FIXME: support weightless flow + write(stream, kv_bank->get_name()); kv_bank->serialize(stream); LOG_INFO("Done."); @@ -628,7 +629,9 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial read(stream, compiled->m_prefill_compiled->m_cfg); // Deserialize weights bank (if required) - auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core()); + std::string bank_name; + read(stream, bank_name); + auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name); // FIXME: support weightless option compiled->m_kvcache_compiled->m_weights_bank = bank; diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 28ce2cd270f576..8ed87c4c0b3af7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -183,7 +183,9 @@ void Bank::serialize(std::ostream& stream) const { LOG_INFO("DONE."); } -std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_ptr<ov::ICore>& core) { +std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, + const std::shared_ptr<ov::ICore>& core, + const std::string& name) { using namespace ov::npuw::s11n; LOG_INFO("Deserializing weights bank..."); @@ -193,9 +195,8 @@ std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_ std::string device; 
read(stream, device); - // Bank is assumed to be shared - thus no need for a unique name. - // FIXME: is that right? What about multi-model pipeline or several pipelines? - auto bank = ov::npuw::weights::bank("shared_serialized", core, device); + // Note: bank is assumed to be shared + auto bank = ov::npuw::weights::bank(name, core, device); std::size_t bank_size = 0; read(stream, bank_size); @@ -254,6 +255,10 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str stream.read(reinterpret_cast<char*>(allocated_tensor.data()), byte_size); } +std::string Bank::get_name() const { + return m_bank_name; +} + std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name, const std::shared_ptr<ov::ICore>& core, const std::string& alloc_device) { @@ -261,7 +266,7 @@ std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name, auto iter = m_bank_map.find(bank_name); if (iter == m_bank_map.end() || iter->second.expired()) { - auto bank = std::make_shared<Bank>(core, alloc_device); + auto bank = std::make_shared<Bank>(core, alloc_device, bank_name); m_bank_map[bank_name] = bank; return bank; } @@ -273,7 +278,7 @@ std::shared_ptr<Bank> ov::npuw::weights::bank(const std::string& bank_name, const std::string& alloc_device) { if (bank_name.empty()) { // Don't share this bank in manager - return std::make_shared<Bank>(core, alloc_device); + return std::make_shared<Bank>(core, alloc_device, bank_name); } auto& instance = BankManager::getInstance(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp index 6d8262e2684bf3..fd9f0e39841b7a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp @@ -25,9 +25,12 @@ namespace weights { class Bank { public: - explicit Bank(const std::shared_ptr<ov::ICore>& core, const std::string& alloc_device) + explicit Bank(const std::shared_ptr<ov::ICore>& core, + const std::string& alloc_device, + const std::string& bank_name) : 
m_core(core), - m_alloc_device(alloc_device) {} + m_alloc_device(alloc_device), + m_bank_name(bank_name) {} // Register LazyTensor in a bank if it's not there. Returns LazyTensor's unique id int64_t registerLT(const LazyTensor& tensor, const std::string& device); @@ -40,6 +43,8 @@ class Bank { bool is_remote(int64_t uid) const; + std::string get_name() const; + private: friend class ov::npuw::LLMCompiledModel; friend class ov::npuw::CompiledModel; @@ -59,7 +64,9 @@ class Bank { ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device); void serialize(std::ostream& stream) const; - static std::shared_ptr<Bank> deserialize(std::istream& stream, const std::shared_ptr<ov::ICore>& core); + static std::shared_ptr<Bank> deserialize(std::istream& stream, + const std::shared_ptr<ov::ICore>& core, + const std::string& name); // Used during deserialization void read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device); @@ -67,6 +74,7 @@ class Bank { std::shared_ptr<ov::ICore> m_core = nullptr; std::string m_alloc_device; int64_t uid_count = 0; + std::string m_bank_name; }; std::shared_ptr<Bank> bank(const std::string& bank_name,