Serialize bank name
smirnov-alexey committed Jan 9, 2025
1 parent 19bea23 · commit 9b1913f
Showing 3 changed files with 26 additions and 10 deletions.
5 changes: 4 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -564,6 +564,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
     const auto& p_bank = m_prefill_compiled->m_weights_bank;
     NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!");
     // FIXME: support weightless flow
+    write(stream, kv_bank->get_name());
     kv_bank->serialize(stream);
 
     LOG_INFO("Done.");
@@ -628,7 +629,9 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
     read(stream, compiled->m_prefill_compiled->m_cfg);
 
     // Deserialize weights bank (if required)
-    auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core());
+    std::string bank_name;
+    read(stream, bank_name);
+    auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name);
 
     // FIXME: support weightless option
     compiled->m_kvcache_compiled->m_weights_bank = bank;
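Note (illustration, not part of the commit): the export path now writes the weights-bank name into the stream immediately before the bank payload, so the import path has to read it back in exactly that order before handing it to Bank::deserialize(). The sketch below mimics that ordering with a hand-rolled length-prefixed string codec; the real write()/read() overloads from ov::npuw::s11n are only assumed to behave similarly, and the helper names and the "npuw_bank" value are made up for the example.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical stand-ins for the s11n string overloads:
// a 64-bit length prefix followed by the raw bytes.
static void write_str(std::ostream& stream, const std::string& s) {
    const uint64_t len = s.size();
    stream.write(reinterpret_cast<const char*>(&len), sizeof(len));
    stream.write(s.data(), static_cast<std::streamsize>(len));
}

static void read_str(std::istream& stream, std::string& s) {
    uint64_t len = 0;
    stream.read(reinterpret_cast<char*>(&len), sizeof(len));
    s.resize(len);
    stream.read(&s[0], static_cast<std::streamsize>(len));
}

int main() {
    std::stringstream blob(std::ios::in | std::ios::out | std::ios::binary);

    // Export side: bank name first, bank payload afterwards.
    write_str(blob, "npuw_bank");
    // ... kv_bank->serialize(blob) would follow here ...

    // Import side: read in the same order, then pass the name to Bank::deserialize().
    std::string bank_name;
    read_str(blob, bank_name);
    assert(bank_name == "npuw_bank");
    std::cout << "restored bank name: " << bank_name << "\n";
    return 0;
}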
17 changes: 11 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -183,7 +183,9 @@ void Bank::serialize(std::ostream& stream) const {
     LOG_INFO("DONE.");
 }
 
-std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_ptr<const ov::ICore>& core) {
+std::shared_ptr<Bank> Bank::deserialize(std::istream& stream,
+                                        const std::shared_ptr<const ov::ICore>& core,
+                                        const std::string& name) {
     using namespace ov::npuw::s11n;
 
     LOG_INFO("Deserializing weights bank...");
@@ -193,9 +195,8 @@ std::shared_ptr<Bank> Bank::deserialize(std::istream& stream, const std::shared_
     std::string device;
     read(stream, device);
 
-    // Bank is assumed to be shared - thus no need for a unique name.
-    // FIXME: is that right? What about multi-model pipeline or several pipelines?
-    auto bank = ov::npuw::weights::bank("shared_serialized", core, device);
+    // Note: bank is assumed to be shared
+    auto bank = ov::npuw::weights::bank(name, core, device);
     std::size_t bank_size = 0;
     read(stream, bank_size);
 
@@ -254,14 +255,18 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str
     stream.read(reinterpret_cast<char*>(allocated_tensor.data()), byte_size);
 }
 
+std::string Bank::get_name() const {
+    return m_bank_name;
+}
+
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
                                            const std::string& alloc_device) {
     std::lock_guard<std::mutex> guard(m_mutex);
 
     auto iter = m_bank_map.find(bank_name);
     if (iter == m_bank_map.end() || iter->second.expired()) {
-        auto bank = std::make_shared<Bank>(core, alloc_device);
+        auto bank = std::make_shared<Bank>(core, alloc_device, bank_name);
         m_bank_map[bank_name] = bank;
         return bank;
     }
@@ -273,7 +278,7 @@ std::shared_ptr<Bank> ov::npuw::weights::bank(const std::string& bank_name,
                                               const std::string& alloc_device) {
     if (bank_name.empty()) {
         // Don't share this bank in manager
-        return std::make_shared<Bank>(core, alloc_device);
+        return std::make_shared<Bank>(core, alloc_device, bank_name);
     }
 
     auto& instance = BankManager::getInstance();
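Note (illustration, not part of the commit): passing the restored name to ov::npuw::weights::bank() matters because BankManager caches banks by name in a weak_ptr map, so a model deserialized with its original bank name re-attaches to the same shared bank instead of always landing in a fresh "shared_serialized" one. A simplified, self-contained model of that lookup; the Toy* types and the "npuw_bank" name are stand-ins, not the plugin's classes.

#include <cassert>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <utility>

// Toy stand-in for ov::npuw::weights::Bank.
struct ToyBank {
    explicit ToyBank(std::string name) : m_name(std::move(name)) {}
    std::string m_name;
};

// Mirrors the BankManager::getBank() pattern: a name-keyed weak_ptr cache.
class ToyBankManager {
public:
    std::shared_ptr<ToyBank> getBank(const std::string& bank_name) {
        std::lock_guard<std::mutex> guard(m_mutex);
        auto iter = m_bank_map.find(bank_name);
        if (iter == m_bank_map.end() || iter->second.expired()) {
            auto bank = std::make_shared<ToyBank>(bank_name);
            m_bank_map[bank_name] = bank;
            return bank;
        }
        return iter->second.lock();
    }

private:
    std::map<std::string, std::weak_ptr<ToyBank>> m_bank_map;
    std::mutex m_mutex;
};

int main() {
    ToyBankManager manager;
    auto original = manager.getBank("npuw_bank");  // e.g. created at compile time
    auto restored = manager.getBank("npuw_bank");  // e.g. requested again on deserialize
    assert(original == restored);                  // same shared bank, not a copy
    return 0;
}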
14 changes: 11 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -25,9 +25,12 @@ namespace weights {
 
 class Bank {
 public:
-    explicit Bank(const std::shared_ptr<const ov::ICore>& core, const std::string& alloc_device)
+    explicit Bank(const std::shared_ptr<const ov::ICore>& core,
+                  const std::string& alloc_device,
+                  const std::string& bank_name)
         : m_core(core),
-          m_alloc_device(alloc_device) {}
+          m_alloc_device(alloc_device),
+          m_bank_name(bank_name) {}
 
     // Register LazyTensor in a bank if it's not there. Returns LazyTensor's unique id
     int64_t registerLT(const LazyTensor& tensor, const std::string& device);
@@ -40,6 +43,8 @@ class Bank {
 
     bool is_remote(int64_t uid) const;
 
+    std::string get_name() const;
+
 private:
     friend class ov::npuw::LLMCompiledModel;
     friend class ov::npuw::CompiledModel;
@@ -59,14 +64,17 @@ class Bank {
     ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device);
 
     void serialize(std::ostream& stream) const;
-    static std::shared_ptr<Bank> deserialize(std::istream& stream, const std::shared_ptr<const ov::ICore>& core);
+    static std::shared_ptr<Bank> deserialize(std::istream& stream,
+                                             const std::shared_ptr<const ov::ICore>& core,
+                                             const std::string& name);
     // Used during deserialization
     void read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device);
 
     mutable std::mutex m_mutex;
     std::shared_ptr<const ov::ICore> m_core = nullptr;
     std::string m_alloc_device;
     int64_t uid_count = 0;
+    std::string m_bank_name;
 };
 
 std::shared_ptr<Bank> bank(const std::string& bank_name,
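Note (illustration, not part of the commit): the header change is just the plumbing for the above: the constructor now captures the name in m_bank_name and get_name() exposes it so export_model() can persist it. A stripped-down sketch of that shape; BankLike is a made-up name, and the real Bank also takes a std::shared_ptr<const ov::ICore> as its first argument.

#include <cassert>
#include <string>

// Made-up class mimicking the updated Bank constructor/accessor shape.
class BankLike {
public:
    BankLike(const std::string& alloc_device, const std::string& bank_name)
        : m_alloc_device(alloc_device),
          m_bank_name(bank_name) {}

    // What the export side uses to write the name into the blob.
    std::string get_name() const { return m_bank_name; }

private:
    std::string m_alloc_device;
    std::string m_bank_name;
};

int main() {
    BankLike bank("NPU", "npuw_bank");
    assert(bank.get_name() == "npuw_bank");
    return 0;
}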
