From 423185ad7860f73ecd9444f10ff47f56e811a6d7 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Fri, 10 Jan 2025 15:08:43 +0000 Subject: [PATCH] Address review comments part 2 --- .../src/plugin/npuw/compiled_model.cpp | 27 +++---- .../src/plugin/npuw/compiled_model.hpp | 4 +- .../src/plugin/npuw/llm_compiled_model.cpp | 54 +++++++------- .../src/plugin/npuw/llm_compiled_model.hpp | 4 +- .../src/plugin/npuw/serialization.cpp | 71 ++++++++----------- .../src/plugin/npuw/serialization.hpp | 30 +++++++- .../src/plugin/npuw/weights_bank.cpp | 43 +++++------ .../intel_npu/src/plugin/src/plugin.cpp | 18 +---- 8 files changed, 122 insertions(+), 129 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index a565b332925c23..7ef8205095c69f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -489,7 +489,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized) : ov::npuw::ICompiledModel(model, plugin), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), @@ -610,6 +609,9 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const { // Write device list write(stream, m_dev_list); + // Write config + write(stream, m_cfg); + // Serialize compiled submodels write(stream, m_compiled_submodels.size()); for (const auto& subm : m_compiled_submodels) { @@ -635,8 +637,7 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const { std::shared_ptr ov::npuw::CompiledModel::deserialize( std::istream& stream, - const std::shared_ptr& plugin, - const ov::AnyMap& properties) { + const std::shared_ptr& plugin) { LOG_INFO("Deserializing CompiledModel..."); LOG_BLOCK(); @@ -656,7 +657,7 @@ std::shared_ptr 
ov::npuw::CompiledModel::deserialize( auto ov_model = std::make_shared(results, parameters, model_name); - auto compiled = std::make_shared(ov_model, plugin, properties, true); + auto compiled = std::make_shared(ov_model, plugin, true); // Deserialize meta compiled->m_name = model_name; @@ -668,13 +669,8 @@ std::shared_ptr ov::npuw::CompiledModel::deserialize( // Deserialize device list read(stream, compiled->m_dev_list); - // Drop NPUW-related properties from the config for submodels import - std::map non_npuw_props; - for (auto it = properties.begin(); it != properties.end(); ++it) { - if (it->first.find("NPUW_LLM") == it->first.npos && it->first.find("NPUW") == it->first.npos) { - non_npuw_props.insert(*it); - } - } + // Deserialize config + read(stream, compiled->m_cfg); // Deserialize compiled submodels std::size_t subm_size = 0; @@ -691,14 +687,9 @@ std::shared_ptr ov::npuw::CompiledModel::deserialize( // FIXME: workaround for import/export model since import model seems to reset the file pointer std::string buf; read(stream, buf); - - // FIXME: extra copy - std::stringstream buffer; - buffer.write(&buf[0], buf.size()); - - // No NPUW properties are present in this config + std::stringstream buffer(buf); compiled->m_compiled_submodels[i].compiled_model = - plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx], non_npuw_props); + plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]); } compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx; compiled->m_compiled_submodels[i].deserialize(stream); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index c610ec7da03cc4..b4faf9d417b003 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -42,7 +42,6 @@ class CompiledModel : public ov::npuw::ICompiledModel { const ov::AnyMap& 
properties); CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized); void export_model(std::ostream& model) const override; @@ -73,8 +72,7 @@ class CompiledModel : public ov::npuw::ICompiledModel { void serialize(std::ostream& stream) const; static std::shared_ptr deserialize(std::istream& stream, - const std::shared_ptr& plugin, - const ov::AnyMap& properties); + const std::shared_ptr& plugin); // This is used for removing too long output tensor names to fix some compilation issues // NB: These two methods has nothing to do with this particular class and should be diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 22519e207a4144..c004b349ddf3c2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -517,7 +517,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized) : ov::npuw::ICompiledModel(model, plugin), m_name(model->get_friendly_name()), @@ -540,6 +539,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { write(stream, OPENVINO_VERSION_MAJOR); write(stream, OPENVINO_VERSION_MINOR); write(stream, OPENVINO_VERSION_PATCH); + write(stream, std::string(NPUW_SERIALIZATION_VERSION)); // Serialize name write(stream, m_name); @@ -558,10 +558,6 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { m_kvcache_compiled->serialize(stream); m_prefill_compiled->serialize(stream); - // Serialize configs - write(stream, m_kvcache_compiled->m_cfg); - write(stream, m_prefill_compiled->m_cfg); - // Serialize weights bank (if required) const auto& kv_bank = m_kvcache_compiled->m_weights_bank; const 
auto& p_bank = m_prefill_compiled->m_weights_bank;
@@ -575,26 +571,45 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
 
 std::shared_ptr ov::npuw::LLMCompiledModel::deserialize(
     std::istream& stream,
-    const std::shared_ptr& plugin,
-    const ov::AnyMap& properties) {
+    const std::shared_ptr& plugin) {
     LOG_INFO("Deserializing LLMCompiledModel...");
     LOG_BLOCK();
 
     using namespace ov::npuw::s11n;
 
     // Sanity check magic number
-    uint64_t serialization_indicator = 0;
+    std::array serialization_indicator;
     read(stream, serialization_indicator);
     NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!");
 
     // Deserialize general meta info
     int vmajor, vminor, vpatch;
+    std::string s11n_version;
     read(stream, vmajor);
     read(stream, vminor);
     read(stream, vpatch);
-
-    NPUW_ASSERT(vmajor == OPENVINO_VERSION_MAJOR && vminor == OPENVINO_VERSION_MINOR &&
-                vpatch == OPENVINO_VERSION_PATCH && "Only blobs serialized with the same OV version are supported!");
+    read(stream, s11n_version);
+
+    if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
+        s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
+        OPENVINO_THROW("This blob was serialized with a different OV version!",
+                       " Serialized by OV ",
+                       vmajor,
+                       '.',
+                       vminor,
+                       '.',
+                       vpatch,
+                       " Current OV version ",
+                       OPENVINO_VERSION_MAJOR,
+                       '.',
+                       OPENVINO_VERSION_MINOR,
+                       '.',
+                       OPENVINO_VERSION_PATCH,
+                       " NPUW serialized by version ",
+                       s11n_version,
+                       " NPUW current serialization version ",
+                       NPUW_SERIALIZATION_VERSION);
+    }
 
     // Deserialize model name first
     std::string model_name;
@@ -610,7 +625,7 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial
     auto ov_model = std::make_shared(results, parameters, model_name);
 
-    auto compiled = std::make_shared(ov_model, plugin, properties, true);
+    auto compiled = std::make_shared(ov_model, plugin, true);
 
     // Deserialize LLMCompiledModel-specific data
read(stream, compiled->m_kvcache_desc.max_prompt_size); @@ -618,16 +633,9 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial read(stream, compiled->m_kvcache_desc.num_stored_tokens); read(stream, compiled->m_kvcache_desc.dim); - // Deserialize CompiledModels. Remove NPUW_LLM properties beforehand - std::map npuw_llm_props; - std::map other_props; - split_llm_properties(properties, npuw_llm_props, other_props); - compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props); - compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props); - - // Deserialize configs - read(stream, compiled->m_kvcache_compiled->m_cfg); - read(stream, compiled->m_prefill_compiled->m_cfg); + // Deserialize CompiledModels + compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); + compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); // Deserialize weights bank (if required) std::string bank_name; @@ -642,8 +650,6 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial compiled->m_kvcache_compiled->reconstruct_closure(); compiled->m_prefill_compiled->reconstruct_closure(); - compiled->implement_properties(); - LOG_INFO("Done."); return compiled; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp index 1394277e4fce9b..5003ccce40bb9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -30,14 +30,12 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { const ov::AnyMap& properties); LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized); LLMCompiledModel() = delete; void export_model(std::ostream& model) const override; static std::shared_ptr deserialize(std::istream& stream, - const 
std::shared_ptr& plugin, - const ov::AnyMap& properties); + const std::shared_ptr& plugin); std::shared_ptr get_runtime_model() const override; diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp index fbaad987262ea4..5dcdc4be1a9cbc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp @@ -137,48 +137,35 @@ void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) { var.update(config); } -void ov::npuw::s11n::read(std::istream& stream, std::vector>& var) { - var.clear(); - std::size_t params_size = 0; - read(stream, params_size); - for (std::size_t i = 0; i < params_size; ++i) { - std::string elem_type_str; - std::string part_shape_str; - std::unordered_set names; - read(stream, elem_type_str); - read(stream, part_shape_str); - read(stream, names); - // NOTE: the code below is taken from NPU plugin's create_dummy_model() - auto param = - std::make_shared(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str)); - param->set_friendly_name(*names.begin()); // FIXME: any_name ? - param->output(0).get_tensor().set_names(names); - var.push_back(param); - } +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + var = std::make_shared(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str)); + var->set_friendly_name(*names.begin()); // FIXME: any_name ? 
+ var->output(0).get_tensor().set_names(names); } -void ov::npuw::s11n::read(std::istream& stream, std::vector>& var) { - var.clear(); - std::size_t results_size = 0; - read(stream, results_size); - for (std::size_t i = 0; i < results_size; ++i) { - std::string elem_type_str; - std::string part_shape_str; - std::unordered_set names; - read(stream, elem_type_str); - read(stream, part_shape_str); - read(stream, names); - // NOTE: the code below is taken from NPU plugin's create_dummy_model() - std::shared_ptr res = - std::make_shared(ov::element::Type(elem_type_str), std::vector{1}); - // FIXME: serialize names as well? - const std::shared_ptr& tensor_dummy = - std::make_shared(ov::element::Type(elem_type_str), - ov::PartialShape(part_shape_str), - names); - std::shared_ptr result = std::make_shared(res); - result->output(0).set_tensor_ptr(tensor_dummy); - result->set_friendly_name(*names.begin()); // any_name ? - var.push_back(result); - } +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + std::shared_ptr res = + std::make_shared(ov::element::Type(elem_type_str), std::vector{1}); + // FIXME: serialize names as well? + const std::shared_ptr& tensor_dummy = + std::make_shared(ov::element::Type(elem_type_str), + ov::PartialShape(part_shape_str), + names); + var = std::make_shared(res); + var->output(0).set_tensor_ptr(tensor_dummy); + var->set_friendly_name(*names.begin()); // any_name ? 
} diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 8afe83f70ff6a7..77a6b3aa865254 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -13,7 +14,10 @@ #include #include -const constexpr uint64_t NPUW_SERIALIZATION_INDICATOR = 0x0123456789abcdef; +const constexpr std::array NPUW_SERIALIZATION_INDICATOR = + {char{0x13}, char{0x37}, char{0x6e}, char{0x70}, char{0x75}, char{0x77}}; + +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.0"; // Forward declaration namespace intel_npu { @@ -59,18 +63,22 @@ void read(std::istream& stream, bool& var); void read(std::istream& stream, ov::npuw::compiled::Spatial& var); void read(std::istream& stream, ov::Tensor& var); void read(std::istream& stream, ::intel_npu::Config& var); -void read(std::istream& stream, std::vector>& var); -void read(std::istream& stream, std::vector>& var); +void read(std::istream& stream, std::shared_ptr& var); +void read(std::istream& stream, std::shared_ptr& var); // Forward declaration template void write(std::ostream& stream, const std::pair& var); template void write(std::ostream& stream, const std::vector& var); +template +void write(std::ostream& stream, const std::array& var); template void read(std::istream& stream, std::pair& var); template void read(std::istream& stream, std::vector& var); +template +void read(std::istream& stream, std::array& var); // Serialization template ::value, bool> = true> @@ -92,6 +100,13 @@ void write(std::ostream& stream, const std::vector& var) { } } +template +void write(std::ostream& stream, const std::array& var) { + for (const auto& el : var) { + write(stream, el); + } +} + template void write(std::ostream& stream, const std::unordered_set& var) { write(stream, var.size()); @@ -143,6 +158,15 @@ void read(std::istream& stream, 
std::vector& var) { } } +template +void read(std::istream& stream, std::array& var) { + for (std::size_t i = 0; i < N; ++i) { + T elem; + read(stream, elem); + var[i] = elem; + } +} + template void read(std::istream& stream, std::unordered_set& var) { var.clear(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 8ed87c4c0b3af7..5a0735ebdc24d5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -165,19 +165,18 @@ void Bank::serialize(std::ostream& stream) const { std::lock_guard guard(m_mutex); - // For now only a singular device is supported - // Sanity check - NPUW_ASSERT(m_device_banks.size() == 1 && "Bank containing several devices can't be serialized"); - auto it_cpu = m_device_banks.find("CPU"); - auto it = it_cpu == m_device_banks.end() ? m_device_banks.find("NPU") : it_cpu; - - const auto& device_bank = it->second; - std::lock_guard dev_guard(device_bank.mutex); - write(stream, it->first); - write(stream, device_bank.storage.size()); - for (const auto& t_pair : device_bank.storage) { - write(stream, t_pair.first); - write(stream, t_pair.second.tensor); + write(stream, m_device_banks.size()); + + for (const auto& elem : m_device_banks) { + const auto& device = elem.first; + const auto& device_bank = elem.second; + std::lock_guard dev_guard(device_bank.mutex); + write(stream, device); + write(stream, device_bank.storage.size()); + for (const auto& t_pair : device_bank.storage) { + write(stream, t_pair.first); + write(stream, t_pair.second.tensor); + } } LOG_INFO("DONE."); @@ -191,19 +190,21 @@ std::shared_ptr Bank::deserialize(std::istream& stream, LOG_INFO("Deserializing weights bank..."); LOG_BLOCK(); - // For now only a singular device is supported - std::string device; - read(stream, device); + auto bank = ov::npuw::weights::bank(name, core, ""); - // Note: bank is assumed to be shared - auto bank 
= ov::npuw::weights::bank(name, core, device);
 
     std::size_t bank_size = 0;
     read(stream, bank_size);
     for (std::size_t i = 0; i < bank_size; ++i) {
-        int64_t uid = -1;
-        read(stream, uid);
-        bank->read_and_add_tensor(stream, uid, device);
+        std::string device;
+        read(stream, device);
+        std::size_t storage_size = 0;
+        read(stream, storage_size);
+        for (std::size_t j = 0; j < storage_size; ++j) {
+            int64_t uid = -1;
+            read(stream, uid);
+            bank->read_and_add_tensor(stream, uid, device);
+        }
     }
 
     LOG_INFO("DONE.");
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index 9640f896d3e03e..57eaf8b8306d01 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -756,24 +756,12 @@ std::shared_ptr Plugin::import_model(std::istream& stream, c
     OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model");
     OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs");
 
-    // If NPUW is active - import via NPUW
-    auto useNpuwKey = ov::intel_npu::use_npuw.name();
-    ov::AnyMap localProperties = properties;
-    if (localProperties.count(useNpuwKey) && localProperties.at(useNpuwKey).as() == true) {
-        auto llm_enabled = ov::intel_npu::npuw::llm::enabled.name();
-        // Only dynamic stateful models are supported for now supported
-        if (!localProperties.count(llm_enabled) || localProperties.at(llm_enabled).as() == false) {
-            OPENVINO_THROW("Cannot import non-dynamic NPUW model!");
-        }
-        return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this(), localProperties);
-    }
-
     // If was exported via NPUW
-    uint64_t serialization_indicator;
-    stream.read(reinterpret_cast(&serialization_indicator), sizeof serialization_indicator);
+    std::array serialization_indicator;
+    ov::npuw::s11n::read(stream, serialization_indicator);
    if (serialization_indicator == NPUW_SERIALIZATION_INDICATOR) {
        stream.seekg(0);
-        return
ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this(), localProperties); + return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this()); } stream.seekg(0);