Skip to content

Commit

Permalink
Address review comments part 2
Browse files Browse the repository at this point in the history
  • Loading branch information
smirnov-alexey committed Jan 10, 2025
1 parent 5d094c9 commit 423185a
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 129 deletions.
27 changes: 9 additions & 18 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized)
: ov::npuw::ICompiledModel(model, plugin),
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
Expand Down Expand Up @@ -610,6 +609,9 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
// Write device list
write(stream, m_dev_list);

// Write config
write(stream, m_cfg);

// Serialize compiled submodels
write(stream, m_compiled_submodels.size());
for (const auto& subm : m_compiled_submodels) {
Expand All @@ -635,8 +637,7 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {

std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties) {
const std::shared_ptr<const ov::IPlugin>& plugin) {
LOG_INFO("Deserializing CompiledModel...");
LOG_BLOCK();

Expand All @@ -656,7 +657,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(

auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);

auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, properties, true);
auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, true);

// Deserialize meta
compiled->m_name = model_name;
Expand All @@ -668,13 +669,8 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
// Deserialize device list
read(stream, compiled->m_dev_list);

// Drop NPUW-related properties from the config for submodels import
std::map<std::string, ov::Any> non_npuw_props;
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it->first.find("NPUW_LLM") == it->first.npos && it->first.find("NPUW") == it->first.npos) {
non_npuw_props.insert(*it);
}
}
// Deserialize config
read(stream, compiled->m_cfg);

// Deserialize compiled submodels
std::size_t subm_size = 0;
Expand All @@ -691,14 +687,9 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
// FIXME: workaround for import/export model since import model seems to reset the file pointer
std::string buf;
read(stream, buf);

// FIXME: extra copy
std::stringstream buffer;
buffer.write(&buf[0], buf.size());

// No NPUW properties are present in this config
std::stringstream buffer(buf);
compiled->m_compiled_submodels[i].compiled_model =
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx], non_npuw_props);
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
}
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
compiled->m_compiled_submodels[i].deserialize(stream);
Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class CompiledModel : public ov::npuw::ICompiledModel {
const ov::AnyMap& properties);
CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized);

void export_model(std::ostream& model) const override;
Expand Down Expand Up @@ -73,8 +72,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {

void serialize(std::ostream& stream) const;
static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties);
const std::shared_ptr<const ov::IPlugin>& plugin);

// This is used for removing too long output tensor names to fix some compilation issues
// NB: These two methods has nothing to do with this particular class and should be
Expand Down
54 changes: 30 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m

ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized)
: ov::npuw::ICompiledModel(model, plugin),
m_name(model->get_friendly_name()),
Expand All @@ -540,6 +539,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
write(stream, OPENVINO_VERSION_MAJOR);
write(stream, OPENVINO_VERSION_MINOR);
write(stream, OPENVINO_VERSION_PATCH);
write(stream, std::string(NPUW_SERIALIZATION_VERSION));

// Serialize name
write(stream, m_name);
Expand All @@ -558,10 +558,6 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
m_kvcache_compiled->serialize(stream);
m_prefill_compiled->serialize(stream);

// Serialize configs
write(stream, m_kvcache_compiled->m_cfg);
write(stream, m_prefill_compiled->m_cfg);

// Serialize weights bank (if required)
const auto& kv_bank = m_kvcache_compiled->m_weights_bank;
const auto& p_bank = m_prefill_compiled->m_weights_bank;
Expand All @@ -575,26 +571,45 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {

std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserialize(
std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties) {
const std::shared_ptr<const ov::IPlugin>& plugin) {
LOG_INFO("Deserializing LLMCompiledModel...");
LOG_BLOCK();

using namespace ov::npuw::s11n;

// Sanity check magic number
uint64_t serialization_indicator = 0;
std::array<uint8_t, 6> serialization_indicator;
read(stream, serialization_indicator);
NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!");

// Deserialize general meta info
int vmajor, vminor, vpatch;
std::string s11n_version;
read(stream, vmajor);
read(stream, vminor);
read(stream, vpatch);

NPUW_ASSERT(vmajor == OPENVINO_VERSION_MAJOR && vminor == OPENVINO_VERSION_MINOR &&
vpatch == OPENVINO_VERSION_PATCH && "Only blobs serialized with the same OV version are supported!");
read(stream, s11n_version);

if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
OPENVINO_THROW("This blobs was serialized with different OV version!",
" Serialized by OV ",
vmajor,
'.',
vminor,
'.',
vpatch,
" Current OV version ",
OPENVINO_VERSION_MAJOR,
'.',
OPENVINO_VERSION_MINOR,
'.',
OPENVINO_VERSION_PATCH,
" NPUW serialized by version ",
s11n_version,
" NPUW current serialization version ",
NPUW_SERIALIZATION_VERSION);
}

// Deserialize model name first
std::string model_name;
Expand All @@ -610,24 +625,17 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial

auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);

auto compiled = std::make_shared<ov::npuw::LLMCompiledModel>(ov_model, plugin, properties, true);
auto compiled = std::make_shared<ov::npuw::LLMCompiledModel>(ov_model, plugin, true);

// Deserialize LLMCompiledModel-specific data
read(stream, compiled->m_kvcache_desc.max_prompt_size);
read(stream, compiled->m_kvcache_desc.total_size);
read(stream, compiled->m_kvcache_desc.num_stored_tokens);
read(stream, compiled->m_kvcache_desc.dim);

// Deserialize CompiledModels. Remove NPUW_LLM properties beforehand
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);
compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props);
compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props);

// Deserialize configs
read(stream, compiled->m_kvcache_compiled->m_cfg);
read(stream, compiled->m_prefill_compiled->m_cfg);
// Deserialize CompiledModels
compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);

// Deserialize weights bank (if required)
std::string bank_name;
Expand All @@ -642,8 +650,6 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
compiled->m_kvcache_compiled->reconstruct_closure();
compiled->m_prefill_compiled->reconstruct_closure();

compiled->implement_properties();

LOG_INFO("Done.");
return compiled;
}
Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,12 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
const ov::AnyMap& properties);
LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized);
LLMCompiledModel() = delete;

void export_model(std::ostream& model) const override;
static std::shared_ptr<LLMCompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties);
const std::shared_ptr<const ov::IPlugin>& plugin);

std::shared_ptr<const ov::Model> get_runtime_model() const override;

Expand Down
71 changes: 29 additions & 42 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,48 +137,35 @@ void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) {
var.update(config);
}

void ov::npuw::s11n::read(std::istream& stream, std::vector<std::shared_ptr<ov::op::v0::Parameter>>& var) {
var.clear();
std::size_t params_size = 0;
read(stream, params_size);
for (std::size_t i = 0; i < params_size; ++i) {
std::string elem_type_str;
std::string part_shape_str;
std::unordered_set<std::string> names;
read(stream, elem_type_str);
read(stream, part_shape_str);
read(stream, names);
// NOTE: the code below is taken from NPU plugin's create_dummy_model()
auto param =
std::make_shared<op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
param->set_friendly_name(*names.begin()); // FIXME: any_name ?
param->output(0).get_tensor().set_names(names);
var.push_back(param);
}
// Deserialize a dummy ov::op::v0::Parameter (element type, partial shape and
// tensor names only) previously written by the matching write() overload.
// Only the interface information is reconstructed -- this mirrors the NPU
// plugin's create_dummy_model() approach.
void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var) {
    std::string elem_type_str;
    std::string part_shape_str;
    std::unordered_set<std::string> names;
    read(stream, elem_type_str);
    read(stream, part_shape_str);
    read(stream, names);
    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
    var = std::make_shared<op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
    // Guard against an empty name set: dereferencing names.begin() on an
    // empty container is undefined behavior.
    if (!names.empty()) {
        var->set_friendly_name(*names.begin());  // FIXME: any_name ?
    }
    var->output(0).get_tensor().set_names(names);
}

void ov::npuw::s11n::read(std::istream& stream, std::vector<std::shared_ptr<ov::Node>>& var) {
var.clear();
std::size_t results_size = 0;
read(stream, results_size);
for (std::size_t i = 0; i < results_size; ++i) {
std::string elem_type_str;
std::string part_shape_str;
std::unordered_set<std::string> names;
read(stream, elem_type_str);
read(stream, part_shape_str);
read(stream, names);
// NOTE: the code below is taken from NPU plugin's create_dummy_model()
std::shared_ptr<ov::Node> res =
std::make_shared<ov::op::v0::Constant>(ov::element::Type(elem_type_str), std::vector<size_t>{1});
// FIXME: serialize names as well?
const std::shared_ptr<ov::descriptor::Tensor>& tensor_dummy =
std::make_shared<ov::descriptor::Tensor>(ov::element::Type(elem_type_str),
ov::PartialShape(part_shape_str),
names);
std::shared_ptr<ov::Node> result = std::make_shared<ov::op::v0::Result>(res);
result->output(0).set_tensor_ptr(tensor_dummy);
result->set_friendly_name(*names.begin()); // any_name ?
var.push_back(result);
}
// Deserialize a dummy ov::op::v0::Result node (element type, partial shape and
// tensor names only) previously written by the matching write() overload.
// A 1-element Constant is used as the Result's input placeholder -- this
// mirrors the NPU plugin's create_dummy_model() approach.
void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::Node>& var) {
    std::string elem_type_str;
    std::string part_shape_str;
    std::unordered_set<std::string> names;
    read(stream, elem_type_str);
    read(stream, part_shape_str);
    read(stream, names);
    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
    std::shared_ptr<ov::Node> res =
        std::make_shared<ov::op::v0::Constant>(ov::element::Type(elem_type_str), std::vector<size_t>{1});
    // FIXME: serialize names as well?
    // Hold the dummy tensor by value (the original bound a const-ref to a
    // temporary, which only works via lifetime extension).
    const auto tensor_dummy = std::make_shared<ov::descriptor::Tensor>(ov::element::Type(elem_type_str),
                                                                       ov::PartialShape(part_shape_str),
                                                                       names);
    var = std::make_shared<ov::op::v0::Result>(res);
    var->output(0).set_tensor_ptr(tensor_dummy);
    // Guard against an empty name set: dereferencing names.begin() on an
    // empty container is undefined behavior.
    if (!names.empty()) {
        var->set_friendly_name(*names.begin());  // any_name ?
    }
}
30 changes: 27 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include <array>
#include <iostream>
#include <map>
#include <memory>
Expand All @@ -13,7 +14,10 @@
#include <unordered_set>
#include <vector>

const constexpr uint64_t NPUW_SERIALIZATION_INDICATOR = 0x0123456789abcdef;
// Magic prefix written at the very beginning of every NPUW-serialized blob and
// checked on import: {0x13, 0x37, 'n', 'p', 'u', 'w'}.
// `inline constexpr` (C++17) gives a single definition across all TUs that
// include this header; `const constexpr` was redundant and non-inline.
inline constexpr std::array<uint8_t, 6> NPUW_SERIALIZATION_INDICATOR =
    {uint8_t{0x13}, uint8_t{0x37}, uint8_t{0x6e}, uint8_t{0x70}, uint8_t{0x75}, uint8_t{0x77}};

// Version of the NPUW serialization format itself, independent of the
// OpenVINO version; bump whenever the blob layout changes.
inline constexpr const char* NPUW_SERIALIZATION_VERSION = "0.0";

// Forward declaration
namespace intel_npu {
Expand Down Expand Up @@ -59,18 +63,22 @@ void read(std::istream& stream, bool& var);
void read(std::istream& stream, ov::npuw::compiled::Spatial& var);
void read(std::istream& stream, ov::Tensor& var);
void read(std::istream& stream, ::intel_npu::Config& var);
void read(std::istream& stream, std::vector<std::shared_ptr<ov::op::v0::Parameter>>& var);
void read(std::istream& stream, std::vector<std::shared_ptr<ov::Node>>& var);
void read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var);
void read(std::istream& stream, std::shared_ptr<ov::Node>& var);

// Forward declaration
template <typename T1, typename T2>
void write(std::ostream& stream, const std::pair<T1, T2>& var);
template <typename T>
void write(std::ostream& stream, const std::vector<T>& var);
template <typename T, size_t N>
void write(std::ostream& stream, const std::array<T, N>& var);
template <typename T1, typename T2>
void read(std::istream& stream, std::pair<T1, T2>& var);
template <typename T>
void read(std::istream& stream, std::vector<T>& var);
template <typename T, std::size_t N>
void read(std::istream& stream, std::array<T, N>& var);

// Serialization
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
Expand All @@ -92,6 +100,13 @@ void write(std::ostream& stream, const std::vector<T>& var) {
}
}

// Serialize a fixed-size std::array element-by-element. N is a compile-time
// constant, so (unlike the std::vector overload) no size prefix is written
// to the stream.
template <typename T, size_t N>
void write(std::ostream& stream, const std::array<T, N>& var) {
    for (std::size_t idx = 0u; idx < N; ++idx) {
        write(stream, var[idx]);
    }
}

template <typename T>
void write(std::ostream& stream, const std::unordered_set<T>& var) {
write(stream, var.size());
Expand Down Expand Up @@ -143,6 +158,15 @@ void read(std::istream& stream, std::vector<T>& var) {
}
}

// Deserialize a fixed-size std::array written by the matching write()
// overload. N is known at compile time, so no element count is read from
// the stream.
template <typename T, std::size_t N>
void read(std::istream& stream, std::array<T, N>& var) {
    // Read directly into the destination element: the original constructed a
    // default-initialized temporary and copy-assigned it, which both costs an
    // extra copy per element and requires T to be copy-assignable.
    for (auto& elem : var) {
        read(stream, elem);
    }
}

template <typename T>
void read(std::istream& stream, std::unordered_set<T>& var) {
var.clear();
Expand Down
Loading

0 comments on commit 423185a

Please sign in to comment.