From 423185ad7860f73ecd9444f10ff47f56e811a6d7 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Fri, 10 Jan 2025 15:08:43 +0000 Subject: [PATCH] Address review comments part 2 --- .../src/plugin/npuw/compiled_model.cpp | 27 +++---- .../src/plugin/npuw/compiled_model.hpp | 4 +- .../src/plugin/npuw/llm_compiled_model.cpp | 54 +++++++------- .../src/plugin/npuw/llm_compiled_model.hpp | 4 +- .../src/plugin/npuw/serialization.cpp | 71 ++++++++----------- .../src/plugin/npuw/serialization.hpp | 30 +++++++- .../src/plugin/npuw/weights_bank.cpp | 43 +++++------ .../intel_npu/src/plugin/src/plugin.cpp | 18 +---- 8 files changed, 122 insertions(+), 129 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index a565b332925c23..7ef8205095c69f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -489,7 +489,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized) : ov::npuw::ICompiledModel(model, plugin), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), @@ -610,6 +609,9 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const { // Write device list write(stream, m_dev_list); + // Write config + write(stream, m_cfg); + // Serialize compiled submodels write(stream, m_compiled_submodels.size()); for (const auto& subm : m_compiled_submodels) { @@ -635,8 +637,7 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const { std::shared_ptr ov::npuw::CompiledModel::deserialize( std::istream& stream, - const std::shared_ptr& plugin, - const ov::AnyMap& properties) { + const std::shared_ptr& plugin) { LOG_INFO("Deserializing CompiledModel..."); LOG_BLOCK(); @@ -656,7 +657,7 @@ std::shared_ptr 
ov::npuw::CompiledModel::deserialize( auto ov_model = std::make_shared(results, parameters, model_name); - auto compiled = std::make_shared(ov_model, plugin, properties, true); + auto compiled = std::make_shared(ov_model, plugin, true); // Deserialize meta compiled->m_name = model_name; @@ -668,13 +669,8 @@ std::shared_ptr ov::npuw::CompiledModel::deserialize( // Deserialize device list read(stream, compiled->m_dev_list); - // Drop NPUW-related properties from the config for submodels import - std::map non_npuw_props; - for (auto it = properties.begin(); it != properties.end(); ++it) { - if (it->first.find("NPUW_LLM") == it->first.npos && it->first.find("NPUW") == it->first.npos) { - non_npuw_props.insert(*it); - } - } + // Deserialize config + read(stream, compiled->m_cfg); // Deserialize compiled submodels std::size_t subm_size = 0; @@ -691,14 +687,9 @@ std::shared_ptr ov::npuw::CompiledModel::deserialize( // FIXME: workaround for import/export model since import model seems to reset the file pointer std::string buf; read(stream, buf); - - // FIXME: extra copy - std::stringstream buffer; - buffer.write(&buf[0], buf.size()); - - // No NPUW properties are present in this config + std::stringstream buffer(buf); compiled->m_compiled_submodels[i].compiled_model = - plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx], non_npuw_props); + plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]); } compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx; compiled->m_compiled_submodels[i].deserialize(stream); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index c610ec7da03cc4..b4faf9d417b003 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -42,7 +42,6 @@ class CompiledModel : public ov::npuw::ICompiledModel { const ov::AnyMap& 
properties); CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized); void export_model(std::ostream& model) const override; @@ -73,8 +72,7 @@ class CompiledModel : public ov::npuw::ICompiledModel { void serialize(std::ostream& stream) const; static std::shared_ptr deserialize(std::istream& stream, - const std::shared_ptr& plugin, - const ov::AnyMap& properties); + const std::shared_ptr& plugin); // This is used for removing too long output tensor names to fix some compilation issues // NB: These two methods has nothing to do with this particular class and should be diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 22519e207a4144..c004b349ddf3c2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -517,7 +517,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized) : ov::npuw::ICompiledModel(model, plugin), m_name(model->get_friendly_name()), @@ -540,6 +539,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { write(stream, OPENVINO_VERSION_MAJOR); write(stream, OPENVINO_VERSION_MINOR); write(stream, OPENVINO_VERSION_PATCH); + write(stream, std::string(NPUW_SERIALIZATION_VERSION)); // Serialize name write(stream, m_name); @@ -558,10 +558,6 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { m_kvcache_compiled->serialize(stream); m_prefill_compiled->serialize(stream); - // Serialize configs - write(stream, m_kvcache_compiled->m_cfg); - write(stream, m_prefill_compiled->m_cfg); - // Serialize weights bank (if required) const auto& kv_bank = m_kvcache_compiled->m_weights_bank; const 
auto& p_bank = m_prefill_compiled->m_weights_bank;
@@ -575,26 +571,45 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
 
 std::shared_ptr ov::npuw::LLMCompiledModel::deserialize(
     std::istream& stream,
-    const std::shared_ptr& plugin,
-    const ov::AnyMap& properties) {
+    const std::shared_ptr& plugin) {
     LOG_INFO("Deserializing LLMCompiledModel...");
     LOG_BLOCK();
 
     using namespace ov::npuw::s11n;
 
     // Sanity check magic number
-    uint64_t serialization_indicator = 0;
+    std::array serialization_indicator;
     read(stream, serialization_indicator);
     NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!");
 
     // Deserialize general meta info
     int vmajor, vminor, vpatch;
+    std::string s11n_version;
     read(stream, vmajor);
     read(stream, vminor);
     read(stream, vpatch);
-
-    NPUW_ASSERT(vmajor == OPENVINO_VERSION_MAJOR && vminor == OPENVINO_VERSION_MINOR &&
-                vpatch == OPENVINO_VERSION_PATCH && "Only blobs serialized with the same OV version are supported!");
+    read(stream, s11n_version);
+
+    if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
+        s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
+        OPENVINO_THROW("This blob was serialized with a different OV version!",
+                       " Serialized by OV ",
+                       vmajor,
+                       '.',
+                       vminor,
+                       '.',
+                       vpatch,
+                       " Current OV version ",
+                       OPENVINO_VERSION_MAJOR,
+                       '.',
+                       OPENVINO_VERSION_MINOR,
+                       '.',
+                       OPENVINO_VERSION_PATCH,
+                       " NPUW serialized by version ",
+                       s11n_version,
+                       " NPUW current serialization version ",
+                       NPUW_SERIALIZATION_VERSION);
+    }
 
     // Deserialize model name first
     std::string model_name;
@@ -610,7 +625,7 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial
     auto ov_model = std::make_shared(results, parameters, model_name);
 
-    auto compiled = std::make_shared(ov_model, plugin, properties, true);
+    auto compiled = std::make_shared(ov_model, plugin, true);
 
     // Deserialize LLMCompiledModel-specific data
read(stream, compiled->m_kvcache_desc.max_prompt_size); @@ -618,16 +633,9 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial read(stream, compiled->m_kvcache_desc.num_stored_tokens); read(stream, compiled->m_kvcache_desc.dim); - // Deserialize CompiledModels. Remove NPUW_LLM properties beforehand - std::map npuw_llm_props; - std::map other_props; - split_llm_properties(properties, npuw_llm_props, other_props); - compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props); - compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props); - - // Deserialize configs - read(stream, compiled->m_kvcache_compiled->m_cfg); - read(stream, compiled->m_prefill_compiled->m_cfg); + // Deserialize CompiledModels + compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); + compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); // Deserialize weights bank (if required) std::string bank_name; @@ -642,8 +650,6 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial compiled->m_kvcache_compiled->reconstruct_closure(); compiled->m_prefill_compiled->reconstruct_closure(); - compiled->implement_properties(); - LOG_INFO("Done."); return compiled; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp index 1394277e4fce9b..5003ccce40bb9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -30,14 +30,12 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { const ov::AnyMap& properties); LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const ov::AnyMap& properties, const bool serialized); LLMCompiledModel() = delete; void export_model(std::ostream& model) const override; static std::shared_ptr deserialize(std::istream& stream, - const 
std::shared_ptr& plugin, - const ov::AnyMap& properties); + const std::shared_ptr& plugin); std::shared_ptr get_runtime_model() const override; diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp index fbaad987262ea4..5dcdc4be1a9cbc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp @@ -137,48 +137,35 @@ void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) { var.update(config); } -void ov::npuw::s11n::read(std::istream& stream, std::vector>& var) { - var.clear(); - std::size_t params_size = 0; - read(stream, params_size); - for (std::size_t i = 0; i < params_size; ++i) { - std::string elem_type_str; - std::string part_shape_str; - std::unordered_set names; - read(stream, elem_type_str); - read(stream, part_shape_str); - read(stream, names); - // NOTE: the code below is taken from NPU plugin's create_dummy_model() - auto param = - std::make_shared(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str)); - param->set_friendly_name(*names.begin()); // FIXME: any_name ? - param->output(0).get_tensor().set_names(names); - var.push_back(param); - } +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + var = std::make_shared(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str)); + var->set_friendly_name(*names.begin()); // FIXME: any_name ? 
+ var->output(0).get_tensor().set_names(names); } -void ov::npuw::s11n::read(std::istream& stream, std::vector>& var) { - var.clear(); - std::size_t results_size = 0; - read(stream, results_size); - for (std::size_t i = 0; i < results_size; ++i) { - std::string elem_type_str; - std::string part_shape_str; - std::unordered_set names; - read(stream, elem_type_str); - read(stream, part_shape_str); - read(stream, names); - // NOTE: the code below is taken from NPU plugin's create_dummy_model() - std::shared_ptr res = - std::make_shared(ov::element::Type(elem_type_str), std::vector{1}); - // FIXME: serialize names as well? - const std::shared_ptr& tensor_dummy = - std::make_shared(ov::element::Type(elem_type_str), - ov::PartialShape(part_shape_str), - names); - std::shared_ptr result = std::make_shared(res); - result->output(0).set_tensor_ptr(tensor_dummy); - result->set_friendly_name(*names.begin()); // any_name ? - var.push_back(result); - } +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + std::shared_ptr res = + std::make_shared(ov::element::Type(elem_type_str), std::vector{1}); + // FIXME: serialize names as well? + const std::shared_ptr& tensor_dummy = + std::make_shared(ov::element::Type(elem_type_str), + ov::PartialShape(part_shape_str), + names); + var = std::make_shared(res); + var->output(0).set_tensor_ptr(tensor_dummy); + var->set_friendly_name(*names.begin()); // any_name ? 
} diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 8afe83f70ff6a7..77a6b3aa865254 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -13,7 +14,10 @@ #include #include -const constexpr uint64_t NPUW_SERIALIZATION_INDICATOR = 0x0123456789abcdef; +const constexpr std::array NPUW_SERIALIZATION_INDICATOR = + {char{0x13}, char{0x37}, char{0x6e}, char{0x70}, char{0x75}, char{0x77}}; + +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.0"; // Forward declaration namespace intel_npu { @@ -59,18 +63,22 @@ void read(std::istream& stream, bool& var); void read(std::istream& stream, ov::npuw::compiled::Spatial& var); void read(std::istream& stream, ov::Tensor& var); void read(std::istream& stream, ::intel_npu::Config& var); -void read(std::istream& stream, std::vector>& var); -void read(std::istream& stream, std::vector>& var); +void read(std::istream& stream, std::shared_ptr& var); +void read(std::istream& stream, std::shared_ptr& var); // Forward declaration template void write(std::ostream& stream, const std::pair& var); template void write(std::ostream& stream, const std::vector& var); +template +void write(std::ostream& stream, const std::array& var); template void read(std::istream& stream, std::pair& var); template void read(std::istream& stream, std::vector& var); +template +void read(std::istream& stream, std::array& var); // Serialization template ::value, bool> = true> @@ -92,6 +100,13 @@ void write(std::ostream& stream, const std::vector& var) { } } +template +void write(std::ostream& stream, const std::array& var) { + for (const auto& el : var) { + write(stream, el); + } +} + template void write(std::ostream& stream, const std::unordered_set& var) { write(stream, var.size()); @@ -143,6 +158,15 @@ void read(std::istream& stream, 
std::vector& var) { } } +template +void read(std::istream& stream, std::array& var) { + for (std::size_t i = 0; i < N; ++i) { + T elem; + read(stream, elem); + var[i] = elem; + } +} + template void read(std::istream& stream, std::unordered_set& var) { var.clear(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 8ed87c4c0b3af7..5a0735ebdc24d5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -165,19 +165,18 @@ void Bank::serialize(std::ostream& stream) const { std::lock_guard guard(m_mutex); - // For now only a singular device is supported - // Sanity check - NPUW_ASSERT(m_device_banks.size() == 1 && "Bank containing several devices can't be serialized"); - auto it_cpu = m_device_banks.find("CPU"); - auto it = it_cpu == m_device_banks.end() ? m_device_banks.find("NPU") : it_cpu; - - const auto& device_bank = it->second; - std::lock_guard dev_guard(device_bank.mutex); - write(stream, it->first); - write(stream, device_bank.storage.size()); - for (const auto& t_pair : device_bank.storage) { - write(stream, t_pair.first); - write(stream, t_pair.second.tensor); + write(stream, m_device_banks.size()); + + for (const auto& elem : m_device_banks) { + const auto& device = elem.first; + const auto& device_bank = elem.second; + std::lock_guard dev_guard(device_bank.mutex); + write(stream, device); + write(stream, device_bank.storage.size()); + for (const auto& t_pair : device_bank.storage) { + write(stream, t_pair.first); + write(stream, t_pair.second.tensor); + } } LOG_INFO("DONE."); @@ -191,19 +190,21 @@ std::shared_ptr Bank::deserialize(std::istream& stream, LOG_INFO("Deserializing weights bank..."); LOG_BLOCK(); - // For now only a singular device is supported - std::string device; - read(stream, device); + auto bank = ov::npuw::weights::bank(name, core, ""); - // Note: bank is assumed to be shared - auto bank 
= ov::npuw::weights::bank(name, core, device);
 
     std::size_t bank_size = 0;
     read(stream, bank_size);
     for (std::size_t i = 0; i < bank_size; ++i) {
-        int64_t uid = -1;
-        read(stream, uid);
-        bank->read_and_add_tensor(stream, uid, device);
+        std::string device;
+        read(stream, device);
+        std::size_t storage_size = 0;
+        read(stream, storage_size);
+        for (std::size_t j = 0; j < storage_size; ++j) {
+            int64_t uid = -1;
+            read(stream, uid);
+            bank->read_and_add_tensor(stream, uid, device);
+        }
     }
 
     LOG_INFO("DONE.");
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index 9640f896d3e03e..57eaf8b8306d01 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -756,24 +756,12 @@ std::shared_ptr Plugin::import_model(std::istream& stream, c
     OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model");
     OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs");
 
-    // If NPUW is active - import via NPUW
-    auto useNpuwKey = ov::intel_npu::use_npuw.name();
-    ov::AnyMap localProperties = properties;
-    if (localProperties.count(useNpuwKey) && localProperties.at(useNpuwKey).as() == true) {
-        auto llm_enabled = ov::intel_npu::npuw::llm::enabled.name();
-        // Only dynamic stateful models are supported for now supported
-        if (!localProperties.count(llm_enabled) || localProperties.at(llm_enabled).as() == false) {
-            OPENVINO_THROW("Cannot import non-dynamic NPUW model!");
-        }
-        return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this(), localProperties);
-    }
-
     // If was exported via NPUW
-    uint64_t serialization_indicator;
-    stream.read(reinterpret_cast(&serialization_indicator), sizeof serialization_indicator);
+    std::array serialization_indicator;
+    ov::npuw::s11n::read(stream, serialization_indicator);
    if (serialization_indicator == NPUW_SERIALIZATION_INDICATOR) {
        stream.seekg(0);
-        return
ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this(), localProperties); + return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this()); } stream.seekg(0);