Skip to content

Commit

Permalink
Address review comments part 2
Browse files Browse the repository at this point in the history
  • Loading branch information
smirnov-alexey committed Jan 10, 2025
1 parent 5d094c9 commit 423185a
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 129 deletions.
27 changes: 9 additions & 18 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized)
: ov::npuw::ICompiledModel(model, plugin),
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
Expand Down Expand Up @@ -610,6 +609,9 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
// Write device list
write(stream, m_dev_list);

// Write config
write(stream, m_cfg);

// Serialize compiled submodels
write(stream, m_compiled_submodels.size());
for (const auto& subm : m_compiled_submodels) {
Expand All @@ -635,8 +637,7 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {

std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties) {
const std::shared_ptr<const ov::IPlugin>& plugin) {
LOG_INFO("Deserializing CompiledModel...");
LOG_BLOCK();

Expand All @@ -656,7 +657,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(

auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);

auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, properties, true);
auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, true);

// Deserialize meta
compiled->m_name = model_name;
Expand All @@ -668,13 +669,8 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
// Deserialize device list
read(stream, compiled->m_dev_list);

// Drop NPUW-related properties from the config for submodels import
std::map<std::string, ov::Any> non_npuw_props;
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it->first.find("NPUW_LLM") == it->first.npos && it->first.find("NPUW") == it->first.npos) {
non_npuw_props.insert(*it);
}
}
// Deserialize config
read(stream, compiled->m_cfg);

// Deserialize compiled submodels
std::size_t subm_size = 0;
Expand All @@ -691,14 +687,9 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
// FIXME: workaround for import/export model since import model seems to reset the file pointer
std::string buf;
read(stream, buf);

// FIXME: extra copy
std::stringstream buffer;
buffer.write(&buf[0], buf.size());

// No NPUW properties are present in this config
std::stringstream buffer(buf);
compiled->m_compiled_submodels[i].compiled_model =
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx], non_npuw_props);
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
}
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
compiled->m_compiled_submodels[i].deserialize(stream);
Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class CompiledModel : public ov::npuw::ICompiledModel {
const ov::AnyMap& properties);
CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized);

void export_model(std::ostream& model) const override;
Expand Down Expand Up @@ -73,8 +72,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {

void serialize(std::ostream& stream) const;
static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties);
const std::shared_ptr<const ov::IPlugin>& plugin);

// This is used for removing too long output tensor names to fix some compilation issues
// NB: These two methods has nothing to do with this particular class and should be
Expand Down
54 changes: 30 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m

ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized)
: ov::npuw::ICompiledModel(model, plugin),
m_name(model->get_friendly_name()),
Expand All @@ -540,6 +539,7 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
write(stream, OPENVINO_VERSION_MAJOR);
write(stream, OPENVINO_VERSION_MINOR);
write(stream, OPENVINO_VERSION_PATCH);
write(stream, std::string(NPUW_SERIALIZATION_VERSION));

// Serialize name
write(stream, m_name);
Expand All @@ -558,10 +558,6 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
m_kvcache_compiled->serialize(stream);
m_prefill_compiled->serialize(stream);

// Serialize configs
write(stream, m_kvcache_compiled->m_cfg);
write(stream, m_prefill_compiled->m_cfg);

// Serialize weights bank (if required)
const auto& kv_bank = m_kvcache_compiled->m_weights_bank;
const auto& p_bank = m_prefill_compiled->m_weights_bank;
Expand All @@ -575,26 +571,45 @@ void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {

std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserialize(
std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties) {
const std::shared_ptr<const ov::IPlugin>& plugin) {
LOG_INFO("Deserializing LLMCompiledModel...");
LOG_BLOCK();

using namespace ov::npuw::s11n;

// Sanity check magic number
uint64_t serialization_indicator = 0;
std::array<uint8_t, 6> serialization_indicator;
read(stream, serialization_indicator);
NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!");

// Deserialize general meta info
int vmajor, vminor, vpatch;
std::string s11n_version;
read(stream, vmajor);
read(stream, vminor);
read(stream, vpatch);

NPUW_ASSERT(vmajor == OPENVINO_VERSION_MAJOR && vminor == OPENVINO_VERSION_MINOR &&
vpatch == OPENVINO_VERSION_PATCH && "Only blobs serialized with the same OV version are supported!");
read(stream, s11n_version);

if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
OPENVINO_THROW("This blobs was serialized with different OV version!",
" Serialized by OV ",
vmajor,
'.',
vminor,
'.',
vpatch,
" Current OV version ",
OPENVINO_VERSION_MAJOR,
'.',
OPENVINO_VERSION_MINOR,
'.',
OPENVINO_VERSION_PATCH,
" NPUW serialized by version ",
s11n_version,
" NPUW current serialization version ",
NPUW_SERIALIZATION_VERSION);
}

// Deserialize model name first
std::string model_name;
Expand All @@ -610,24 +625,17 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial

auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);

auto compiled = std::make_shared<ov::npuw::LLMCompiledModel>(ov_model, plugin, properties, true);
auto compiled = std::make_shared<ov::npuw::LLMCompiledModel>(ov_model, plugin, true);

// Deserialize LLMCompiledModel-specific data
read(stream, compiled->m_kvcache_desc.max_prompt_size);
read(stream, compiled->m_kvcache_desc.total_size);
read(stream, compiled->m_kvcache_desc.num_stored_tokens);
read(stream, compiled->m_kvcache_desc.dim);

// Deserialize CompiledModels. Remove NPUW_LLM properties beforehand
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);
compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props);
compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin, other_props);

// Deserialize configs
read(stream, compiled->m_kvcache_compiled->m_cfg);
read(stream, compiled->m_prefill_compiled->m_cfg);
// Deserialize CompiledModels
compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);

// Deserialize weights bank (if required)
std::string bank_name;
Expand All @@ -642,8 +650,6 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
compiled->m_kvcache_compiled->reconstruct_closure();
compiled->m_prefill_compiled->reconstruct_closure();

compiled->implement_properties();

LOG_INFO("Done.");
return compiled;
}
Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,12 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
const ov::AnyMap& properties);
LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties,
const bool serialized);
LLMCompiledModel() = delete;

void export_model(std::ostream& model) const override;
static std::shared_ptr<LLMCompiledModel> deserialize(std::istream& stream,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties);
const std::shared_ptr<const ov::IPlugin>& plugin);

std::shared_ptr<const ov::Model> get_runtime_model() const override;

Expand Down
71 changes: 29 additions & 42 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,48 +137,35 @@ void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) {
var.update(config);
}

void ov::npuw::s11n::read(std::istream& stream, std::vector<std::shared_ptr<ov::op::v0::Parameter>>& var) {
var.clear();
std::size_t params_size = 0;
read(stream, params_size);
for (std::size_t i = 0; i < params_size; ++i) {
std::string elem_type_str;
std::string part_shape_str;
std::unordered_set<std::string> names;
read(stream, elem_type_str);
read(stream, part_shape_str);
read(stream, names);
// NOTE: the code below is taken from NPU plugin's create_dummy_model()
auto param =
std::make_shared<op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
param->set_friendly_name(*names.begin()); // FIXME: any_name ?
param->output(0).get_tensor().set_names(names);
var.push_back(param);
}
// Deserialize a dummy ov::op::v0::Parameter (element type, partial shape and
// tensor names only) previously written by the matching write() overload.
// Only the interface information is reconstructed -- this mirrors the NPU
// plugin's create_dummy_model() approach.
void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var) {
    std::string elem_type_str;
    std::string part_shape_str;
    std::unordered_set<std::string> names;
    read(stream, elem_type_str);
    read(stream, part_shape_str);
    read(stream, names);
    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
    var = std::make_shared<op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
    // Guard against an empty name set: dereferencing names.begin() on an
    // empty container is undefined behavior.
    if (!names.empty()) {
        var->set_friendly_name(*names.begin());  // FIXME: any_name ?
    }
    var->output(0).get_tensor().set_names(names);
}

void ov::npuw::s11n::read(std::istream& stream, std::vector<std::shared_ptr<ov::Node>>& var) {
var.clear();
std::size_t results_size = 0;
read(stream, results_size);
for (std::size_t i = 0; i < results_size; ++i) {
std::string elem_type_str;
std::string part_shape_str;
std::unordered_set<std::string> names;
read(stream, elem_type_str);
read(stream, part_shape_str);
read(stream, names);
// NOTE: the code below is taken from NPU plugin's create_dummy_model()
std::shared_ptr<ov::Node> res =
std::make_shared<ov::op::v0::Constant>(ov::element::Type(elem_type_str), std::vector<size_t>{1});
// FIXME: serialize names as well?
const std::shared_ptr<ov::descriptor::Tensor>& tensor_dummy =
std::make_shared<ov::descriptor::Tensor>(ov::element::Type(elem_type_str),
ov::PartialShape(part_shape_str),
names);
std::shared_ptr<ov::Node> result = std::make_shared<ov::op::v0::Result>(res);
result->output(0).set_tensor_ptr(tensor_dummy);
result->set_friendly_name(*names.begin()); // any_name ?
var.push_back(result);
}
// Deserialize a dummy ov::op::v0::Result node (element type, partial shape and
// tensor names only) previously written by the matching write() overload.
// A 1-element Constant is used as the Result's input placeholder -- this
// mirrors the NPU plugin's create_dummy_model() approach.
void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::Node>& var) {
    std::string elem_type_str;
    std::string part_shape_str;
    std::unordered_set<std::string> names;
    read(stream, elem_type_str);
    read(stream, part_shape_str);
    read(stream, names);
    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
    std::shared_ptr<ov::Node> res =
        std::make_shared<ov::op::v0::Constant>(ov::element::Type(elem_type_str), std::vector<size_t>{1});
    // FIXME: serialize names as well?
    // Hold the dummy tensor by value (the original bound a const-ref to a
    // temporary, which only works via lifetime extension).
    const auto tensor_dummy = std::make_shared<ov::descriptor::Tensor>(ov::element::Type(elem_type_str),
                                                                       ov::PartialShape(part_shape_str),
                                                                       names);
    var = std::make_shared<ov::op::v0::Result>(res);
    var->output(0).set_tensor_ptr(tensor_dummy);
    // Guard against an empty name set: dereferencing names.begin() on an
    // empty container is undefined behavior.
    if (!names.empty()) {
        var->set_friendly_name(*names.begin());  // any_name ?
    }
}
30 changes: 27 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include <array>
#include <iostream>
#include <map>
#include <memory>
Expand All @@ -13,7 +14,10 @@
#include <unordered_set>
#include <vector>

const constexpr uint64_t NPUW_SERIALIZATION_INDICATOR = 0x0123456789abcdef;
// Magic prefix written at the very beginning of every NPUW-serialized blob and
// checked on import: {0x13, 0x37, 'n', 'p', 'u', 'w'}.
// `inline constexpr` (C++17) gives a single definition across all TUs that
// include this header; `const constexpr` was redundant and non-inline.
inline constexpr std::array<uint8_t, 6> NPUW_SERIALIZATION_INDICATOR =
    {uint8_t{0x13}, uint8_t{0x37}, uint8_t{0x6e}, uint8_t{0x70}, uint8_t{0x75}, uint8_t{0x77}};

// Version of the NPUW serialization format itself, independent of the
// OpenVINO version; bump whenever the blob layout changes.
inline constexpr const char* NPUW_SERIALIZATION_VERSION = "0.0";

// Forward declaration
namespace intel_npu {
Expand Down Expand Up @@ -59,18 +63,22 @@ void read(std::istream& stream, bool& var);
void read(std::istream& stream, ov::npuw::compiled::Spatial& var);
void read(std::istream& stream, ov::Tensor& var);
void read(std::istream& stream, ::intel_npu::Config& var);
void read(std::istream& stream, std::vector<std::shared_ptr<ov::op::v0::Parameter>>& var);
void read(std::istream& stream, std::vector<std::shared_ptr<ov::Node>>& var);
void read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var);
void read(std::istream& stream, std::shared_ptr<ov::Node>& var);

// Forward declaration
template <typename T1, typename T2>
void write(std::ostream& stream, const std::pair<T1, T2>& var);
template <typename T>
void write(std::ostream& stream, const std::vector<T>& var);
template <typename T, size_t N>
void write(std::ostream& stream, const std::array<T, N>& var);
template <typename T1, typename T2>
void read(std::istream& stream, std::pair<T1, T2>& var);
template <typename T>
void read(std::istream& stream, std::vector<T>& var);
template <typename T, std::size_t N>
void read(std::istream& stream, std::array<T, N>& var);

// Serialization
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
Expand All @@ -92,6 +100,13 @@ void write(std::ostream& stream, const std::vector<T>& var) {
}
}

// Serialize a fixed-size std::array element-by-element. N is a compile-time
// constant, so (unlike the std::vector overload) no size prefix is written
// to the stream.
template <typename T, size_t N>
void write(std::ostream& stream, const std::array<T, N>& var) {
    for (std::size_t idx = 0u; idx < N; ++idx) {
        write(stream, var[idx]);
    }
}

template <typename T>
void write(std::ostream& stream, const std::unordered_set<T>& var) {
write(stream, var.size());
Expand Down Expand Up @@ -143,6 +158,15 @@ void read(std::istream& stream, std::vector<T>& var) {
}
}

// Deserialize a fixed-size std::array written by the matching write()
// overload. N is known at compile time, so no element count is read from
// the stream.
template <typename T, std::size_t N>
void read(std::istream& stream, std::array<T, N>& var) {
    // Read directly into the destination element: the original constructed a
    // default-initialized temporary and copy-assigned it, which both costs an
    // extra copy per element and requires T to be copy-assignable.
    for (auto& elem : var) {
        read(stream, elem);
    }
}

template <typename T>
void read(std::istream& stream, std::unordered_set<T>& var) {
var.clear();
Expand Down
Loading

0 comments on commit 423185a

Please sign in to comment.