Skip to content

Commit

Permalink
NPUW: Introduce DQ (openvinotoolkit#26362)
Browse files Browse the repository at this point in the history
### Details:
 - Introduced dyn quant patterns and transformations for common cases
- When supported, dramatically improves performance for "Option B" when
no DCOFF etc options applied
- Currently experimental, disabled by default (assuming compiler will do
this too), enabled by `"NPUW_DQ" : "YES"`
- The CW pattern works fine with prefill; GQ is generate-only — not
recommended for use with prefill models

### Tickets:
 - E-137655 (CW)
 - E-137657 (GQ)
  • Loading branch information
dmatveev authored Sep 12, 2024
1 parent 2d7adc5 commit 4b6d2c7
Show file tree
Hide file tree
Showing 11 changed files with 650 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ DEFINE_OPT(NPUW_ONLINE_DUMP_PLAN, std::string, "", npuw::partitioning::online::d
DEFINE_OPT(NPUW_PLAN, std::string, "", npuw::partitioning::plan, CompileTime);
DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ static constexpr ov::Property<bool> fold{"NPUW_FOLD"};
*/
static constexpr ov::Property<bool> cwai{"NPUW_CWAI"};

/**
 * @brief Enable plugin-side dynamic quantization (DQ) transformations.
 * Type: bool.
 * Apply dynamic quantization transformations at the plugin side.
 * Currently experimental and off by default (presumably the compiler may
 * perform this optimization itself — confirm before enabling broadly).
 * Default value: false.
 */
static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};

/**
* @brief
* Type: std::string.
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_PLAN>();
desc.add<NPUW_FOLD>();
desc.add<NPUW_CWAI>();
desc.add<NPUW_DQ>();
desc.add<NPUW_DCOFF_TYPE>();
desc.add<NPUW_DCOFF_SCALE>();
desc.add<NPUW_FUNCALL_FOR_ALL>();
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::plan, NPUW_PLAN),
BIND(npuw::partitioning::fold, NPUW_FOLD),
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
Expand Down
29 changes: 21 additions & 8 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,8 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
// Bind extra parameters from the function's closure
// First, do easy things & delay heavy stuff
std::vector<std::size_t> closure_unpack_required;
std::vector<std::size_t> closure_copy_required;

for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
auto& closure = comp_model_desc.closure[cidx];

Expand All @@ -488,14 +490,27 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
// Remember where the unpack is required
closure_unpack_required.push_back(cidx);
} else if (comp_model_desc.update_required[cidx]) {
// Easy case, just set one to another. Copy_to is also possible
// and even may be preferrable for some devices, like this:
// ```ov::get_tensor_impl(closure)->copy_to(clparam._ptr);'''
request->set_tensor(iport, ov::get_tensor_impl(closure));
if (needs_copy(idx)) {
// Remember where copy is required
closure_copy_required.push_back(cidx);
} else {
// Easy case, just set one to another
request->set_tensor(iport, ov::get_tensor_impl(closure));
}
}
} // for(closure)
// m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
// ov::parallel_for(closure_unpack_required.size(), [&](std::size_t j) {

// m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
auto cidx = closure_copy_required[j];
auto& closure = comp_model_desc.closure[cidx];
const auto closure_param_id = comp_model_desc.param_base + cidx;
auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
auto clparam = request->get_tensor(iport);
ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
});
// }); // ms_to_run

for (std::size_t j = 0; j != closure_unpack_required.size(); j++) {
// NB: No need to protect anything here as containers are all
// preallocated and we only access elements under particular (thread
Expand Down Expand Up @@ -525,8 +540,6 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam);
}
}
//}); // ov_parallel_for
// }); // ms_to_run
}

void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "snapshot.hpp"

#include "../../logging.hpp"
#include "../../util.hpp"
#include "../patterns/avoid.hpp"
#include "../patterns/compute.hpp"
#include "group.hpp"
Expand Down Expand Up @@ -72,6 +73,8 @@ void Snapshot::buildGraph() {
++gid;
}

using namespace ov::npuw::util::at;

for (const auto& nh : m_graph->sorted()) {
auto gptr = m_graph->meta(nh).get<Group::GPtr>();
auto ov_node = gptr->getInitialNode();
Expand All @@ -83,17 +86,17 @@ void Snapshot::buildGraph() {
auto ov_node_child = target_output.get_node()->shared_from_this();

// Insert readers from other layers
m_node_to_prod_cons->at(ov_node).second.insert(ov_node_child);
_(m_node_to_prod_cons).at(ov_node).second.insert(ov_node_child);

// Save ports for repeated blocks pipeline
m_ports_map.insert({{ov_node, ov_node_child}, {i, target_output.get_index()}});

if (!isOp(ov_node_child)) {
continue;
}

if (!m_graph->linked(nh, m_node_to_gr->at(ov_node_child)->getHandle())) {
m_graph->link(nh, m_node_to_gr->at(ov_node_child)->getHandle());
Group::GPtr gr_child = _(m_node_to_gr).at(ov_node_child);
if (!m_graph->linked(nh, gr_child->getHandle())) {
m_graph->link(nh, gr_child->getHandle());
}
}
} // for(outputs)
Expand All @@ -103,7 +106,7 @@ void Snapshot::buildGraph() {
auto ov_node_parent = target_input.get_node()->shared_from_this();

// Insert writers from other layers
m_node_to_prod_cons->at(ov_node).first.insert(ov_node_parent);
_(m_node_to_prod_cons).at(ov_node).first.insert(ov_node_parent);

// Save ports for repeated blocks pipeline
m_ports_map.insert({{ov_node_parent, ov_node}, {target_input.get_index(), i}});
Expand All @@ -112,8 +115,9 @@ void Snapshot::buildGraph() {
continue;
}

if (!m_graph->linked(m_node_to_gr->at(ov_node_parent)->getHandle(), nh)) {
m_graph->link(m_node_to_gr->at(ov_node_parent)->getHandle(), nh);
Group::GPtr gr_parent = _(m_node_to_gr).at(ov_node_parent);
if (!m_graph->linked(gr_parent->getHandle(), nh)) {
m_graph->link(gr_parent->getHandle(), nh);
}
} // for(inputs)
} // for(get_ordered_ops)
Expand Down Expand Up @@ -1028,11 +1032,11 @@ GPtrSet Snapshot::getRepGroups(const Group::GPtr& group) const {
}

const OVNodeSet& Snapshot::getNodeProducers(const OVNodePtr& node) const {
return m_node_to_prod_cons->at(node).first;
return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).first;
}

const OVNodeSet& Snapshot::getNodeConsumers(const OVNodePtr& node) const {
return m_node_to_prod_cons->at(node).second;
return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).second;
}

// Updated within a group during fusion
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
#include "intel_npu/al/config/npuw.hpp"
#include "online/compiler.hpp"
#include "online/utils/utils.hpp" // getMetaDesc
#include "openvino/core/parallel.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/slice.hpp"
#include "openvino/op/util/op_types.hpp"
#include "openvino/pass/validate.hpp"
#include "openvino/util/common_util.hpp"
#include "openvino/util/xml_parse_utils.hpp"
#include "patterns/dcoff.hpp"
#include "patterns/opt.hpp"

namespace {

Expand Down Expand Up @@ -277,6 +279,7 @@ class Partitioner {
void matchResults(const std::string& func_name);
void createFunction(const std::string& func_name);
void matchRepeatedSubgraphs(const std::string& func_name);
void optimize(const std::string& func_name);
void decompressionCutOff(const std::string& func_name);

// Final steps
Expand Down Expand Up @@ -1557,6 +1560,50 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
LOG_VERB("Done");
}

// Apply plugin-side dynamic-quantization (DQ) graph rewrites to the repeated
// function identified by func_name, then patch the per-funcall closure
// tensors (permutation / f16 conversion) that the rewrites request via the
// pattern Context. No-op unless NPUW_DQ is enabled in the config.
void Partitioner::optimize(const std::string& func_name) {
    // Experimental feature: bail out early when NPUW_DQ is not set.
    if (!cfg.get<::intel_npu::NPUW_DQ>()) {
        LOG_VERB("No optimizations will be done to " << func_name << " in model " << model->get_friendly_name()
                                                     << "...");
        return;
    }

    LOG_VERB("Optimize function " << func_name << " in model " << model->get_friendly_name() << "...");
    LOG_BLOCK();

    ov::npuw::Function& f = P.functions.at(func_name);

    // Run the DQ matchers over the function body. The GQ variants record
    // follow-up tensor work (permutes, f16 conversions) in ctx; CWi does not
    // take the context. Validate afterwards to catch any rewrite breakage.
    ov::npuw::patterns::opt::Context ctx;
    ov::pass::GraphRewrite rewr;
    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQi>(std::ref(ctx));
    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQ2i>(std::ref(ctx));
    rewr.run_on_model(f._model);
    ov::pass::Validate().run_on_model(f._model);

    // Permute tensors where required.
    // NOTE(review): closure_idx assumes every parameter recorded in ctx is a
    // closure parameter, i.e. param_idx >= f._param_offset — confirm against
    // the opt-pattern implementations.
    auto& func_group = all_functions.at(func_name);
    for (auto&& p : ctx.closures_to_permute) {
        auto param_idx = f._model->get_parameter_index(p.first);
        auto closure_idx = param_idx - f._param_offset;
        // Apply the same permutation to this closure slot in every call
        // instance of the repeated function, in parallel across instances.
        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
            auto& funcall = func_group.refs[f_idx].get();
            ov::npuw::util::permute(funcall._closure[closure_idx], p.second);
        });
    }

    // Convert tensors to f16 where required (same per-instance fan-out as
    // the permute loop above).
    for (auto&& p : ctx.closures_to_f16) {
        auto param_idx = f._model->get_parameter_index(p);
        auto closure_idx = param_idx - f._param_offset;
        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
            auto& funcall = func_group.refs[f_idx].get();
            ov::npuw::util::to_f16(funcall._closure[closure_idx]);
        });
    }

    LOG_VERB("Done");
}

void Partitioner::decompressionCutOff(const std::string& func_name) {
LOG_VERB("Decompression cut-off for function " << func_name << " in model " << model->get_friendly_name() << "...");
LOG_BLOCK();
Expand Down Expand Up @@ -1826,6 +1873,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
p.matchParameters(func_group);
p.matchResults(func_group);
p.matchRepeatedSubgraphs(func_group);
p.optimize(func_group);
p.decompressionCutOff(func_group);
}
} else if (cfg.get<::intel_npu::NPUW_CWAI>()) {
Expand All @@ -1841,6 +1889,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
p.saveTinyConstants(func_group);
p.saveScaleFactors(func_group);
p.createFunction(func_group);
p.optimize(func_group);
p.decompressionCutOff(func_group);
}
} else {
Expand Down
Loading

0 comments on commit 4b6d2c7

Please sign in to comment.