NPUW: Head/tail optimizations (openvinotoolkit#26633)
### Details:
 - This PR enables WS and DQ for the head/tail subgraphs
 - Some efficiency problems were fixed by introducing a host-side Gather
   (ON by default; it can be turned OFF for evaluation purposes; see the usage sketch below)
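A minimal sketch of toggling the new knob at compile time. The NPUW_HOST_GATHER key, its boolean type, and its ON-by-default value come from this diff; the model path and the NPU_USE_NPUW routing key are assumptions for illustration.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // NPUW_HOST_GATHER is ON by default; set it to "NO" to benchmark the
    // device-side Gather path instead (the "evaluation purposes" above).
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},       // assumed routing key
                                        {"NPUW_HOST_GATHER", "NO"}});  // knob added by this PR
    return 0;
}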

### Tickets:
 - E-139867
dmatveev authored Sep 27, 2024
1 parent 0eaddb9 commit 046080f
Showing 12 changed files with 902 additions and 44 deletions.
@@ -43,6 +43,7 @@ DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
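For orientation, a compile-time option like the new NPUW_HOST_GATHER is wired up in three places, all visible in this PR; the lines below are reproduced from the diff (context trimmed), not new code:

// 1. Declare the option: name, value type, default, matching property, stage.
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);

// 2. Register it with the options descriptor (npuw.cpp):
desc.add<NPUW_HOST_GATHER>();

// 3. Expose it through the plugin's property table (compiled_model.cpp):
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER)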
@@ -177,6 +177,14 @@ static constexpr ov::Property<std::string> par_matmul_merge_dims{"NPUW_PMM"};
*/
static constexpr ov::Property<std::string> par_matmul_merge_dims{"NPUW_PMM"};

/**
* @brief
* Type: boolean.
* When applicable, perform the embedding Gather on the host side.
* Default value: true.
*/
static constexpr ov::Property<bool> host_gather{"NPUW_HOST_GATHER"};

/**
* @brief
* Type: std::string.
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -28,6 +28,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_CWAI>();
desc.add<NPUW_DQ>();
desc.add<NPUW_PMM>();
desc.add<NPUW_HOST_GATHER>();
desc.add<NPUW_DCOFF_TYPE>();
desc.add<NPUW_DCOFF_SCALE>();
desc.add<NPUW_FUNCALL_FOR_ALL>();
13 changes: 13 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -18,6 +18,7 @@
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "util.hpp"

@@ -135,6 +136,16 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
// FIXME: Find a better place to call this transformation
ov::pass::ConvertPrecision(ov::element::bf16, ov::element::f16).run_on_model(model);

if (m_cfg.get<::intel_npu::NPUW_FOLD>() && m_cfg.get<::intel_npu::NPUW_FUNCALL_FOR_ALL>()) {
// If folding is enabled AND non-repeating graphs are forced to be
// functions, do extra lifting for Gather (if any)
ov::pass::GraphRewrite rewr;
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherAsymCW>();
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherSymCW>();
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherSymGQ>();
rewr.run_on_model(model);
}

auto partitioning = getPartitioning(model, m_cfg);
m_total_stat.gflops = partitioning.total_gflops;
m_total_stat.ops = partitioning.total_ops;
@@ -271,6 +282,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
m_compiled_submodels[id].replaced_by = compiled_fcn_iter->second;
LOG_INFO("Subgraph[" << id << "] is a function call to [" << compiled_fcn_iter->second << "]");
}
m_compiled_submodels[id].host_gather = subgraph._host_gather;
m_compiled_submodels[id].param_base = fcn_template._param_offset;
m_compiled_submodels[id].closure = subgraph._closure;
m_compiled_submodels[id].scales = subgraph._scales;
@@ -799,6 +811,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
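The DQLiftGather* matchers registered above rearrange Gather relative to weight dequantization; the actual patterns are declared in partitioning/patterns/opt.hpp (included above) and are not shown in this view. The standalone check below only illustrates why such a lift is value-preserving for symmetric channel-wise quantization; it is not the pass itself.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    // Toy quantized table: 4 rows x 2 cols, one scale per row (channel-wise).
    std::vector<std::vector<int8_t>> w_q = {{10, -2}, {3, 7}, {-8, 5}, {1, 1}};
    std::vector<float> s = {0.5f, 0.25f, 2.0f, 1.0f};
    std::vector<int64_t> ids = {2, 0, 2};  // lookup indices

    // Unlifted order: dequantize the whole table, then gather rows.
    std::vector<std::vector<float>> deq(4, std::vector<float>(2));
    for (std::size_t r = 0; r < 4; ++r)
        for (std::size_t c = 0; c < 2; ++c)
            deq[r][c] = w_q[r][c] * s[r];

    // Lifted order: gather quantized rows and their scales, then dequantize.
    // Both orders produce identical values, so the rewrite is safe.
    for (auto id : ids)
        for (std::size_t c = 0; c < 2; ++c)
            assert(deq[id][c] == w_q[id][c] * s[id]);
    return 0;
}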
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -118,6 +118,8 @@ class CompiledModel : public ov::ICompiledModel {

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
Subgraph::Gather host_gather;

std::size_t param_base = 0;
std::vector<ov::Tensor> closure;
std::vector<ov::Tensor> scales;
@@ -381,6 +381,15 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
it.first->copy_to(dst._ptr);
});

// Run host-side gather, if required
if (comp_model_desc.host_gather.dst_idx != -1) {
auto& dst = comp_model_desc.closure[comp_model_desc.host_gather.dst_idx - comp_model_desc.param_base];
const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
const auto lookup = subr->get_tensor(lport);
ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, ov::get_tensor_impl(dst));
}

LOG_DEBUG("Done");
}

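For reference, the host-side gather invoked above amounts to copying vocabulary rows selected by the lookup indices. A minimal standalone sketch, assuming a row-major 2D table, f32 elements, and i64 indices; the real ov::npuw::util::gather operates on ov::Tensor and is not shown in this diff.

#include <cstddef>
#include <cstdint>
#include <cstring>

// dst must hold n_ids * hidden elements; ids are assumed to be in range.
void host_gather_rows(const float* vocab, std::size_t hidden,
                      const int64_t* ids, std::size_t n_ids,
                      float* dst) {
    for (std::size_t i = 0; i < n_ids; ++i) {
        std::memcpy(dst + i * hidden,
                    vocab + static_cast<std::size_t>(ids[i]) * hidden,
                    hidden * sizeof(float));
    }
}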
133 changes: 113 additions & 20 deletions src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -14,6 +14,7 @@
#include "openvino/op/slice.hpp"
#include "openvino/op/util/op_types.hpp"
#include "openvino/pass/validate.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/util/common_util.hpp"
#include "openvino/util/xml_parse_utils.hpp"
#include "patterns/dcoff.hpp"
@@ -1565,16 +1566,59 @@ void Partitioner::optimize(const std::string& func_name) {
ov::npuw::Function& f = P.functions.at(func_name);
auto& func_group = all_functions.at(func_name);

auto do_permute = [&](ov::npuw::patterns::opt::Context& ctx) {
for (auto&& p : ctx.closures_to_permute) {
auto param_idx = f._model->get_parameter_index(p.first);
auto closure_idx = param_idx - f._param_offset;
ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
auto& funcall = func_group.refs[f_idx].get();
ov::npuw::util::permute(funcall._closure[closure_idx], p.second);
});
}
};
auto do_cvtf16 = [&](ov::npuw::patterns::opt::Context& ctx) {
for (auto&& p : ctx.closures_to_f16) {
auto param_idx = f._model->get_parameter_index(p);
auto closure_idx = param_idx - f._param_offset;
ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
auto& funcall = func_group.refs[f_idx].get();
ov::npuw::util::to_f16(funcall._closure[closure_idx]);
});
}
};

// Regardless of DQ setting, run this first
{
ov::npuw::patterns::opt::Context ctx;
ctx.pmm_dims = cfg.get<::intel_npu::NPUW_PMM>();

// Run Head/Tail passes
ov::pass::GraphRewrite rewr;
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatherCWu>(std::ref(ctx));
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatherGQi>(std::ref(ctx));
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulCWu>(std::ref(ctx));
// NB: This pass is disabled for a reason! It doesn't make things better
// rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulGQi>(std::ref(ctx));
rewr.add_matcher<ov::npuw::patterns::opt::CompressDictMatMulf32>(std::ref(ctx));
rewr.add_matcher<ov::npuw::patterns::opt::DQParMMGQ>(std::ref(ctx));
rewr.run_on_model(f._model);

// Move Gather to host, if required
if (cfg.get<::intel_npu::NPUW_HOST_GATHER>()) {
ov::pass::GraphRewrite rewr2;
rewr2.add_matcher<ov::npuw::patterns::opt::HostGather>(std::ref(ctx));
rewr2.add_matcher<ov::npuw::patterns::opt::HostGatherDQ>(std::ref(ctx));
rewr2.run_on_model(f._model);
}

// Run parallel matmul merge
mergeParallelMatMuls(f._model, ctx);

ov::ParameterVector new_params;
std::vector<ov::npuw::patterns::opt::Context::PPtr> to_remove;
std::set<std::size_t> to_remove_idx;

// Concatenate closures for "concatenated" parameters
for (auto&& p : ctx.params_to_concat) {
new_params.push_back(p.first);
const auto& params_to_concat = p.second.first;
@@ -1596,6 +1640,59 @@ void Partitioner::optimize(const std::string& func_name) {
funcall._closure.push_back(ov::npuw::util::concat(to_concat, axis));
});
}

// Unpack closures in compile time, where requested
for (auto&& p : ctx.params_to_unpack) {
const auto& tensor_to_unpack = p.second;
auto w_idx = f._model->get_parameter_index(tensor_to_unpack.w);
auto z_idx = f._model->get_parameter_index(tensor_to_unpack.z);
auto s_idx = f._model->get_parameter_index(tensor_to_unpack.s);

new_params.push_back(p.first);
to_remove.push_back(tensor_to_unpack.w);
to_remove.push_back(tensor_to_unpack.s);
to_remove_idx.insert(w_idx);
to_remove_idx.insert(s_idx);

if (tensor_to_unpack.z) {
to_remove.push_back(tensor_to_unpack.z);
to_remove_idx.insert(z_idx);
}

ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
auto& funcall = func_group.refs[f_idx].get();
ov::Tensor cw = funcall._closure[w_idx - f._param_offset];
ov::Tensor cz = z_idx != -1 ? funcall._closure[z_idx - f._param_offset] : ov::Tensor{};
ov::Tensor cs = funcall._closure[s_idx - f._param_offset];
ov::Tensor dst(p.first->get_element_type(), p.first->get_shape());

const auto& gti = ov::get_tensor_impl;
if (cw && cz && cs) {
ov::npuw::util::unpack(gti(cw), gti(cz), gti(cs), gti(dst));
} else if (cw && cs) {
ov::npuw::util::unpack(gti(cw), gti(cs), gti(dst));
} else {
NPUW_ASSERT(false && "Unsupported combination");
}
funcall._closure.push_back(std::move(dst));
});
}

// Convert parameters to f16 where required
do_cvtf16(ctx);

// Host-side gather, pt 1. Add new parameters first
if (ctx.params_to_gather) {
auto& params_to_gather = *ctx.params_to_gather;
new_params.push_back(params_to_gather.pnew);
for (auto&& funcall : func_group.refs) {
auto new_elem_type = params_to_gather.pnew->get_element_type();
auto new_shape = params_to_gather.pnew->get_shape();
funcall.get()._closure.push_back(ov::Tensor(new_elem_type, new_shape));
}
}

// Add all new parameters introduced by this change
f._model->add_parameters(new_params);

// Remove parameters and closures that were concatenated
@@ -1613,7 +1710,19 @@ void Partitioner::optimize(const std::string& func_name) {
for (auto&& now_remove : to_remove) {
f._model->remove_parameter(now_remove);
}

f._model->validate_nodes_and_infer_types();

// Host-side gather, pt. 2: Write the gather mappings to funcall
if (ctx.params_to_gather) {
auto& params_to_gather = *ctx.params_to_gather;
auto gather_dst_id = f._model->get_parameter_index(params_to_gather.pnew);
auto gather_src_id = f._model->get_parameter_index(params_to_gather.pold);
auto gather_idx_id = f._model->get_parameter_index(params_to_gather.pids);
for (auto&& funcall : func_group.refs) {
funcall.get()._host_gather = ov::npuw::Subgraph::Gather{gather_dst_id, gather_src_id, gather_idx_id};
}
}
}

if (!cfg.get<::intel_npu::NPUW_DQ>()) {
@@ -1625,6 +1734,7 @@ void Partitioner::optimize(const std::string& func_name) {
LOG_VERB("Optimize function " << func_name << " in model " << model->get_friendly_name() << "...");
LOG_BLOCK();

// Run "dynamic quantization"
ov::npuw::patterns::opt::Context ctx;
ov::pass::GraphRewrite rewr;
rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
@@ -1635,25 +1745,8 @@
rewr.run_on_model(f._model);
ov::pass::Validate().run_on_model(f._model);

// Permute tensors where required
for (auto&& p : ctx.closures_to_permute) {
auto param_idx = f._model->get_parameter_index(p.first);
auto closure_idx = param_idx - f._param_offset;
ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
auto& funcall = func_group.refs[f_idx].get();
ov::npuw::util::permute(funcall._closure[closure_idx], p.second);
});
}

// Convert tensors where required
for (auto&& p : ctx.closures_to_f16) {
auto param_idx = f._model->get_parameter_index(p);
auto closure_idx = param_idx - f._param_offset;
ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
auto& funcall = func_group.refs[f_idx].get();
ov::npuw::util::to_f16(funcall._closure[closure_idx]);
});
}
do_permute(ctx);
do_cvtf16(ctx);

LOG_VERB("Done");
}
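The compile-time unpack above materializes dequantized weights once per function call, so the device never sees the packed form. A standalone sketch of the arithmetic, assuming u8 weights and scales/zero points already broadcast to element granularity; the real ov::npuw::util::unpack works on ov::Tensor with packed low-bit types and handles both call signatures seen above.

#include <cstddef>
#include <cstdint>

// Asymmetric case: dst = (w - z) * s. Pass z == nullptr for the symmetric
// case, which degenerates to dst = w * s (an implicit zero point of 0).
void unpack_u8(const uint8_t* w, const uint8_t* z, const float* s,
               float* dst, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        const float zero = z ? static_cast<float>(z[i]) : 0.0f;
        dst[i] = (static_cast<float>(w[i]) - zero) * s[i];
    }
}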
@@ -41,6 +41,14 @@ struct Subgraph {
std::vector<ov::Tensor> _scales; // Scale coeffs for manual unpacking
std::vector<ov::Tensor> _zerops; // Zero points for manual unpacking

struct Gather {
// NB: int64_t may look odd, but it is what OV uses to refer to parameters
int64_t dst_idx = -1;
int64_t src_idx = -1;
int64_t idx_idx = -1;
};
Gather _host_gather;

using Ref = std::reference_wrapper<Subgraph>;
};

(Diffs for the remaining changed files are not shown in this view.)
