diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
index 2ce67a71a5da57..c5e91369ce80e2 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
@@ -41,6 +41,7 @@ DEFINE_OPT(NPUW_ONLINE_DUMP_PLAN, std::string, "", npuw::partitioning::online::d
 DEFINE_OPT(NPUW_PLAN, std::string, "", npuw::partitioning::plan, CompileTime);
 DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
 DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
+DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
 DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
 DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
index 627b6b957ebfb3..b0cf6f4a608f66 100644
--- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
@@ -159,6 +159,14 @@ static constexpr ov::Property<bool> fold{"NPUW_FOLD"};
  */
 static constexpr ov::Property<bool> cwai{"NPUW_CWAI"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Apply dynamic quantization transformations at the plugin side.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
+
 /**
  * @brief
  * Type: std::string.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index a43794e883368c..406fb9e6288fff 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -26,6 +26,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_PLAN>();
     desc.add<NPUW_FOLD>();
     desc.add<NPUW_CWAI>();
+    desc.add<NPUW_DQ>();
     desc.add<NPUW_DCOFF_TYPE>();
     desc.add<NPUW_DCOFF_SCALE>();
     desc.add<NPUW_FUNCALL_FOR_ALL>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 6cfb7fd7ce1dc7..bfb1b474ee0f34 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -797,6 +797,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::plan, NPUW_PLAN),
         BIND(npuw::partitioning::fold, NPUW_FOLD),
         BIND(npuw::partitioning::cwai, NPUW_CWAI),
+        BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
         BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
         BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
         BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 6bd1b296c3a9a9..ba1f56b060e0c4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -478,6 +478,8 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     // Bind extra parameters from the function's closure
     // First, do easy things & delay heavy stuff
     std::vector<std::size_t> closure_unpack_required;
+    std::vector<std::size_t> closure_copy_required;
+
     for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
         auto& closure = comp_model_desc.closure[cidx];
@@ -488,14 +490,27 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
             // Remember where the unpack is required
             closure_unpack_required.push_back(cidx);
         } else if (comp_model_desc.update_required[cidx]) {
-            // Easy case, just set one to another. Copy_to is also possible
-            // and even may be preferrable for some devices, like this:
-            // ```ov::get_tensor_impl(closure)->copy_to(clparam._ptr);'''
-            request->set_tensor(iport, ov::get_tensor_impl(closure));
+            if (needs_copy(idx)) {
+                // Remember where copy is required
+                closure_copy_required.push_back(cidx);
+            } else {
+                // Easy case, just set one to another
+                request->set_tensor(iport, ov::get_tensor_impl(closure));
+            }
         }
     }  // for(closure)
-    // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
-    //     ov::parallel_for(closure_unpack_required.size(), [&](std::size_t j) {
+
+    // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
+    ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
+        auto cidx = closure_copy_required[j];
+        auto& closure = comp_model_desc.closure[cidx];
+        const auto closure_param_id = comp_model_desc.param_base + cidx;
+        auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
+        auto clparam = request->get_tensor(iport);
+        ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
+    });
+    // }); // ms_to_run
+
     for (std::size_t j = 0; j != closure_unpack_required.size(); j++) {
         // NB: No need to protect anything here as containers are all
         // preallocated and we only access elements under particular (thread
@@ -525,8 +540,6 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
             ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam);
         }
     }
-    //}); // ov_parallel_for
-    // }); // ms_to_run
 }
 
 void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
index 4e373dda5a24eb..2ee36fcb09361a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
@@ -5,6 +5,7 @@
 #include "snapshot.hpp"
 
 #include "../../logging.hpp"
+#include "../../util.hpp"
 #include "../patterns/avoid.hpp"
 #include "../patterns/compute.hpp"
 #include "group.hpp"
@@ -72,6 +73,8 @@ void Snapshot::buildGraph() {
         ++gid;
     }
 
+    using namespace ov::npuw::util::at;
+
     for (const auto& nh : m_graph->sorted()) {
        auto gptr = m_graph->meta(nh).get<Group::GPtr>();
        auto ov_node = gptr->getInitialNode();
@@ -83,7 +86,7 @@ void Snapshot::buildGraph() {
            auto ov_node_child = target_output.get_node()->shared_from_this();

            // Insert readers from other layers
-            m_node_to_prod_cons->at(ov_node).second.insert(ov_node_child);
+            _(m_node_to_prod_cons).at(ov_node).second.insert(ov_node_child);

            // Save ports for repeated blocks pipeline
            m_ports_map.insert({{ov_node, ov_node_child}, {i, target_output.get_index()}});
@@ -91,9 +94,9 @@ void Snapshot::buildGraph() {
            if (!isOp(ov_node_child)) {
                continue;
            }
-
-            if (!m_graph->linked(nh, m_node_to_gr->at(ov_node_child)->getHandle())) {
-                m_graph->link(nh, m_node_to_gr->at(ov_node_child)->getHandle());
+            Group::GPtr gr_child = _(m_node_to_gr).at(ov_node_child);
+            if (!m_graph->linked(nh, gr_child->getHandle())) {
+                m_graph->link(nh, gr_child->getHandle());
            }
        }  // for(outputs)
@@ -103,7 +106,7 @@ void Snapshot::buildGraph() {
            auto ov_node_parent = target_input.get_node()->shared_from_this();

            // Insert writers from other layers
-            m_node_to_prod_cons->at(ov_node).first.insert(ov_node_parent);
+            _(m_node_to_prod_cons).at(ov_node).first.insert(ov_node_parent);

            // Save ports for repeated blocks pipeline
            m_ports_map.insert({{ov_node_parent, ov_node}, {target_input.get_index(), i}});
@@ -112,8 +115,9 @@ void Snapshot::buildGraph() {
                continue;
            }

-            if (!m_graph->linked(m_node_to_gr->at(ov_node_parent)->getHandle(), nh)) {
-                m_graph->link(m_node_to_gr->at(ov_node_parent)->getHandle(), nh);
+            Group::GPtr gr_parent = _(m_node_to_gr).at(ov_node_parent);
+            if (!m_graph->linked(gr_parent->getHandle(), nh)) {
+                m_graph->link(gr_parent->getHandle(), nh);
            }
        }  // for(inputs)
    }  // for(get_ordered_ops)
@@ -1028,11 +1032,11 @@ GPtrSet Snapshot::getRepGroups(const Group::GPtr& group) const {
 }
 
 const OVNodeSet& Snapshot::getNodeProducers(const OVNodePtr& node) const {
-    return m_node_to_prod_cons->at(node).first;
+    return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).first;
 }
 
 const OVNodeSet& Snapshot::getNodeConsumers(const OVNodePtr& node) const {
-    return m_node_to_prod_cons->at(node).second;
+    return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).second;
 }
 
 // Updated within a group during fusion
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 6fef5d8b6fdf94..39716df4527236 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -9,6 +9,7 @@
 #include "intel_npu/al/config/npuw.hpp"
 #include "online/compiler.hpp"
 #include "online/utils/utils.hpp"  // getMetaDesc
+#include "openvino/core/parallel.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/slice.hpp"
 #include "openvino/op/util/op_types.hpp"
@@ -16,6 +17,7 @@
 #include "openvino/util/common_util.hpp"
 #include "openvino/util/xml_parse_utils.hpp"
 #include "patterns/dcoff.hpp"
+#include "patterns/opt.hpp"
 
 namespace {
@@ -277,6 +279,7 @@ class Partitioner {
     void matchResults(const std::string& func_name);
     void createFunction(const std::string& func_name);
     void matchRepeatedSubgraphs(const std::string& func_name);
+    void optimize(const std::string& func_name);
     void decompressionCutOff(const std::string& func_name);
 
     // Final steps
@@ -1557,6 +1560,50 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
     LOG_VERB("Done");
 }
 
+void Partitioner::optimize(const std::string& func_name) {
+    if (!cfg.get<::intel_npu::NPUW_DQ>()) {
+        LOG_VERB("No optimizations will be done to " << func_name << " in model " << model->get_friendly_name()
+                 << "...");
+        return;
+    }
+
+    LOG_VERB("Optimize function " << func_name << " in model " << model->get_friendly_name() << "...");
+    LOG_BLOCK();
+
+    ov::npuw::Function& f = P.functions.at(func_name);
+
+    ov::npuw::patterns::opt::Context ctx;
+    ov::pass::GraphRewrite rewr;
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQi>(std::ref(ctx));
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQ2i>(std::ref(ctx));
+    rewr.run_on_model(f._model);
+    ov::pass::Validate().run_on_model(f._model);
+
+    // Permute tensors where required
+    auto& func_group = all_functions.at(func_name);
+    for (auto&& p : ctx.closures_to_permute) {
+        auto param_idx = f._model->get_parameter_index(p.first);
+        auto closure_idx = param_idx - f._param_offset;
+        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
+            auto& funcall = func_group.refs[f_idx].get();
+            ov::npuw::util::permute(funcall._closure[closure_idx], p.second);
+        });
+    }
+
+    // Convert tensors where required
+    for (auto&& p : ctx.closures_to_f16) {
+        auto param_idx = f._model->get_parameter_index(p);
+        auto closure_idx = param_idx - f._param_offset;
+        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
+            auto& funcall = func_group.refs[f_idx].get();
+            ov::npuw::util::to_f16(funcall._closure[closure_idx]);
+        });
+    }
+
+    LOG_VERB("Done");
+}
+
 void Partitioner::decompressionCutOff(const std::string& func_name) {
     LOG_VERB("Decompression cut-off for function " << func_name << " in model " << model->get_friendly_name()
              << "...");
     LOG_BLOCK();
@@ -1826,6 +1873,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
+            p.optimize(func_name);
@@ -1841,6 +1889,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
+            p.optimize(func_name);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
new file mode 100644
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "opt.hpp"
+
+namespace ov {
+namespace npuw {
+namespace patterns {
+namespace opt {
+
+namespace opp = ov::pass::pattern;
+
+// FROM:
+//     ???(Act) --------------------------------------->
+//     Param(W) -> to(f16) -> Multiply -> to(f32) -> MatMul
+//     Param(S) ------------>
+//
+// TO:
+//     ???(Act) -> to(f16) ->
+//     Param(W) -> to(f16) -> MatMul -> Multiply -> to(f32)
+//     Param(S) -> Reshape ----------->
+//
+
+DQMatMulCWi::DQMatMulCWi() {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qcvtm = opp::wrap_type<ov::op::v0::Convert>({qmuls});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qcvtm});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() && qcoeff_shape[1] == 1 &&
+            !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
+            auto matched_node_cvtm = node_to_output.at(qcvtm).get_node_shared_ptr();
+            auto matched_node_muls = node_to_output.at(qmuls).get_node_shared_ptr();
+            auto matched_node_mmi = node_to_output.at(qmmi).get_node_shared_ptr();
+
+            // Reconnect MatMul to read from Convert(W) directly.
+            // Note: ACT is f32 so has to be converted too.
+            auto new_cvt_act = std::make_shared<ov::op::v0::Convert>(matched_node_mmi, ov::element::f16);
+            matched_matmul->input(0).replace_source_output(new_cvt_act);
+            matched_matmul->input(1).replace_source_output(matched_node_cvtw);
+
+            // Store MatMul's readers
+            auto mm_readers = matched_matmul->output(0).get_target_inputs();
+
+            // Introduce a Reshape to alter Scale factor's shape
+            auto new_dims = std::vector<std::size_t>{qcoeff_shape[1], qcoeff_shape[0]};
+            auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
+            auto new_reshape = std::make_shared<ov::op::v1::Reshape>(matched_node_qcoeff, new_const, false);
+
+            // Reconnect both Multiply inputs. Drop all outputs
+            matched_node_muls->input(0).replace_source_output(matched_matmul);
+            matched_node_muls->input(1).replace_source_output(new_reshape);
+            for (auto&& r : matched_node_muls->output(0).get_target_inputs()) {
+                matched_node_muls->output(0).remove_target_input(r);
+            }
+
+            // Reconnect Convert(M) to convert the Multiply's result
+            matched_node_cvtm->input(0).replace_source_output(matched_node_muls);
+
+            // Reconnect MatMul's old readers to Convert(Multiply)
+            for (auto&& r : mm_readers) {
+                r.replace_source_output(matched_node_cvtm);
+            }
+        }
+
+        return true;  // root has changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulCWi"), std::move(callback));
+}
+
+// FROM:
+//     ???(Act) -------------------------------------------->
+//     Param(W) -> Convert(f16|f32) -> Multiply -> Reshape -> MatMul
+//     Param(S) --------------------->
+//
+// WHERE (example):
+//     Act: [ 1,  1, 4096]
+//     W:   [32,128,11008]
+//     S:   [32,  1,11008]
+//                                             [1, 1 ,128]   x
+// TO:                                         [1,11K,128]T  =
+//                  [32,1,128]                 [1, 1 ,11K]          [32,1,11K]
+//     ???(Act) -> Reshape > Split(/32) ->[to(f16) ->                ]}
+//     Param(W*) -----------> Split(/32) ->[to(f16) -> MatMul        ]} Concat  v
+//     Param(S) ----------------------------------------------> Multiply
+//                                                               Reshape(1,a,b,c)
+//                                                               ReduceSum(1)
+//                                                               Reshape(a,b,c)
+//                                                               to(f32)
+// WHERE:
+//     W* : [32,11008,128]
+
+DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qreshp = opp::wrap_type<ov::op::v1::Reshape>({qmuls, opp::any_input()});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qreshp});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_out_mmi = node_to_output.at(qmmi);
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qweight_shape = matched_qweight->output(0).get_shape();
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+        auto act_shape = matched_out_mmi.get_shape();
+        auto out_shape = matched_node_matmul->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() &&
+            ov::element::f32 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 &&
+            qweight_shape.size() == 3 && act_shape.size() == 3 && qcoeff_shape[0] == qweight_shape[0] &&
+            qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] && !matched_matmul->get_transpose_a() &&
+            !matched_matmul->get_transpose_b()) {
+            // Mark W closure to transpose, and transpose the respective parameter
+            ctx.get().permute(matched_qweight, {0, 2, 1});
+
+            // Mark S closure to be lowered to f16
+            ctx.get().to_f16(matched_qcoeff);
+
+            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
+            matched_qweight->set_partial_shape(tw_shape);
+            matched_qweight->validate_and_infer_types();
+
+            matched_qcoeff->set_element_type(ov::element::f16);
+            matched_qcoeff->validate_and_infer_types();
+
+            // Reshape the Act to group format
+            const auto NSPLIT = qweight_shape[0];
+            std::vector<std::size_t> rshp_act_v = {NSPLIT, act_shape[1], act_shape[2] / NSPLIT};
+            auto rshp_act_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_act_v);
+            auto rshp_act = std::make_shared<ov::op::v1::Reshape>(matched_out_mmi, rshp_act_c, false);
+
+            // Split Act and W, and S tensors by NSPLIT
+            auto split_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 0);
+            auto split_a = std::make_shared<ov::op::v1::Split>(rshp_act, split_axis, NSPLIT);
+            auto split_w = std::make_shared<ov::op::v1::Split>(matched_qweight, split_axis, NSPLIT);
+
+            // Do the CW MM for every split
+            std::vector<std::shared_ptr<ov::Node>> to_concat;
+            for (std::size_t i = 0; i < NSPLIT; i++) {
+                auto a_f16 = std::make_shared<ov::op::v0::Convert>(split_a->output(i), ov::element::f16);
+                auto w_f16 = std::make_shared<ov::op::v0::Convert>(split_w->output(i), ov::element::f16);
+                auto m_f16 = std::make_shared<ov::op::v0::MatMul>(a_f16, w_f16, false, true);
+                to_concat.push_back(m_f16);
+            }
+
+            // Now concat and scale the result
+            auto concat = std::make_shared<ov::op::v0::Concat>(to_concat, 0);
+            auto s_f16 = std::make_shared<ov::op::v1::Multiply>(concat, matched_qcoeff);
+
+            // Now reshape to a better shape, ReduceSum, and reshape to the right size again
+            std::vector<std::size_t> rshp_ccat_v = {1, NSPLIT, 1, qweight_shape[2]};
+            auto rshp_ccat_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, rshp_ccat_v);
+            auto rshp_ccat = std::make_shared<ov::op::v1::Reshape>(s_f16, rshp_ccat_c, false);
+
+            auto reduce_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto reduce = std::make_shared<ov::op::v1::ReduceSum>(rshp_ccat, reduce_axis, true);
+
+            auto rshp_out_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, out_shape);
+            auto rshp_out = std::make_shared<ov::op::v1::Reshape>(reduce, rshp_out_c, false);
+
+            // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
+            auto out = std::make_shared<ov::op::v0::Convert>(rshp_out, ov::element::f32);
+
+            // Now.. Reconnect the matmul readers to the new output (reducesum)
+            for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
+                r.replace_source_output(out);
+            }
+            return true;  // root has changed
+        }
+        return false;  // did nothing here
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulGQi"), std::move(callback));
+}
+
+// FROM:
+//     ???(Act) -------------------------------------------------------->
+//     Param(W) -> Convert(f16) -> Multiply -> Reshape -> Convert(f32) -> MatMul
+//     Param(S) ----------------->
+//
+// WHERE (example):
+//     Act: [  1, 1,2048]
+//     W:   [512,16, 128]
+//     S:   [512,16,   1]
+//                                             [1, 1,128]   x
+// TO:                                         [1,512,128]T =
+//                  [16,1,128]                 [1, 1,512]           [16,1,512]
+//     ???(Act) -> Reshape > Split(/16) ->[to(f16) ->                ]}
+//     Param(W*) -----------> Split(/16) ->[to(f16) -> MatMul    v   ]} Concat
+//     Param(S) ------------> Split(/16) ->[---------> Multiply      ]}    v
+//                                                              Reshape(1,16,1,512)
+//                                                              ReduceSum(1)
+//                                                              Reshape(  1,1,512)
+//                                                              to(f32)
+// WHERE:
+//     W* : [16,512,128]
+
+DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qreshp = opp::wrap_type<ov::op::v1::Reshape>({qmuls, opp::any_input()});
+    auto qcvtr = opp::wrap_type<ov::op::v0::Convert>({qreshp});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qcvtr});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_out_mmi = node_to_output.at(qmmi);
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qweight_shape = matched_qweight->output(0).get_shape();
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+        auto act_shape = matched_out_mmi.get_shape();
+        auto out_shape = matched_node_matmul->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 &&
+            ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 &&
+            act_shape.size() == 3 && qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 &&
+            qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() &&
+            matched_matmul->get_transpose_b()) {
+            // Mark W closure to transpose, and transpose the respective parameter
+            ctx.get().permute(matched_qweight, {1, 0, 2});
+
+            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
+            matched_qweight->set_partial_shape(tw_shape);
+            matched_qweight->validate_and_infer_types();
+
+            // Reshape the Act to group format
+            const auto NSPLIT = qweight_shape[1];
+            std::vector<std::size_t> rshp_act_v = {NSPLIT, 1, act_shape[2] / NSPLIT};
+            auto rshp_act_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_act_v);
+            auto rshp_act = std::make_shared<ov::op::v1::Reshape>(matched_out_mmi, rshp_act_c, false);
+
+            // Split Act and W, and S tensors by NSPLIT
+            auto split_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 0);
+            auto split_a = std::make_shared<ov::op::v1::Split>(rshp_act, split_axis, NSPLIT);
+            auto split_w = std::make_shared<ov::op::v1::Split>(matched_qweight, split_axis, NSPLIT);
+
+            auto split_axis_s = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto split_s = std::make_shared<ov::op::v1::Split>(matched_qcoeff, split_axis_s, NSPLIT);
+
+            std::vector<std::size_t> rshp_scale_v = {1, 1, qcoeff_shape[0]};
+            auto rshp_scale_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_scale_v);
+
+            // Do the CW MM for every split
+            std::vector<std::shared_ptr<ov::Node>> to_concat;
+            for (std::size_t i = 0; i < NSPLIT; i++) {
+                auto a_f16 = std::make_shared<ov::op::v0::Convert>(split_a->output(i), ov::element::f16);
+                auto w_f16 = std::make_shared<ov::op::v0::Convert>(split_w->output(i), ov::element::f16);
+                auto m_f16 = std::make_shared<ov::op::v0::MatMul>(a_f16, w_f16, false, true);
+
+                auto r_f16 = std::make_shared<ov::op::v1::Reshape>(split_s->output(i), rshp_scale_c, false);
+                auto s_f16 = std::make_shared<ov::op::v1::Multiply>(m_f16, r_f16);
+                to_concat.push_back(s_f16);
+            }
+
+            // Now concat and scale the result
+            auto concat = std::make_shared<ov::op::v0::Concat>(to_concat, 0);
+
+            // Now reshape to a better shape, ReduceSum, and reshape to the right size again
+            std::vector<std::size_t> rshp_ccat_v = {1, NSPLIT, 1, qweight_shape[0]};
+            auto rshp_ccat_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, rshp_ccat_v);
+            auto rshp_ccat = std::make_shared<ov::op::v1::Reshape>(concat, rshp_ccat_c, false);
+
+            auto reduce_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto reduce = std::make_shared<ov::op::v1::ReduceSum>(rshp_ccat, reduce_axis, true);
+
+            auto rshp_out_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, out_shape);
+            auto rshp_out = std::make_shared<ov::op::v1::Reshape>(reduce, rshp_out_c, false);
+
+            // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
+            auto out = std::make_shared<ov::op::v0::Convert>(rshp_out, ov::element::f32);
+
+            // Now.. Reconnect the matmul readers to the new output (reducesum)
+            for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
+                r.replace_source_output(out);
+            }
+            return true;  // root has changed
+        }
+        return false;  // did nothing here
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulGQ2i"), std::move(callback));
+}
+
+}  // namespace opt
+}  // namespace patterns
+}  // namespace npuw
+}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
new file mode 100644
index 00000000000000..fee216b706639c
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <set>
+
+#include "openvino/openvino.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
+
+namespace ov {
+namespace npuw {
+
+// Model optimization patterns. Triggered by the plugin at the very top
+namespace patterns {
+namespace opt {
+
+class DQMatMulCWi : public ov::pass::MatcherPass {
+public:
+    DQMatMulCWi();
+};
+
+struct Context {
+    using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
+
+    using Axes = std::vector<std::size_t>;
+    std::map<PPtr, Axes> closures_to_permute;
+    void permute(PPtr orig_param, const Axes& order);
+
+    std::set<PPtr> closures_to_f16;
+    void to_f16(PPtr orig_param);
+
+    using Ref = std::reference_wrapper<Context>;
+};
+
+class DQMatMulGQi : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulGQi(Context::Ref ctx);
+};
+
+class DQMatMulGQ2i : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulGQ2i(Context::Ref ctx);
+};
+
+}  // namespace opt
+}  // namespace patterns
+}  // namespace npuw
+}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
index 7d74192a01a2b6..5a5f8f733d1a7a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
@@ -15,6 +15,7 @@
 #include "logging.hpp"
 #include "openvino/op/constant.hpp"
+#include "openvino/op/transpose.hpp"
 #include "openvino/op/util/op_types.hpp"
 
 #ifdef UNPACK_PROFILING
@@ -1405,3 +1406,105 @@ void ov::npuw::util::to_f32(const ov::Tensor& in, ov::Tensor& out) {
         break;
     }
 }
+
+void ov::npuw::util::to_f16(ov::Tensor& t) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(t.get_element_type() == ov::element::f32);
+    NPUW_ASSERT(t.get_size() % 8 == 0);
+    NPUW_ASSERT(t.is_continuous());
+
+    ov::Tensor tnew(ov::element::f16, shape);
+
+    const float* psrc = t.data<float>();
+    uint8_t* pdst = static_cast<uint8_t*>(tnew.data());
+
+    for (std::size_t i = 0; i < t.get_size() / 8; i++) {
+        __m256 vsrc = _mm256_loadu_ps(psrc);
+        __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT);
+        __m128i* pout = reinterpret_cast<__m128i*>(pdst);
+        _mm_storeu_si128(pout, vout);
+        psrc += 8;        // offset in sizeof(float)
+        pdst += (8 * 2);  // offset in bytes
+    }
+
+    t = std::move(tnew);
+}
+
+inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
+    const uint8_t* tdata = static_cast<const uint8_t*>(t.data());
+    const uint8_t* trow = tdata + r * COLS / 2;
+    const uint8_t* telem = trow + c / 2;
+    if (c % 2 == 0) {
+        return lo4(*telem);
+    }
+    return hi4(*telem);
+}
+
+inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
+    uint8_t* tdata = static_cast<uint8_t*>(t.data());
+    uint8_t* trow = tdata + r * COLS / 2;
+    uint8_t* telem = trow + c / 2;
+    if (c % 2 == 0) {
+        *telem = (hi4(*telem) << 4) | lo4(value);
+    } else {
+        *telem = (lo4(value) << 4) | lo4(*telem);
+    }
+}
+
+void ov::npuw::util::transpose(ov::Tensor& t) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(shape.size() == 3);  // Yes, so far only transpose 3D tensors
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+
+    ov::Shape tshape = {shape[2], shape[0], shape[1]};
+    ov::Tensor tnew(t.get_element_type(), tshape);
+
+    const auto IN_ROWS = shape[0] * shape[1];
+    const auto IN_COLS = shape[2];
+    for (std::size_t i = 0; i < IN_ROWS; i++) {
+        for (std::size_t j = 0; j < IN_COLS; j++) {
+            uint8_t value = tread_4b(t, i, j, IN_COLS);
+            twrite_4b(tnew, value, j, i, IN_ROWS);
+        }
+    }
+    t = std::move(tnew);
+}
+
+void ov::npuw::util::permute(ov::Tensor& t, const std::vector<std::size_t>& axes) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(shape.size() == 3);                         // Yes, so far only transpose 3D tensors
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4);   // And, yes, 4bit only!
+
+    if (axes[0] == 2 && axes[1] == 0 && axes[2] == 1) {
+        transpose(t);
+    } else if (axes[0] == 0 && axes[1] == 2 && axes[2] == 1) {
+        ov::Shape tshape = {shape[0], shape[2], shape[1]};
+        ov::Tensor tnew(t.get_element_type(), tshape);
+
+        for (std::size_t p = 0; p < shape[0]; p++) {
+            for (std::size_t r = 0; r < shape[1]; r++) {
+                for (std::size_t c = 0; c < shape[2]; c++) {
+                    uint8_t value = tread_4b(t, p * shape[1] + r, c, shape[2]);
+                    twrite_4b(tnew, value, p * shape[2] + c, r, shape[1]);
+                }
+            }
+        }
+        t = std::move(tnew);
+    } else if (axes[0] == 1 && axes[1] == 0 && axes[2] == 2) {
+        ov::Shape tshape = {shape[1], shape[0], shape[2]};
+        ov::Tensor tnew(t.get_element_type(), tshape);
+
+        // Iterate over output tensor coordinates
+        for (std::size_t p = 0; p < tshape[0]; p++) {
+            for (std::size_t r = 0; r < tshape[1]; r++) {
+                for (std::size_t c = 0; c < tshape[2]; c++) {
+                    uint8_t value = tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]);
+                    twrite_4b(tnew, value, p * tshape[1] + r, c, tshape[2]);
+                }
+            }
+        }
+        t = std::move(tnew);
+    } else {
+        NPUW_ASSERT(false && "Not supported yet");
+    }
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
index 88bb7896d94b0f..2a3c0ea7787145 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -6,6 +6,7 @@
 
 #include <string>
 
+#include "logging.hpp"
 #include "openvino/runtime/itensor.hpp"
 #include "openvino/runtime/so_ptr.hpp"
 
@@ -50,6 +51,48 @@ void unpack(const ov::SoPtr<ov::ITensor>& from,
             const UnpackOptions& unpack_options = UnpackOptions{true, 16, false});
 
 void to_f32(const ov::Tensor& in, ov::Tensor& out);
+void to_f16(ov::Tensor& t);
+void transpose(ov::Tensor& t);
+void permute(ov::Tensor& t, const std::vector<std::size_t>& axes);
+
+namespace at {
+template <typename M>
+struct Impl {
+    using V = typename M::mapped_type;
+
+    M* m = nullptr;
+    explicit Impl(M* pM) : m(pM) {}
+
+    template <typename K>
+    V& at(const K& k) {
+        const auto iter = m->find(k);
+        if (iter == m->end()) {
+            std::stringstream ss;
+            ss << "Key " << k << " is not found in a map of type " << typeid(m).name();
+            const auto msg = ss.str();
+            LOG_ERROR(msg);
+            throw std::out_of_range(msg);
+        }
+        return iter->second;
+    }
+
+    template <typename K>
+    const V& at(const K& k) const {
+        return const_cast<Impl*>(this)->at(k);
+    }
+};
+
+template <typename M>
+Impl<M> _(M* pM) {
+    return Impl<M>(pM);
+}
+
+template <typename M>
+Impl<M> _(std::shared_ptr<M> pM) {
+    return Impl<M>(pM.get());
+}
+
+}  // namespace at
 }  // namespace util
 }  // namespace npuw
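
Editor's note: for reference, a minimal sketch of how an application could request the new option, assuming the NPU plugin forwards the private "NPUW_DQ" key (added by this patch) as a plain string-valued config entry; the model path and device name below are placeholders, and additional NPUW-enabling options may also be required in a real setup.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // "NPUW_DQ" is the boolean key introduced by this patch; passing it via an
    // AnyMap is an assumption about how private NPUW options reach the plugin.
    ov::AnyMap config{{"NPUW_DQ", "YES"}};
    auto compiled = core.compile_model(model, "NPU", config);
    return 0;
}
```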
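Editor's note: DQMatMulCWi relies on the fact that a per-output-column scale commutes with the MatMul, i.e. multiplying each weight column by its scale before the MatMul gives the same result as scaling the corresponding output column afterwards, which is why the Multiply can be moved past the (much larger) weight tensor. A tiny standalone sanity check of that identity, with made-up sizes and values:

```cpp
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>

int main() {
    // A: [1 x 2] activation row, W: [2 x 3] weights, s: per-column scales [3].
    std::array<float, 2> A{1.0f, 2.0f};
    std::array<std::array<float, 3>, 2> W{{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}};
    std::array<float, 3> s{0.5f, 0.25f, 2.0f};

    for (std::size_t c = 0; c < 3; c++) {
        // Original graph: scale the weight column first, then MatMul.
        float scale_then_mm = A[0] * (W[0][c] * s[c]) + A[1] * (W[1][c] * s[c]);
        // Rewritten graph: MatMul first, then scale the (much smaller) output.
        float mm_then_scale = (A[0] * W[0][c] + A[1] * W[1][c]) * s[c];
        assert(std::fabs(scale_then_mm - mm_then_scale) < 1e-5f);
    }
    return 0;
}
```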
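Editor's note: DQMatMulGQi marks the 4-bit weight closure for a {0, 2, 1} permute, i.e. the last two dimensions of each [G, K, N] group tensor are swapped on the host (for the example in the pattern comment, [32,128,11008] becomes [32,11008,128]). A minimal sketch of that index mapping on a plain one-byte-per-element array; the real ov::npuw::util::permute in util.cpp applies the same mapping to packed i4 nibbles via tread_4b/twrite_4b:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Swap the last two axes of a [P, R, C] tensor stored row-major, one byte per
// element; the destination shape is [P, C, R].
std::vector<uint8_t> permute_021(const std::vector<uint8_t>& src, std::size_t P, std::size_t R, std::size_t C) {
    std::vector<uint8_t> dst(src.size());
    for (std::size_t p = 0; p < P; p++) {
        for (std::size_t r = 0; r < R; r++) {
            for (std::size_t c = 0; c < C; c++) {
                // Element (p, r, c) of the source goes to (p, c, r) of the destination.
                dst[(p * C + c) * R + r] = src[(p * R + r) * C + c];
            }
        }
    }
    return dst;
}
```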
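Editor's note: the `ov::npuw::util::at::_` helper added in util.hpp wraps a map (or a shared_ptr to one) so that a failed lookup is logged before throwing, instead of the silent std::out_of_range from a bare map::at. A simplified standalone sketch of the same idea, with std::cerr standing in for LOG_ERROR and hypothetical names:

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>

// Simplified stand-in for ov::npuw::util::at::Impl: look up a key and report
// it on failure rather than throwing a bare std::out_of_range.
template <typename M>
struct Checked {
    M* m = nullptr;
    explicit Checked(M* pM) : m(pM) {}

    template <typename K>
    typename M::mapped_type& at(const K& k) {
        auto iter = m->find(k);
        if (iter == m->end()) {
            std::stringstream ss;
            ss << "Key " << k << " is not found in a map of type " << typeid(m).name();
            std::cerr << ss.str() << std::endl;  // LOG_ERROR in the real helper
            throw std::out_of_range(ss.str());
        }
        return iter->second;
    }
};

template <typename M>
Checked<M> _(const std::shared_ptr<M>& pM) {
    return Checked<M>(pM.get());
}

int main() {
    auto prods = std::make_shared<std::map<std::string, int>>();
    (*prods)["matmul_0"] = 1;
    std::cout << _(prods).at("matmul_0") << "\n";  // a missing key would be logged, then thrown
    return 0;
}
```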