From 2aed7d8d9d23ed83ca2d427fa9d28e0c934a2f7a Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Sat, 5 Oct 2024 18:56:25 +0000
Subject: [PATCH 1/3] NPUW: Fix FUNCALL_FOR_ALL=YES + PIPELINE=NONE

- In this case, function pipelining is enabled but there's no "next" rq
  to prepare
- Generally, exclude FCEW subgraphs from function pipelining
---
 .../src/plugin/npuw/compiled_model.cpp        |  1 +
 .../src/plugin/npuw/compiled_model.hpp        |  2 ++
 .../plugin/npuw/just_sync_infer_request.cpp   | 34 ++++++++++++-------
 .../plugin/npuw/just_sync_infer_request.hpp   |  1 +
 .../plugin/npuw/partitioning/partitioning.cpp |  5 +++
 .../plugin/npuw/partitioning/partitioning.hpp | 11 ++++--
 6 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 563e99fcf2bad9..1f4959c4c88ed0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -288,6 +288,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
             m_compiled_submodels[id].closure = subgraph._closure;
             m_compiled_submodels[id].scales = subgraph._scales;
             m_compiled_submodels[id].zerops = subgraph._zerops;
+            m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall;
             m_compiled_submodels[id].update_required.resize(subgraph._closure.size(), false);
             fill_weights_bank(id);
         }  // if(!funcall)
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 1ddaf3f543eaa8..90095e0640406a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -126,6 +126,8 @@ class CompiledModel : public ov::ICompiledModel {
         std::vector<ov::Tensor> zerops;
         std::vector<bool> update_required;

+        bool forced_to_fcall = false;
+
         // FIXME: Take it out of structure
         ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
         bool switched_to_ref = false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 6638fbcbe12a57..13d22d04494e7f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -48,7 +48,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-            auto proto_comp_model = m_npuw_model->m_compiled_submodels[real_idx].compiled_model;
+            auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+            auto& proto_comp_model = proto_comp_model_desc.compiled_model;
             for (size_t out_idx = 0; out_idx < proto_comp_model->outputs().size(); out_idx++) {
                 const auto& port = proto_comp_model->outputs()[out_idx];
                 m_funcall_result[LinkFrom{i, out_idx}] =
@@ -63,12 +64,13 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
         auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
         if (comp_model_desc.replaced_by) {
             // a function call..
+            if (!is_pipelined(i)) {
+                LOG_INFO("Skip subgraph[" << i << "] as it is a single-call function");
+                continue;
+            }
             // Use real_id to accumulate information about
             // different functions
             const auto real_id = comp_model_desc.replaced_by.value();
@@ -331,8 +337,6 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
     const auto real_idx = comp_model_desc.replaced_by.value_or(idx);

-    LOG_DEBUG("Real idx is..." << real_idx);
-
     const bool do_copy = needs_copy(idx);
     const auto& iodesc = m_subrequests_gio.at(idx);
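Note on the new guard used above and throughout the rest of this diff: the scattered checks of m_use_function_pipelining become calls to is_pipelined(), whose definition is added at the end of this file's diff. A subgraph participates in function pipelining only if pipelining is enabled globally, the subgraph really is a function call (replaced_by is set), and the function was not forcibly made out of a single group. A standalone sketch of that decision, with simplified stand-in types rather than the actual NPUW classes (the real method also resolves the index through real() first):

    // Simplified stand-ins for the NPUW descriptor types -- illustration only.
    #include <cstddef>
    #include <optional>
    #include <vector>

    struct SubmodelDesc {
        std::optional<std::size_t> replaced_by;  // set only for function calls
        bool forced_to_fcall = false;            // set for single-call functions
    };

    struct NpuwModelSketch {
        bool use_function_pipelining = false;
        std::vector<SubmodelDesc> submodels;

        // A subgraph is pipelined only when pipelining is on, it is a real
        // function call, and it was not forcibly turned into a function.
        bool is_pipelined(std::size_t idx) const {
            const auto& d = submodels[idx];
            return use_function_pipelining && d.replaced_by.has_value() && !d.forced_to_fcall;
        }
    };

    int main() {
        NpuwModelSketch m;
        m.use_function_pipelining = true;
        m.submodels = {{std::nullopt, false},    // regular subgraph
                       {std::size_t{0}, false},  // call to a repeated function
                       {std::size_t{0}, true}};  // function forced from one group
        // Only the repeated-function call takes part in the pipeline.
        return (!m.is_pipelined(0) && m.is_pipelined(1) && !m.is_pipelined(2)) ? 0 : 1;
    }

With FUNCALL_FOR_ALL=YES and PIPELINE=NONE every subgraph falls into the third category, which is the combination this patch fixes.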
@@ -341,7 +345,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {

     // pick which subrequest we actually work on here
     auto subr = [&]() {
-        if (now_idx() && real_idx == real(now_idx().value()) && m_use_function_pipelining) {
+        if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) {
             LOG_DEBUG("Accessing the pipeline subrequest");
             // The real index of request we need to prepare IS
             // the same request which executes now AND
@@ -458,7 +462,7 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
     // 2. Unpack the function closure -- right here, if pipelining is not enabled.
     // If it is enabled, the flow is a little bit different - see run_subrequest_for_success()
     // for details.
-    if (!m_use_function_pipelining) {
+    if (!is_pipelined(idx)) {
         LOG_DEBUG("Unpacking closures...");
         LOG_BLOCK();
         unpack_closure(idx, m_subrequests[real_idx]);
@@ -555,7 +559,8 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
     auto real_idx = comp_model_desc.replaced_by.value_or(idx);

-    auto new_rqs = create_infer_requests(idx, m_use_function_pipelining ? 2 : 1);
+    const auto is_piped = is_pipelined(idx);
+    auto new_rqs = create_infer_requests(idx, is_piped ? 2 : 1);

     // NB: Regardless if this subrequest was a function call
     // or not, always use the real_idx here - for regular
@@ -563,7 +568,7 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
     // is critical here to update the function body, not the
     // function calls (which are left empty now in the vector)
     m_subrequests[real_idx] = new_rqs.at(0);
-    if (comp_model_desc.replaced_by && m_use_function_pipelining) {
+    if (is_piped) {
         m_funcall_pipeline[real_idx].subrequest = new_rqs.at(1);
     }
     // After an infer request is recreated, the internal cross-request
@@ -637,7 +642,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo
     if (job_done) {
         dump_output_tensors(idx);  // FIXME: Called here unconditionally, need to refactor
-        if (m_use_function_pipelining && m_funcall_pipeline[idx].next) {
+        if (is_pipelined(idx) && m_funcall_pipeline[idx].next) {
            // Swap the next (pipelined, semi-prepared) infer request in the chain
            // with the default (to be accessed next) one.
            std::swap(m_subrequests[real_idx], m_funcall_pipeline[real_idx].subrequest);
@@ -666,7 +671,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
         // The next subgraph is a call to the same function...
         // At this point, THIS infer request is already prepared.
         // Run it, then prepare it again for the next entrance
-        if (m_use_function_pipelining) {
+        if (is_pipelined(real_idx)) {
             // function pipelining is here! and the next rq is ours.
             NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx);
             during(this_subr, [&]() {
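The std::swap in run_subrequest_for_success() above, together with the "brother" subrequest declared in the header below, implements a two-request rotation: while one infer request executes the current call of a function, the other is being prepared for the next call, and the two change places when the call completes. A toy model of that rotation (plain tagged values standing in for infer requests; illustration only):

    #include <cstdio>
    #include <utility>

    struct FakeRequest {
        int id;  // stands in for a real infer request
    };

    int main() {
        FakeRequest active{0};   // m_subrequests[real_idx]
        FakeRequest brother{1};  // m_funcall_pipeline[real_idx].subrequest
        for (int call = 0; call < 4; ++call) {
            std::printf("call %d runs on request %d while request %d is prepared\n",
                        call, active.id, brother.id);
            std::swap(active, brother);  // the swap performed after each call
        }
        return 0;
    }

For a single-call function there is no next call to prepare, so keeping such functions out of this rotation - which is what is_pipelined() now does - is what makes the FUNCALL_FOR_ALL=YES + PIPELINE=NONE case sound.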
@@ -697,7 +702,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
                 bind_global_parameters(next_idx);
                 next_prepared = true;
             }
-            if (m_use_function_pipelining && m_funcall_pipeline[idx].next) {
+            if (is_pipelined(idx) && m_funcall_pipeline[idx].next) {
                 const auto my_next_idx = m_funcall_pipeline[idx].next.value();
                 LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << my_next_idx << "]...");
                 LOG_BLOCK();
@@ -745,3 +750,8 @@ bool ov::npuw::JustInferRequest::supports_async_pipeline() const {
 void ov::npuw::JustInferRequest::update_subrequest_links(std::size_t) {
     connect_subrequests();
 }
+
+bool ov::npuw::JustInferRequest::is_pipelined(std::size_t idx) const {
+    const auto& desc = m_npuw_model->m_compiled_submodels[real(idx)];
+    return m_use_function_pipelining && desc.replaced_by && !desc.forced_to_fcall;
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
index e63f2f18b85ece..ab6186785102d2 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -64,6 +64,7 @@ class JustInferRequest final : public IBaseInferRequest {
     using TensorPtr = ov::SoPtr<ov::ITensor>;
     std::map<LinkFrom, TensorPtr> m_funcall_result;

+    bool is_pipelined(std::size_t idx) const;
     bool m_use_function_pipelining = false;
     struct FuncallPipeline {
         // A "brother" subrequest for a "primary" subrequest. Initialized only
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 192d975509ce5e..5449ab603513fb 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -355,6 +355,7 @@ void Partitioner::identifySubgraphs() {
             group_nodes.insert(it->second);
         }
         group.sg._repeated_id = group.repeated_id;
+        group.sg._forced_to_fcall = group.forced_to_fcall;
         group.sg._gflops = group.gflops;
         group.sg._ops = group.all_layers.size();
         P.total_ops += group.sg._ops;
@@ -1450,6 +1451,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
     funcall._gflops = body_sg._gflops;  // preserving this is required for proper stats
     funcall._ops = body_sg._ops;        // preserving this is required for proper stats
     funcall._avoid_list = body_sg._avoid_list;
+    funcall._forced_to_fcall = body_sg._forced_to_fcall;

     // Declare a new function AND record a function call
     ov::npuw::Function function;
@@ -1549,6 +1551,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
         funcall._gflops = this_sg._gflops;          // duplicated code again!
         funcall._ops = this_sg._ops;                // duplicated code again!
         funcall._avoid_list = this_sg._avoid_list;  // duplicated code again!
+        funcall._forced_to_fcall = this_sg._forced_to_fcall;

         rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto);
         rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto);
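The three one-line copies above are all hand-offs in the same chain: Group::forced_to_fcall (set during partitioning) travels into Subgraph::_forced_to_fcall, and from there the CompiledModel constructor (first hunk of this patch) copies it into the submodel descriptor that is_pipelined() finally reads. A schematic trace of that chain, with each struct reduced to the one field in question (not the real NPUW types):

    #include <cassert>

    struct GroupSketch    { bool forced_to_fcall = false; };   // partitioning-time group
    struct SubgraphSketch { bool _forced_to_fcall = false; };  // partitioning result
    struct SubmodelSketch { bool forced_to_fcall = false; };   // compiled-model descriptor

    int main() {
        GroupSketch group;
        group.forced_to_fcall = true;  // the group was forcibly turned into a function

        SubgraphSketch sg;
        sg._forced_to_fcall = group.forced_to_fcall;  // identifySubgraphs() / funcall records

        SubmodelSketch desc;
        desc.forced_to_fcall = sg._forced_to_fcall;  // CompiledModel constructor

        // is_pipelined() sees the flag and keeps this funcall off the pipeline.
        assert(desc.forced_to_fcall);
        return 0;
    }

Losing the flag at any of these steps would silently re-enable pipelining for single-call functions, which is why each copy site matters.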
@@ -2017,6 +2020,8 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model>& model,
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -40,6 +40,8 @@ struct Subgraph {
     std::vector<ov::Tensor> _closure;
     std::vector<ov::Tensor> _scales;  // Scale coeffs for manual unpacking
     std::vector<ov::Tensor> _zerops;  // Zero points for manual unpacking
+    bool _forced_to_fcall = false;
+
     struct Gather {
         // NB.: int64_t is strange but it is used by OV to refer to parameters
         int64_t dst_idx = -1;
@@ -72,6 +74,11 @@ struct Group {
     std::string avoid_list;

+    // Set to true if the Group was forcibly turned into a function. Such a
+    // function has just a single associated funcall and is subject to some
+    // optimizations (simplifications).
+    bool forced_to_fcall = false;
+
     ov::npuw::Subgraph sg;
 };

From 544c4bc21363b4042c5d8ed5df1f33c9ea9fd885 Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Sat, 5 Oct 2024 21:07:05 +0000
Subject: [PATCH 2/3] NPUW: Fix FUNCALL_FOR_ALL + DCOFF (no DQ) combination

DCOFF rearranges closures, which breaks Host Gather - and Host Gather
is only activated with FUNCALL_FOR_ALL
---
 .../plugin/npuw/partitioning/partitioning.cpp |  2 +-
 .../npuw/partitioning/patterns/dcoff.cpp      | 30 ++++++++++++++++++-
 .../npuw/partitioning/patterns/dcoff.hpp      |  2 +-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 5449ab603513fb..e669f22a2862ac 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1885,7 +1885,7 @@ void Partitioner::decompressionCutOff(const std::string& func_name) {
             }

             // Finally, remove the function body's parameters here
-            ov::npuw::patterns::finalize_remap(f, closure_remap);
+            ov::npuw::patterns::finalize_remap(f, func_group.refs.front(), closure_remap);
         }  // if (CAST_SCALE && have(params_to_scale))
     }
     LOG_DEBUG("Function model inputs after the DCOFF:");
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index ffbece94b04176..c4b5c1ac7382e4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -136,14 +136,42 @@ void apply_remap(Subgraph& fcall, const ClosureRemap& m) {
     fcall._zerops = std::move(new_zerops);
 }

-void finalize_remap(Function& fbody, const ClosureRemap& m) {
+void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
     LOG_DEBUG("Removing retired parameters...");
     LOG_BLOCK();
+
+    // Unfortunate truth - this function has to be aware of the
+    // Host Gather existence to properly update indices after
+    // Remap.
+    using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
+    struct GatherParams {
+        PPtr pidx; // Parameter @ function body - input_ids
+        PPtr psrc; // Parameter @ function body - vocab tensor
+        PPtr pdst; // Parameter @ function body - gathered ids
+    };
+    GatherParams gather_params;
+    const auto &params = fbody._model->get_parameters();
+    if (fsg._host_gather.dst_idx != -1) {
+        gather_params = GatherParams {
+            params[fsg._host_gather.idx_idx],
+            params[fsg._host_gather.src_idx],
+            params[fsg._host_gather.dst_idx]
+        };
+    }
+
     for (auto&& p : m.params_to_remove) {
         LOG_DEBUG("Removing parameter " << p);
         LOG_BLOCK();
         fbody._model->remove_parameter(p);
     }
+
+    // Update indices for gather
+    if (fsg._host_gather.dst_idx != -1) {
+        fsg._host_gather.idx_idx = fbody._model->get_parameter_index(gather_params.pidx);
+        fsg._host_gather.src_idx = fbody._model->get_parameter_index(gather_params.psrc);
+        fsg._host_gather.dst_idx = fbody._model->get_parameter_index(gather_params.pdst);
+    }
+
     fbody._model->validate_nodes_and_infer_types();
     LOG_DEBUG("DONE");
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index 9bb3c132fa9c5d..e06420bbdea73c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -46,7 +46,7 @@ struct ClosureRemap {

 ClosureRemap build_remap(const Function& fbody, const DCOFFParams& p);
 void apply_remap(Subgraph& fcall, const ClosureRemap& m);
-void finalize_remap(Function& fbody, const ClosureRemap& m);
+void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m);

 // Various patterns here
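The Subgraph parameter added to finalize_remap() exists because ov::Model::remove_parameter() renumbers the surviving parameters: the positional indices stored in _host_gather go stale as soon as a retired parameter ahead of them is removed. Hence the pattern above: capture the Parameter pointers first, remove, then re-resolve the indices with get_parameter_index(). A self-contained demonstration of the renumbering with a hypothetical three-parameter model (the names vocab/ids only echo the Host Gather roles; this is not the NPUW function body):

    #include <cstdint>
    #include <memory>

    #include <openvino/core/model.hpp>
    #include <openvino/op/add.hpp>
    #include <openvino/op/parameter.hpp>

    int main() {
        auto retired = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto vocab = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto ids = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto out = std::make_shared<ov::op::v1::Add>(vocab, ids);
        auto model = std::make_shared<ov::Model>(ov::OutputVector{out},
                                                 ov::ParameterVector{retired, vocab, ids});

        const std::int64_t before = model->get_parameter_index(vocab);  // 1

        // Retire an unused parameter, like a DCOFF-folded scale input.
        model->remove_parameter(retired);
        model->validate_nodes_and_infer_types();

        const std::int64_t after = model->get_parameter_index(vocab);  // now 0

        return (before == 1 && after == 0) ? 0 : 1;
    }

This is also why the update is guarded by dst_idx != -1: when Host Gather was not set up for the function, there are no indices to refresh.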
From b4e60872fc2120106312efa29e38bc29ed30f7d0 Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Mon, 7 Oct 2024 14:38:10 +0000
Subject: [PATCH 3/3] NPUW FCFA fixes: clang-format

---
 .../plugin/npuw/partitioning/partitioning.hpp |  4 ++--
 .../npuw/partitioning/patterns/dcoff.cpp      | 18 ++++++++----------
 .../npuw/partitioning/patterns/dcoff.hpp      |  2 +-
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
index 23e2202154ea2f..dd1c3d9a27bcfe 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -35,8 +35,8 @@ struct Subgraph {
     //
     // FIXME: Replace with variant or some other proper way (maybe
     // even a class hierarchy)
-    std::string _repeated_id; // FIXME: What's the difference
-    std::string _funcall;     // ..between these two?
+    std::string _repeated_id;  // FIXME: What's the difference
+    std::string _funcall;      // ..between these two?
     std::vector<ov::Tensor> _closure;
     std::vector<ov::Tensor> _scales;  // Scale coeffs for manual unpacking
     std::vector<ov::Tensor> _zerops;  // Zero points for manual unpacking
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index c4b5c1ac7382e4..8536492aa862e0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -136,7 +136,7 @@ void apply_remap(Subgraph& fcall, const ClosureRemap& m) {
     fcall._zerops = std::move(new_zerops);
 }

-void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
+void finalize_remap(Function& fbody, Subgraph& fsg, const ClosureRemap& m) {
     LOG_DEBUG("Removing retired parameters...");
     LOG_BLOCK();

@@ -145,18 +145,16 @@ void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
     // Remap.
     using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
     struct GatherParams {
-        PPtr pidx; // Parameter @ function body - input_ids
-        PPtr psrc; // Parameter @ function body - vocab tensor
-        PPtr pdst; // Parameter @ function body - gathered ids
+        PPtr pidx;  // Parameter @ function body - input_ids
+        PPtr psrc;  // Parameter @ function body - vocab tensor
+        PPtr pdst;  // Parameter @ function body - gathered ids
     };
     GatherParams gather_params;
-    const auto &params = fbody._model->get_parameters();
+    const auto& params = fbody._model->get_parameters();
     if (fsg._host_gather.dst_idx != -1) {
-        gather_params = GatherParams {
-            params[fsg._host_gather.idx_idx],
-            params[fsg._host_gather.src_idx],
-            params[fsg._host_gather.dst_idx]
-        };
+        gather_params = GatherParams{params[fsg._host_gather.idx_idx],
+                                     params[fsg._host_gather.src_idx],
+                                     params[fsg._host_gather.dst_idx]};
     }

     for (auto&& p : m.params_to_remove) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index e06420bbdea73c..eae6d8aa1afbed 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -46,7 +46,7 @@ struct ClosureRemap {

 ClosureRemap build_remap(const Function& fbody, const DCOFFParams& p);
 void apply_remap(Subgraph& fcall, const ClosureRemap& m);
-void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m);
+void finalize_remap(Function& fbody, Subgraph& fsg, const ClosureRemap& m);

 // Various patterns here