From 2aed7d8d9d23ed83ca2d427fa9d28e0c934a2f7a Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Sat, 5 Oct 2024 18:56:25 +0000
Subject: [PATCH 1/3] NPUW: Fix FUNCALL_FOR_ALL=YES + PIPELINE=NONE

- In this case, function pipelining is enabled but there's no "next" rq
  to prepare
- Generally, exclude FCEW subgraphs from function pipelining
---
 .../src/plugin/npuw/compiled_model.cpp        |  1 +
 .../src/plugin/npuw/compiled_model.hpp        |  2 ++
 .../plugin/npuw/just_sync_infer_request.cpp   | 34 ++++++++++++-------
 .../plugin/npuw/just_sync_infer_request.hpp   |  1 +
 .../plugin/npuw/partitioning/partitioning.cpp |  5 +++
 .../plugin/npuw/partitioning/partitioning.hpp | 11 ++++--
 6 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 563e99fcf2bad9..1f4959c4c88ed0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -288,6 +288,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
             m_compiled_submodels[id].closure = subgraph._closure;
             m_compiled_submodels[id].scales = subgraph._scales;
             m_compiled_submodels[id].zerops = subgraph._zerops;
+            m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall;
             m_compiled_submodels[id].update_required.resize(subgraph._closure.size(), false);
             fill_weights_bank(id);
         }  // if(!funcall)
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 1ddaf3f543eaa8..90095e0640406a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -126,6 +126,8 @@ class CompiledModel : public ov::ICompiledModel {
         std::vector<ov::Tensor> zerops;
         std::vector<bool> update_required;

+        bool forced_to_fcall = false;
+
         // FIXME: Take it out of structure
         ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
         bool switched_to_ref = false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 6638fbcbe12a57..13d22d04494e7f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -48,7 +48,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-            auto proto_comp_model = m_npuw_model->m_compiled_submodels[real_idx].compiled_model;
+            auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+            auto& proto_comp_model = proto_comp_model_desc.compiled_model;
             for (size_t out_idx = 0; out_idx < proto_comp_model->outputs().size(); out_idx++) {
                 const auto& port = proto_comp_model->outputs()[out_idx];
                 m_funcall_result[LinkFrom{i, out_idx}] =
@@ -63,12 +64,13 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
         auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
         if (comp_model_desc.replaced_by) {
             // a function call..
+            if (!is_pipelined(i)) {
+                LOG_INFO("Skip subgraph[" << i << "] as it is a single-call function");
+                continue;
+            }
             // Use real_id to accumulate information about
             // different functions
             const auto real_id = comp_model_desc.replaced_by.value();
@@ -331,8 +337,6 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
     const auto real_idx = comp_model_desc.replaced_by.value_or(idx);

-    LOG_DEBUG("Real idx is..." << real_idx);
-
     const bool do_copy = needs_copy(idx);
     const auto& iodesc = m_subrequests_gio.at(idx);
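Note on the new guard used above and throughout the rest of this diff: the scattered checks of m_use_function_pipelining become calls to is_pipelined(), whose definition is added at the end of this file's diff. A subgraph participates in function pipelining only if pipelining is enabled globally, the subgraph really is a function call (replaced_by is set), and the function was not forcibly made out of a single group. A standalone sketch of that decision, with simplified stand-in types rather than the actual NPUW classes (the real method also resolves the index through real() first):

    // Simplified stand-ins for the NPUW descriptor types -- illustration only.
    #include <cstddef>
    #include <optional>
    #include <vector>

    struct SubmodelDesc {
        std::optional<std::size_t> replaced_by;  // set only for function calls
        bool forced_to_fcall = false;            // set for single-call functions
    };

    struct NpuwModelSketch {
        bool use_function_pipelining = false;
        std::vector<SubmodelDesc> submodels;

        // A subgraph is pipelined only when pipelining is on, it is a real
        // function call, and it was not forcibly turned into a function.
        bool is_pipelined(std::size_t idx) const {
            const auto& d = submodels[idx];
            return use_function_pipelining && d.replaced_by.has_value() && !d.forced_to_fcall;
        }
    };

    int main() {
        NpuwModelSketch m;
        m.use_function_pipelining = true;
        m.submodels = {{std::nullopt, false},    // regular subgraph
                       {std::size_t{0}, false},  // call to a repeated function
                       {std::size_t{0}, true}};  // function forced from one group
        // Only the repeated-function call takes part in the pipeline.
        return (!m.is_pipelined(0) && m.is_pipelined(1) && !m.is_pipelined(2)) ? 0 : 1;
    }

With FUNCALL_FOR_ALL=YES and PIPELINE=NONE every subgraph falls into the third category, which is the combination this patch fixes.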
@@ -341,7 +345,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {

     // pick which subrequest we actually work on here
     auto subr = [&]() {
-        if (now_idx() && real_idx == real(now_idx().value()) && m_use_function_pipelining) {
+        if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) {
             LOG_DEBUG("Accessing the pipeline subrequest");
             // The real index of request we need to prepare IS
             // the same request which executes now AND
@@ -458,7 +462,7 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) {
     // 2. Unpack the function closure -- right here, if pipelining is not enabled.
     // If it is enabled, the flow is a little bit different - see run_subrequest_for_success()
     // for details.
-    if (!m_use_function_pipelining) {
+    if (!is_pipelined(idx)) {
         LOG_DEBUG("Unpacking closures...");
         LOG_BLOCK();
         unpack_closure(idx, m_subrequests[real_idx]);
@@ -555,7 +559,8 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
     auto real_idx = comp_model_desc.replaced_by.value_or(idx);

-    auto new_rqs = create_infer_requests(idx, m_use_function_pipelining ? 2 : 1);
+    const auto is_piped = is_pipelined(idx);
+    auto new_rqs = create_infer_requests(idx, is_piped ? 2 : 1);

     // NB: Regardless if this subrequest was a function call
     // or not, always use the real_idx here - for regular
@@ -563,7 +568,7 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
     // is critical here to update the function body, not the
     // function calls (which are left empty now in the vector)
     m_subrequests[real_idx] = new_rqs.at(0);
-    if (comp_model_desc.replaced_by && m_use_function_pipelining) {
+    if (is_piped) {
         m_funcall_pipeline[real_idx].subrequest = new_rqs.at(1);
     }
     // After an infer request is recreated, the internal cross-request
@@ -637,7 +642,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo
     if (job_done) {
         dump_output_tensors(idx);  // FIXME: Called here unconditionally, need to refactor
-        if (m_use_function_pipelining && m_funcall_pipeline[idx].next) {
+        if (is_pipelined(idx) && m_funcall_pipeline[idx].next) {
            // Swap the next (pipelined, semi-prepared) infer request in the chain
            // with the default (to be accessed next) one.
            std::swap(m_subrequests[real_idx], m_funcall_pipeline[real_idx].subrequest);
@@ -666,7 +671,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
         // The next subgraph is a call to the same function...
         // At this point, THIS infer request is already prepared.
         // Run it, then prepare it again for the next entrance
-        if (m_use_function_pipelining) {
+        if (is_pipelined(real_idx)) {
             // function pipelining is here! and the next rq is ours.
             NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx);
             during(this_subr, [&]() {
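The std::swap in run_subrequest_for_success() above, together with the "brother" subrequest declared in the header below, implements a two-request rotation: while one infer request executes the current call of a function, the other is being prepared for the next call, and the two change places when the call completes. A toy model of that rotation (plain tagged values standing in for infer requests; illustration only):

    #include <cstdio>
    #include <utility>

    struct FakeRequest {
        int id;  // stands in for a real infer request
    };

    int main() {
        FakeRequest active{0};   // m_subrequests[real_idx]
        FakeRequest brother{1};  // m_funcall_pipeline[real_idx].subrequest
        for (int call = 0; call < 4; ++call) {
            std::printf("call %d runs on request %d while request %d is prepared\n",
                        call, active.id, brother.id);
            std::swap(active, brother);  // the swap performed after each call
        }
        return 0;
    }

For a single-call function there is no next call to prepare, so keeping such functions out of this rotation - which is what is_pipelined() now does - is what makes the FUNCALL_FOR_ALL=YES + PIPELINE=NONE case sound.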
@@ -697,7 +702,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
                 bind_global_parameters(next_idx);
                 next_prepared = true;
             }
-            if (m_use_function_pipelining && m_funcall_pipeline[idx].next) {
+            if (is_pipelined(idx) && m_funcall_pipeline[idx].next) {
                 const auto my_next_idx = m_funcall_pipeline[idx].next.value();
                 LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << my_next_idx << "]...");
                 LOG_BLOCK();
@@ -745,3 +750,8 @@ bool ov::npuw::JustInferRequest::supports_async_pipeline() const {
 void ov::npuw::JustInferRequest::update_subrequest_links(std::size_t) {
     connect_subrequests();
 }
+
+bool ov::npuw::JustInferRequest::is_pipelined(std::size_t idx) const {
+    const auto& desc = m_npuw_model->m_compiled_submodels[real(idx)];
+    return m_use_function_pipelining && desc.replaced_by && !desc.forced_to_fcall;
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
index e63f2f18b85ece..ab6186785102d2 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -64,6 +64,7 @@ class JustInferRequest final : public IBaseInferRequest {
     using TensorPtr = ov::SoPtr<ov::ITensor>;
     std::map<LinkFrom, TensorPtr> m_funcall_result;

+    bool is_pipelined(std::size_t idx) const;
     bool m_use_function_pipelining = false;
     struct FuncallPipeline {
         // A "brother" subrequest for a "primary" subrequest. Initialized only
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 192d975509ce5e..5449ab603513fb 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -355,6 +355,7 @@ void Partitioner::identifySubgraphs() {
             group_nodes.insert(it->second);
         }
         group.sg._repeated_id = group.repeated_id;
+        group.sg._forced_to_fcall = group.forced_to_fcall;
         group.sg._gflops = group.gflops;
         group.sg._ops = group.all_layers.size();
         P.total_ops += group.sg._ops;
@@ -1450,6 +1451,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
     funcall._gflops = body_sg._gflops;  // preserving this is required for proper stats
     funcall._ops = body_sg._ops;        // preserving this is required for proper stats
     funcall._avoid_list = body_sg._avoid_list;
+    funcall._forced_to_fcall = body_sg._forced_to_fcall;

     // Declare a new function AND record a function call
     ov::npuw::Function function;
@@ -1549,6 +1551,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
         funcall._gflops = this_sg._gflops;          // duplicated code again!
         funcall._ops = this_sg._ops;                // duplicated code again!
         funcall._avoid_list = this_sg._avoid_list;  // duplicated code again!
+        funcall._forced_to_fcall = this_sg._forced_to_fcall;

         rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto);
         rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto);
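The three one-line copies above are all hand-offs in the same chain: Group::forced_to_fcall (set during partitioning) travels into Subgraph::_forced_to_fcall, and from there the CompiledModel constructor (first hunk of this patch) copies it into the submodel descriptor that is_pipelined() finally reads. A schematic trace of that chain, with each struct reduced to the one field in question (not the real NPUW types):

    #include <cassert>

    struct GroupSketch    { bool forced_to_fcall = false; };   // partitioning-time group
    struct SubgraphSketch { bool _forced_to_fcall = false; };  // partitioning result
    struct SubmodelSketch { bool forced_to_fcall = false; };   // compiled-model descriptor

    int main() {
        GroupSketch group;
        group.forced_to_fcall = true;  // the group was forcibly turned into a function

        SubgraphSketch sg;
        sg._forced_to_fcall = group.forced_to_fcall;  // identifySubgraphs() / funcall records

        SubmodelSketch desc;
        desc.forced_to_fcall = sg._forced_to_fcall;  // CompiledModel constructor

        // is_pipelined() sees the flag and keeps this funcall off the pipeline.
        assert(desc.forced_to_fcall);
        return 0;
    }

Losing the flag at any of these steps would silently re-enable pipelining for single-call functions, which is why each copy site matters.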
@@ -2017,6 +2020,8 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model>& model,
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -40,6 +40,8 @@ struct Subgraph {
     std::vector<ov::Tensor> _closure;
     std::vector<ov::Tensor> _scales;  // Scale coeffs for manual unpacking
     std::vector<ov::Tensor> _zerops;  // Zero points for manual unpacking
+    bool _forced_to_fcall = false;
+
     struct Gather {
         // NB.: int64_t is strange but it is used by OV to refer to parameters
         int64_t dst_idx = -1;
@@ -72,6 +74,11 @@ struct Group {
     std::string avoid_list;

+    // Set to true if the Group was forcibly turned into a function. Such a
+    // function has just a single associated funcall and is subject to some
+    // optimizations (simplifications).
+    bool forced_to_fcall = false;
+
     ov::npuw::Subgraph sg;
 };

From 544c4bc21363b4042c5d8ed5df1f33c9ea9fd885 Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Sat, 5 Oct 2024 21:07:05 +0000
Subject: [PATCH 2/3] NPUW: Fix FUNCALL_FOR_ALL + DCOFF (no DQ) combination

DCOFF rearranges closures, which breaks Host Gather - and Host Gather
is only activated with FUNCALL_FOR_ALL
---
 .../plugin/npuw/partitioning/partitioning.cpp |  2 +-
 .../npuw/partitioning/patterns/dcoff.cpp      | 30 ++++++++++++++++++-
 .../npuw/partitioning/patterns/dcoff.hpp      |  2 +-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 5449ab603513fb..e669f22a2862ac 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1885,7 +1885,7 @@ void Partitioner::decompressionCutOff(const std::string& func_name) {
             }

             // Finally, remove the function body's parameters here
-            ov::npuw::patterns::finalize_remap(f, closure_remap);
+            ov::npuw::patterns::finalize_remap(f, func_group.refs.front(), closure_remap);
         }  // if (CAST_SCALE && have(params_to_scale))
     }
     LOG_DEBUG("Function model inputs after the DCOFF:");
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index ffbece94b04176..c4b5c1ac7382e4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -136,14 +136,42 @@ void apply_remap(Subgraph& fcall, const ClosureRemap& m) {
     fcall._zerops = std::move(new_zerops);
 }

-void finalize_remap(Function& fbody, const ClosureRemap& m) {
+void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
     LOG_DEBUG("Removing retired parameters...");
     LOG_BLOCK();
+
+    // Unfortunate truth - this function has to be aware of the
+    // Host Gather existence to properly update indices after
+    // Remap.
+    using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
+    struct GatherParams {
+        PPtr pidx; // Parameter @ function body - input_ids
+        PPtr psrc; // Parameter @ function body - vocab tensor
+        PPtr pdst; // Parameter @ function body - gathered ids
+    };
+    GatherParams gather_params;
+    const auto &params = fbody._model->get_parameters();
+    if (fsg._host_gather.dst_idx != -1) {
+        gather_params = GatherParams {
+            params[fsg._host_gather.idx_idx],
+            params[fsg._host_gather.src_idx],
+            params[fsg._host_gather.dst_idx]
+        };
+    }
+
     for (auto&& p : m.params_to_remove) {
         LOG_DEBUG("Removing parameter " << p);
         LOG_BLOCK();
         fbody._model->remove_parameter(p);
     }
+
+    // Update indices for gather
+    if (fsg._host_gather.dst_idx != -1) {
+        fsg._host_gather.idx_idx = fbody._model->get_parameter_index(gather_params.pidx);
+        fsg._host_gather.src_idx = fbody._model->get_parameter_index(gather_params.psrc);
+        fsg._host_gather.dst_idx = fbody._model->get_parameter_index(gather_params.pdst);
+    }
+
     fbody._model->validate_nodes_and_infer_types();
     LOG_DEBUG("DONE");
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index 9bb3c132fa9c5d..e06420bbdea73c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -46,7 +46,7 @@ struct ClosureRemap {

 ClosureRemap build_remap(const Function& fbody, const DCOFFParams& p);
 void apply_remap(Subgraph& fcall, const ClosureRemap& m);
-void finalize_remap(Function& fbody, const ClosureRemap& m);
+void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m);

 // Various patterns here
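The Subgraph parameter added to finalize_remap() exists because ov::Model::remove_parameter() renumbers the surviving parameters: the positional indices stored in _host_gather go stale as soon as a retired parameter ahead of them is removed. Hence the pattern above: capture the Parameter pointers first, remove, then re-resolve the indices with get_parameter_index(). A self-contained demonstration of the renumbering with a hypothetical three-parameter model (the names vocab/ids only echo the Host Gather roles; this is not the NPUW function body):

    #include <cstdint>
    #include <memory>

    #include <openvino/core/model.hpp>
    #include <openvino/op/add.hpp>
    #include <openvino/op/parameter.hpp>

    int main() {
        auto retired = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto vocab = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto ids = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1});
        auto out = std::make_shared<ov::op::v1::Add>(vocab, ids);
        auto model = std::make_shared<ov::Model>(ov::OutputVector{out},
                                                 ov::ParameterVector{retired, vocab, ids});

        const std::int64_t before = model->get_parameter_index(vocab);  // 1

        // Retire an unused parameter, like a DCOFF-folded scale input.
        model->remove_parameter(retired);
        model->validate_nodes_and_infer_types();

        const std::int64_t after = model->get_parameter_index(vocab);  // now 0

        return (before == 1 && after == 0) ? 0 : 1;
    }

This is also why the update is guarded by dst_idx != -1: when Host Gather was not set up for the function, there are no indices to refresh.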
From b4e60872fc2120106312efa29e38bc29ed30f7d0 Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Mon, 7 Oct 2024 14:38:10 +0000
Subject: [PATCH 3/3] NPUW FCFA fixes: clang-format

---
 .../plugin/npuw/partitioning/partitioning.hpp |  4 ++--
 .../npuw/partitioning/patterns/dcoff.cpp      | 18 ++++++++----------
 .../npuw/partitioning/patterns/dcoff.hpp      |  2 +-
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
index 23e2202154ea2f..dd1c3d9a27bcfe 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -35,8 +35,8 @@ struct Subgraph {
     //
     // FIXME: Replace with variant or some other proper way (maybe
     // even a class hierarchy)
-    std::string _repeated_id; // FIXME: What's the difference
-    std::string _funcall;     // ..between these two?
+    std::string _repeated_id;  // FIXME: What's the difference
+    std::string _funcall;      // ..between these two?
     std::vector<ov::Tensor> _closure;
     std::vector<ov::Tensor> _scales;  // Scale coeffs for manual unpacking
     std::vector<ov::Tensor> _zerops;  // Zero points for manual unpacking
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index c4b5c1ac7382e4..8536492aa862e0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -136,7 +136,7 @@ void apply_remap(Subgraph& fcall, const ClosureRemap& m) {
     fcall._zerops = std::move(new_zerops);
 }

-void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
+void finalize_remap(Function& fbody, Subgraph& fsg, const ClosureRemap& m) {
     LOG_DEBUG("Removing retired parameters...");
     LOG_BLOCK();

@@ -145,18 +145,16 @@ void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m) {
     // Remap.
     using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
     struct GatherParams {
-        PPtr pidx; // Parameter @ function body - input_ids
-        PPtr psrc; // Parameter @ function body - vocab tensor
-        PPtr pdst; // Parameter @ function body - gathered ids
+        PPtr pidx;  // Parameter @ function body - input_ids
+        PPtr psrc;  // Parameter @ function body - vocab tensor
+        PPtr pdst;  // Parameter @ function body - gathered ids
     };
     GatherParams gather_params;
-    const auto &params = fbody._model->get_parameters();
+    const auto& params = fbody._model->get_parameters();
     if (fsg._host_gather.dst_idx != -1) {
-        gather_params = GatherParams {
-            params[fsg._host_gather.idx_idx],
-            params[fsg._host_gather.src_idx],
-            params[fsg._host_gather.dst_idx]
-        };
+        gather_params = GatherParams{params[fsg._host_gather.idx_idx],
+                                     params[fsg._host_gather.src_idx],
+                                     params[fsg._host_gather.dst_idx]};
     }

     for (auto&& p : m.params_to_remove) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index e06420bbdea73c..eae6d8aa1afbed 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -46,7 +46,7 @@ struct ClosureRemap {

 ClosureRemap build_remap(const Function& fbody, const DCOFFParams& p);
 void apply_remap(Subgraph& fcall, const ClosureRemap& m);
-void finalize_remap(Function& fbody, Subgraph &fsg, const ClosureRemap& m);
+void finalize_remap(Function& fbody, Subgraph& fsg, const ClosureRemap& m);

 // Various patterns here