Skip to content

Commit

Permalink
[microNPU] Add a pass to reorder copy and compute nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolaLancellotti committed Apr 20, 2022
1 parent 970f868 commit 88c0ea3
Show file tree
Hide file tree
Showing 8 changed files with 513 additions and 112 deletions.
1 change: 1 addition & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
mod = ethosu_passes.HoistAllocates()(mod)
mod = ethosu_passes.CopyComputeReordering()(mod)
disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
if not disable_storage_rewrite:
mod = tvm.tir.transform.StorageRewrite()(mod)
Expand Down
16 changes: 16 additions & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/passes.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,3 +815,19 @@ def HoistAllocates() -> tvm.IRModule:
The new module with hoisted allocate nodes.
"""
return _ffi_api.HoistAllocates()


def CopyComputeReordering() -> tvm.IRModule:
    """
    Reorder copy and compute nodes so that independent DMA copies and
    compute operations can run in parallel.

    Copies into buffers with "local" scope are left untouched: those copy the
    LUT into the SHRAM, which already happens in parallel with copying weights
    into the weights encoder.

    Returns
    -------
    tvm.IRModule
        The new module with copy and compute nodes reordered.
    """
    reordering_pass = _ffi_api.CopyComputeReordering()
    return reordering_pass
74 changes: 74 additions & 0 deletions src/tir/contrib/ethosu/passes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,80 @@ tvm::transform::Pass HoistAllocates() {

TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);

/*!
 * \brief Reorders copy and compute nodes in such a way that independent DMA copies
 * and computes happen in parallel.
 * Copies to buffers with local scope are not reordered: those copy the LUT
 * into the SHRAM, which already happens in parallel with copying weights into
 * the weights encoder.
 */
class CopyComputeReorderingMutator : public StmtExprMutator {
 public:
  CopyComputeReorderingMutator() {}

  /*! \brief Apply the reordering to the body of the given PrimFunc. */
  PrimFunc operator()(PrimFunc main_func) {
    auto n{main_func.CopyOnWrite()};
    n->body = this->VisitStmt(main_func->body);
    return GetRef<PrimFunc>(n);
  }

 private:
  /*!
   * \brief Visit a sequence of statements and hoist each copy into a
   * global-scope buffer above the compute node(s) preceding it, so the DMA
   * copy can overlap with that compute.
   */
  Stmt VisitStmt_(const SeqStmtNode* op) override {
    // Nothing to reorder in a sequence of zero or one statement.
    if (op->size() <= 1) {
      return StmtExprMutator::VisitStmt_(op);
    }

    auto seq_stmt{GetRef<SeqStmt>(op)};
    // Work on a mutable copy of the sequence; swaps happen in new_seq.
    std::vector<Stmt> new_seq(seq_stmt->size());
    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy

    for (size_t i{}; i < seq_stmt->size(); ++i) {
      Stmt stmt{seq_stmt[i]};
      // Every statement in the sequence is expected to be an evaluate of an
      // extern call, as produced by the microNPU TIR lowering.
      auto eval_node{stmt.as<EvaluateNode>()};
      ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
                        << stmt->GetTypeKey();
      auto call_node{eval_node->value.as<CallNode>()};
      ICHECK(call_node) << "Expected expression to be a call node, but was "
                        << eval_node->value->GetTypeKey();
      auto args{call_node->args};
      // args[0] is the extern function name; "ethosu_copy" marks a DMA copy.
      bool stmt_is_copy{args[0].as<StringImmNode>()->value == "ethosu_copy"};
      // args[3] is the copy's destination buffer (see the ethosu_copy call
      // signature: name, source, length, destination). Only copies into
      // "global" buffers are candidates for reordering.
      bool stmt_is_global_copy{stmt_is_copy &&
                               args[3].as<BufferLoadNode>()->buffer.scope() == "global"};
      // Swap a global copy with the statement before it when that statement is
      // not itself a copy. Note: the flag is deliberately NOT updated on the
      // swap branch, so a run of consecutive global copies after a compute all
      // bubble up past that compute.
      if (!previous_stmt_is_copy && stmt_is_global_copy) {
        std::swap(new_seq[i], new_seq[i - 1]);
      } else {
        previous_stmt_is_copy = stmt_is_copy;
      }
    }

    auto n{CopyOnWrite(op)};
    n->seq = std::move(new_seq);
    return Stmt{n};
  }
};

/*!
 * \brief A pass to reorder copy and compute nodes in such a way that independent
 * DMA copies and computes happen in parallel.
 *
 * \return tvm::transform::Pass
 */
tvm::transform::Pass CopyComputeReordering() {
  // The lambda uses no captured state, so an empty capture list is used
  // instead of the original [=] capture-default (capture nothing you don't use).
  auto pass_func = [](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
    // The pass expects the module produced by LowerToTIR(): a single
    // primitive function named "main".
    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))
        << "Expected a single primitive function called 'main'. Please run the "
           "CopyComputeReordering "
           "pass in conjunction with the LowerToTIR() pass.";
    return CopyComputeReorderingMutator()(f);
  };
  return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
                                                 "tir.contrib.ethos-u.CopyComputeReordering", {});
}

TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering")
.set_body_typed(CopyComputeReordering);

} // namespace ethosu
} // namespace contrib
} // namespace tir
Expand Down
307 changes: 307 additions & 0 deletions tests/python/contrib/test_ethosu/test_copy_compute_reordering.py

Large diffs are not rendered by default.

125 changes: 65 additions & 60 deletions tests/python/contrib/test_ethosu/test_encode_constants.py

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1423344),
("ethos-u65-256", MOBILENET_V2_URL, 2185584),
("ethos-u55-256", MOBILENET_V1_URL, 1423344),
("ethos-u55-256", MOBILENET_V2_URL, 2185584),
("ethos-u55-128", MOBILENET_V2_URL, 2185584),
("ethos-u55-64", MOBILENET_V2_URL, 2185584),
("ethos-u55-32", MOBILENET_V2_URL, 2185584),
("ethos-u65-256", MOBILENET_V1_URL, 1892704),
("ethos-u65-256", MOBILENET_V2_URL, 2257984),
("ethos-u55-256", MOBILENET_V1_URL, 1892704),
("ethos-u55-256", MOBILENET_V2_URL, 2257984),
("ethos-u55-128", MOBILENET_V2_URL, 2257984),
("ethos-u55-64", MOBILENET_V2_URL, 2257984),
("ethos-u55-32", MOBILENET_V2_URL, 2258000),
],
)
def test_networks_without_usmp(accel_type, model_url, workspace_size):
Expand All @@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1205872),
("ethos-u55-256", MOBILENET_V2_URL, 1507152),
("ethos-u65-256", MOBILENET_V1_URL, 1206880),
("ethos-u55-256", MOBILENET_V2_URL, 1509408),
],
)
def test_networks_with_usmp(accel_type, model_url, workspace_size):
Expand Down
42 changes: 21 additions & 21 deletions tests/python/contrib/test_ethosu/test_replace_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@ class ReferenceModule:
def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
buffer = T.buffer_decl([80], "uint8")
buffer_1 = T.buffer_decl([304], "uint8")
buffer1 = T.buffer_decl([304], "uint8")
buffer2 = T.buffer_decl([80], "uint8")
T.preflattened_buffer(placeholder_3, [1, 16, 16, 32], dtype="int8", data=placeholder_3.data)
T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 8], dtype="int8", data=ethosu_write_1.data)
# body
placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 304, placeholder_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer[0], 80, placeholder_d_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, 12, placeholder_d_global[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, 12, p2[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down Expand Up @@ -80,23 +80,23 @@ class WeightStream:
def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
buffer = T.buffer_decl([416], "uint8")
buffer_1 = T.buffer_decl([112], "uint8")
buffer_2 = T.buffer_decl([272], "uint8")
buffer_3 = T.buffer_decl([64], "uint8")
buffer1 = T.buffer_decl([416], "uint8")
buffer2 = T.buffer_decl([112], "uint8")
buffer3 = T.buffer_decl([272], "uint8")
buffer4 = T.buffer_decl([64], "uint8")
T.preflattened_buffer(placeholder_5, [1, 16, 16, 32], dtype="int8", data=placeholder_5.data)
T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data)
# body
placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data)
placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data)
T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, 12, placeholder_d_global_unrolled_iter_0[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, 12, placeholder_d_global_unrolled_iter_1[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
p1 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
p3 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True})
p4 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 416, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 112, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 272, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 64, p4[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 416, 12, p2[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p3[0], 272, 12, p4[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down
42 changes: 20 additions & 22 deletions tests/python/contrib/test_ethosu/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,29 +180,27 @@ def test_schedule_cache_reads():
@tvm.script.ir_module
class DiamondGraphTir:
@T.prim_func
def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None:
def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None:
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data)
T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data)

weight_buffer = T.buffer_decl([2608], "uint8")
bias_buffer = T.buffer_decl([240], "uint8")
weight_buffer2 = T.buffer_decl([736], "uint8")
bias_buffer2 = T.buffer_decl([240], "uint8")

weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data)
bias_global = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})
featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})

T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data)
T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data)
buffer1 = T.buffer_decl([2608], "uint8")
buffer2 = T.buffer_decl([240], "uint8")
buffer3 = T.buffer_decl([736], "uint8")
buffer4 = T.buffer_decl([240], "uint8")
p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True})
p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, 12, p2[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, 12, p4[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down

0 comments on commit 88c0ea3

Please sign in to comment.