Rebase and respond to comments
ekalda committed Aug 12, 2022
1 parent 0d0c361 commit dd29f9f
Showing 4 changed files with 136 additions and 227 deletions.
1 change: 1 addition & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
mod = ethosu_passes.HoistAllocates()(mod)
+ mod = tvm.tir.transform.RemoveNoOp()(mod)
# MergeConstant pass currently does not support striped schedules.
# It requires further investigation.
if not util.is_striping_enabled():
19 changes: 16 additions & 3 deletions python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -926,9 +926,22 @@ def CopyComputeReordering(
into the SHRAM and that already happens in parallel with copying weights into
the weights encoder.
- If reorder_by_cycles is set, we use the cycle hint to decide the reordering. If it is not set,
- we move the copies up by a fixed number of movements, either by max_copy_movements if it is
- specified, or by default value of 1.
+ If reorder_by_cycles is set, we use the compute_cycles_hint to decide the reordering. If it is
+ not set, we move the copies up by a fixed number of movements, either by max_copy_movements if
+ it is specified, or by the default value of 1.
+ If reordering based on the cycle count is enabled, we try to achieve further copy latency
+ hiding with a two-step algorithm:
+ (1) Move every global copy (i.e. a copy that moves a constant into SRAM for conv2d or
+ depthwise_conv2d) above a preceding compute op. If, in general, the computes take longer than
+ the copies, this should be enough to hide the copy latencies.
+ (2) If some global copies take longer than the computes, we might be able to hide them further
+ by moving them further up the graph, since a graph generally contains more compute ops than
+ copy ops (only conv2d and depthwise_conv2d have constants associated with them). The algorithm
+ checks whether a copy is hidden; if it is not, it checks whether the preceding compute op has
+ a preceding copy of its own, and if it does not, it moves the copy we are trying to hide
+ further up. It keeps moving the copy until it cannot move any further or until the latency is
+ hidden.
Parameters
----------
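To make the two-step strategy described in the docstring concrete, here is a minimal Python sketch of step (1). It is not the TVM implementation: the op representation (("copy" | "compute", cycles) tuples) and the function name are invented. Step (2) is sketched alongside the C++ hunk below.

# Minimal sketch of step (1): move each global copy above the compute op
# that precedes it, so the copy's DMA can overlap with that compute.
def hoist_copies_above_computes(ops):
    reordered = list(ops)
    for idx in range(1, len(reordered)):
        kind, _ = reordered[idx]
        prev_kind, _ = reordered[idx - 1]
        if kind == "copy" and prev_kind == "compute":
            reordered[idx - 1], reordered[idx] = reordered[idx], reordered[idx - 1]
    return reordered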
50 changes: 9 additions & 41 deletions src/tir/contrib/ethosu/passes.cc
@@ -278,14 +278,14 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
for (size_t idx = ops.size() - 1; idx > 0; --idx) {
if (ops[idx].global_copy.as<AttrStmtNode>()) {
// Check whether the copy is hidden
- int64_t copy_cycles{get_cycles(ops[idx].global_copy)};
- int64_t compute_cycles{get_cycles(ops[idx].compute_op)};
+ int64_t copy_cycles{GetStmtCycles(ops[idx].global_copy)};
+ int64_t compute_cycles{GetStmtCycles(ops[idx].compute_op)};
bool is_hidden = compute_cycles >= copy_cycles;

// If the previous compute op is not already hiding another copy, move the copy back, so
// that it would be hidden by multiple computes
while (!is_hidden && !ops[idx - 1].global_copy.as<AttrStmtNode>() && (idx > 0)) {
- int64_t new_compute_cycles{get_cycles(ops[idx - 1].compute_op)};
+ int64_t new_compute_cycles{GetStmtCycles(ops[idx - 1].compute_op)};
ops[idx - 1].global_copy = ops[idx].global_copy;
ops[idx].global_copy = {};
compute_cycles += new_compute_cycles;
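The while loop above is the core of step (2). A rough Python equivalent follows; since the hunk is truncated here, it assumes the loop also updates is_hidden and decrements idx, and the dict keys copy_cycles and compute_cycles are invented stand-ins for the cycle hints.

# Each ops entry pairs an optional global copy (copy_cycles is None when the
# slot holds no copy) with a compute op's cycle count.
def hide_long_copies(ops):
    idx = len(ops) - 1
    while idx > 0:
        if ops[idx]["copy_cycles"] is not None:
            copy_cycles = ops[idx]["copy_cycles"]
            compute_cycles = ops[idx]["compute_cycles"]
            hidden = compute_cycles >= copy_cycles
            # Move the copy up past computes that are not already hiding
            # another copy; several computes together can cover its latency.
            while not hidden and idx > 0 and ops[idx - 1]["copy_cycles"] is None:
                ops[idx - 1]["copy_cycles"] = ops[idx]["copy_cycles"]
                ops[idx]["copy_cycles"] = None
                compute_cycles += ops[idx - 1]["compute_cycles"]
                hidden = compute_cycles >= copy_cycles
                idx -= 1
        idx -= 1

For example, a 10-cycle copy preceded by 3- and 5-cycle computes ends up above both computes but remains partially exposed, since 8 < 10 and there is nothing further up to move past.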
@@ -317,7 +317,8 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
for (size_t index = 0; index < new_seq.size(); ++index) {
if (GetStmtType(new_seq[index]) == StmtType::global_copy) {
int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
- for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) {
+ for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute);
+      --i) {
std::swap(new_seq[i - 1], new_seq[i]);
}
}
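For comparison, the loop above is the fixed-movement path used when reorder_by_cycles is not set. A hedged Python sketch, with kind_of as an invented stand-in for GetStmtType:

# Each global copy bubbles up past at most max_copy_movements immediately
# preceding compute statements.
def reorder_fixed(stmts, max_copy_movements, kind_of):
    for index in range(len(stmts)):
        if kind_of(stmts[index]) != "global_copy":
            continue
        lower = max(0, index - max_copy_movements)
        i = index
        while i > lower and kind_of(stmts[i - 1]) == "compute":
            stmts[i - 1], stmts[i] = stmts[i], stmts[i - 1]
            i -= 1
    return stmts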
@@ -329,44 +330,11 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
return Stmt{seq_stmt_node};
}

- int64_t get_cycles(const Stmt& stmt) {
- auto attr_node{stmt.as<AttrStmtNode>()};
- ICHECK(attr_node) << "The cycle count attribute is missing";
- return attr_node->value.as<IntImmNode>()->value;
- }
-
- tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
- Stmt eval_stmt = stmt;
- if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
- eval_stmt = attr_stmt->body;
- }
-
- auto eval_node{eval_stmt.as<EvaluateNode>()};
- ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
- << eval_stmt->GetTypeKey();
- auto call_node{eval_node->value.as<CallNode>()};
- ICHECK(call_node) << "Expected expression to be a call node, but was "
- << eval_node->value->GetTypeKey();
- return call_node->args;
- }
+ bool stmt_is_global_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::global_copy; }

- bool stmt_is_global_copy(const Stmt& stmt) {
- auto args{get_stmt_args(stmt)};
- return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
- args[3].as<BufferLoadNode>()->buffer.scope() == "global";
- }
-
- bool stmt_is_local_copy(const Stmt& stmt) {
- auto args{get_stmt_args(stmt)};
- return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
- args[3].as<BufferLoadNode>()->buffer.scope() == "local";
- }
-
- bool stmt_is_copy(const Stmt& stmt) {
- return stmt_is_global_copy(stmt) || stmt_is_local_copy(stmt);
- }
+ bool stmt_is_local_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::local_copy; }

- bool stmt_is_compute_op(const Stmt& stmt) { return !stmt_is_copy(stmt); }
+ bool stmt_is_compute_op(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::compute; }

/*! The maximum number of movements allowed for a copy. */
int _max_copy_movements;
@@ -401,7 +369,7 @@ tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements,
ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value());
auto reorder = reorder_by_cycles.value_or(
ctx->GetConfig(kCopyComputeReorderingReorderByCycles, Bool(false)).value());
- return CopyComputeReorderingMutator(copy_movements, reorder)(f);
+ return CopyComputeReorderingMutator(copy_movements.IntValue(), reorder)(f);
};
return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
"tir.contrib.ethos-u.CopyComputeReordering", {});