Rebase and respond to comments
ekalda committed Aug 12, 2022
1 parent 0d0c361 commit dd29f9f
Showing 4 changed files with 136 additions and 227 deletions.
1 change: 1 addition & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
mod = ethosu_passes.HoistAllocates()(mod)
+ mod = tvm.tir.transform.RemoveNoOp()(mod)
# MergeConstant pass currently does not support striped schedules.
# It requires further investigation.
if not util.is_striping_enabled():
19 changes: 16 additions & 3 deletions python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -926,9 +926,22 @@ def CopyComputeReordering(
into the SHRAM and that already happens in parallel with copying weights into
the weights encoder.
- If reorder_by_cycles is set, we use the cycle hint to decide the reordering. If it is not set,
- we move the copies up by a fixed number of movements, either by max_copy_movements if it is
- specified, or by default value of 1.
+ If reorder_by_cycles is set, we use the compute_cycles_hint to decide the reordering. If it is
+ not set, we move the copies up by a fixed number of movements, either by max_copy_movements if
+ it is specified, or by the default value of 1.
+ If reordering based on the cycle count is enabled, we try to achieve further copy latency
+ hiding with a two-step algorithm:
+ (1) Move every global copy (i.e. a copy that moves a constant into SRAM for conv2d or
+ depthwise_conv2d) above a preceding compute op. If, in general, the computes take longer than
+ the copies, this should be enough to hide the copy latencies.
+ (2) If some global copies take longer than the computes, we might be able to hide them further
+ by moving them further up the graph, since a graph generally contains more compute ops than
+ copy ops (only conv2d and depthwise_conv2d have constants associated with them). The algorithm
+ checks whether a copy is hidden; if it is not, it checks whether the preceding compute op has
+ a preceding copy of its own, and if it does not, it moves the copy we are trying to hide
+ further up. It keeps moving the copy until it cannot move any further or until the latency is
+ hidden.
Parameters
----------
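To make the two-step strategy described in the docstring concrete, here is a minimal Python sketch of step (1). It is not the TVM implementation: the op representation (("copy" | "compute", cycles) tuples) and the function name are invented. Step (2) is sketched alongside the C++ hunk below.

# Minimal sketch of step (1): move each global copy above the compute op
# that precedes it, so the copy's DMA can overlap with that compute.
def hoist_copies_above_computes(ops):
    reordered = list(ops)
    for idx in range(1, len(reordered)):
        kind, _ = reordered[idx]
        prev_kind, _ = reordered[idx - 1]
        if kind == "copy" and prev_kind == "compute":
            reordered[idx - 1], reordered[idx] = reordered[idx], reordered[idx - 1]
    return reordered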
50 changes: 9 additions & 41 deletions src/tir/contrib/ethosu/passes.cc
@@ -278,14 +278,14 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
for (size_t idx = ops.size() - 1; idx > 0; --idx) {
if (ops[idx].global_copy.as<AttrStmtNode>()) {
// Check whether the copy is hidden
- int64_t copy_cycles{get_cycles(ops[idx].global_copy)};
- int64_t compute_cycles{get_cycles(ops[idx].compute_op)};
+ int64_t copy_cycles{GetStmtCycles(ops[idx].global_copy)};
+ int64_t compute_cycles{GetStmtCycles(ops[idx].compute_op)};
bool is_hidden = compute_cycles >= copy_cycles;

// If the previous compute op is not already hiding another copy, move the copy back, so
// that it would be hidden by multiple computes
while (!is_hidden && !ops[idx - 1].global_copy.as<AttrStmtNode>() && (idx > 0)) {
- int64_t new_compute_cycles{get_cycles(ops[idx - 1].compute_op)};
+ int64_t new_compute_cycles{GetStmtCycles(ops[idx - 1].compute_op)};
ops[idx - 1].global_copy = ops[idx].global_copy;
ops[idx].global_copy = {};
compute_cycles += new_compute_cycles;
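The while loop above is the core of step (2). A rough Python equivalent follows; since the hunk is truncated here, it assumes the loop also updates is_hidden and decrements idx, and the dict keys copy_cycles and compute_cycles are invented stand-ins for the cycle hints.

# Each ops entry pairs an optional global copy (copy_cycles is None when the
# slot holds no copy) with a compute op's cycle count.
def hide_long_copies(ops):
    idx = len(ops) - 1
    while idx > 0:
        if ops[idx]["copy_cycles"] is not None:
            copy_cycles = ops[idx]["copy_cycles"]
            compute_cycles = ops[idx]["compute_cycles"]
            hidden = compute_cycles >= copy_cycles
            # Move the copy up past computes that are not already hiding
            # another copy; several computes together can cover its latency.
            while not hidden and idx > 0 and ops[idx - 1]["copy_cycles"] is None:
                ops[idx - 1]["copy_cycles"] = ops[idx]["copy_cycles"]
                ops[idx]["copy_cycles"] = None
                compute_cycles += ops[idx - 1]["compute_cycles"]
                hidden = compute_cycles >= copy_cycles
                idx -= 1
        idx -= 1

For example, a 10-cycle copy preceded by 3- and 5-cycle computes ends up above both computes but remains partially exposed, since 8 < 10 and there is nothing further up to move past.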
@@ -317,7 +317,8 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
for (size_t index = 0; index < new_seq.size(); ++index) {
if (GetStmtType(new_seq[index]) == StmtType::global_copy) {
int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
- for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) {
+ for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute);
+      --i) {
std::swap(new_seq[i - 1], new_seq[i]);
}
}
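For comparison, the loop above is the fixed-movement path used when reorder_by_cycles is not set. A hedged Python sketch, with kind_of as an invented stand-in for GetStmtType:

# Each global copy bubbles up past at most max_copy_movements immediately
# preceding compute statements.
def reorder_fixed(stmts, max_copy_movements, kind_of):
    for index in range(len(stmts)):
        if kind_of(stmts[index]) != "global_copy":
            continue
        lower = max(0, index - max_copy_movements)
        i = index
        while i > lower and kind_of(stmts[i - 1]) == "compute":
            stmts[i - 1], stmts[i] = stmts[i], stmts[i - 1]
            i -= 1
    return stmts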
@@ -329,44 +330,11 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
return Stmt{seq_stmt_node};
}

- int64_t get_cycles(const Stmt& stmt) {
- auto attr_node{stmt.as<AttrStmtNode>()};
- ICHECK(attr_node) << "The cycle count attribute is missing";
- return attr_node->value.as<IntImmNode>()->value;
- }
-
- tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
- Stmt eval_stmt = stmt;
- if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
- eval_stmt = attr_stmt->body;
- }
-
- auto eval_node{eval_stmt.as<EvaluateNode>()};
- ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
- << eval_stmt->GetTypeKey();
- auto call_node{eval_node->value.as<CallNode>()};
- ICHECK(call_node) << "Expected expression to be a call node, but was "
- << eval_node->value->GetTypeKey();
- return call_node->args;
- }
+ bool stmt_is_global_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::global_copy; }

- bool stmt_is_global_copy(const Stmt& stmt) {
- auto args{get_stmt_args(stmt)};
- return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
- args[3].as<BufferLoadNode>()->buffer.scope() == "global";
- }
-
- bool stmt_is_local_copy(const Stmt& stmt) {
- auto args{get_stmt_args(stmt)};
- return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
- args[3].as<BufferLoadNode>()->buffer.scope() == "local";
- }
-
- bool stmt_is_copy(const Stmt& stmt) {
- return stmt_is_global_copy(stmt) || stmt_is_local_copy(stmt);
- }
+ bool stmt_is_local_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::local_copy; }

- bool stmt_is_compute_op(const Stmt& stmt) { return !stmt_is_copy(stmt); }
+ bool stmt_is_compute_op(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::compute; }

/*! The maximum number of movements allowed for a copy. */
int _max_copy_movements;
@@ -401,7 +369,7 @@ tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements,
ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value());
auto reorder = reorder_by_cycles.value_or(
ctx->GetConfig(kCopyComputeReorderingReorderByCycles, Bool(false)).value());
- return CopyComputeReorderingMutator(copy_movements, reorder)(f);
+ return CopyComputeReorderingMutator(copy_movements.IntValue(), reorder)(f);
};
return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
"tir.contrib.ethos-u.CopyComputeReordering", {});