Skip to content

Commit

Permalink
[microNPU] Add a pass to reorder copy and compute nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolaLancellotti committed Apr 20, 2022
1 parent 970f868 commit 88c0ea3
Show file tree
Hide file tree
Showing 8 changed files with 513 additions and 112 deletions.
1 change: 1 addition & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
mod = ethosu_passes.HoistAllocates()(mod)
mod = ethosu_passes.CopyComputeReordering()(mod)
disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
if not disable_storage_rewrite:
mod = tvm.tir.transform.StorageRewrite()(mod)
Expand Down
16 changes: 16 additions & 0 deletions python/tvm/relay/backend/contrib/ethosu/tir/passes.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,3 +815,19 @@ def HoistAllocates() -> tvm.IRModule:
The new module with hoisted allocate nodes.
"""
return _ffi_api.HoistAllocates()


def CopyComputeReordering() -> tvm.IRModule:
    """
    Reorder copy and compute nodes so that independent DMA copies and
    compute operations can run in parallel.

    Copies into buffers with "local" scope are left untouched: those copy the
    LUT into the SHRAM, which already happens in parallel with copying weights
    into the weights encoder.

    Returns
    -------
    tvm.IRModule
        The new module with copy and compute nodes reordered.
    """
    reordering_pass = _ffi_api.CopyComputeReordering()
    return reordering_pass
74 changes: 74 additions & 0 deletions src/tir/contrib/ethosu/passes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,80 @@ tvm::transform::Pass HoistAllocates() {

TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);

/*!
 * \brief Reorders copy and compute nodes in such a way that independent DMA copies
 * and computes happen in parallel.
 * Copies to buffers with local scope are not reordered: those copy the LUT
 * into the SHRAM, which already happens in parallel with copying weights into
 * the weights encoder.
 */
class CopyComputeReorderingMutator : public StmtExprMutator {
 public:
  CopyComputeReorderingMutator() {}

  /*! \brief Apply the reordering to the body of the given PrimFunc. */
  PrimFunc operator()(PrimFunc main_func) {
    auto n{main_func.CopyOnWrite()};
    n->body = this->VisitStmt(main_func->body);
    return GetRef<PrimFunc>(n);
  }

 private:
  /*!
   * \brief Visit a sequence of statements and hoist each copy into a
   * global-scope buffer above the compute node(s) preceding it, so the DMA
   * copy can overlap with that compute.
   */
  Stmt VisitStmt_(const SeqStmtNode* op) override {
    // Nothing to reorder in a sequence of zero or one statement.
    if (op->size() <= 1) {
      return StmtExprMutator::VisitStmt_(op);
    }

    auto seq_stmt{GetRef<SeqStmt>(op)};
    // Work on a mutable copy of the sequence; swaps happen in new_seq.
    std::vector<Stmt> new_seq(seq_stmt->size());
    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy

    for (size_t i{}; i < seq_stmt->size(); ++i) {
      Stmt stmt{seq_stmt[i]};
      // Every statement in the sequence is expected to be an evaluate of an
      // extern call, as produced by the microNPU TIR lowering.
      auto eval_node{stmt.as<EvaluateNode>()};
      ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
                        << stmt->GetTypeKey();
      auto call_node{eval_node->value.as<CallNode>()};
      ICHECK(call_node) << "Expected expression to be a call node, but was "
                        << eval_node->value->GetTypeKey();
      auto args{call_node->args};
      // args[0] is the extern function name; "ethosu_copy" marks a DMA copy.
      bool stmt_is_copy{args[0].as<StringImmNode>()->value == "ethosu_copy"};
      // args[3] is the copy's destination buffer (see the ethosu_copy call
      // signature: name, source, length, destination). Only copies into
      // "global" buffers are candidates for reordering.
      bool stmt_is_global_copy{stmt_is_copy &&
                               args[3].as<BufferLoadNode>()->buffer.scope() == "global"};
      // Swap a global copy with the statement before it when that statement is
      // not itself a copy. Note: the flag is deliberately NOT updated on the
      // swap branch, so a run of consecutive global copies after a compute all
      // bubble up past that compute.
      if (!previous_stmt_is_copy && stmt_is_global_copy) {
        std::swap(new_seq[i], new_seq[i - 1]);
      } else {
        previous_stmt_is_copy = stmt_is_copy;
      }
    }

    auto n{CopyOnWrite(op)};
    n->seq = std::move(new_seq);
    return Stmt{n};
  }
};

/*!
 * \brief A pass to reorder copy and compute nodes in such a way that independent
 * DMA copies and computes happen in parallel.
 *
 * \return tvm::transform::Pass
 */
tvm::transform::Pass CopyComputeReordering() {
  // The lambda uses no captured state, so an empty capture list is used
  // instead of the original [=] capture-default (capture nothing you don't use).
  auto pass_func = [](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
    // The pass expects the module produced by LowerToTIR(): a single
    // primitive function named "main".
    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))
        << "Expected a single primitive function called 'main'. Please run the "
           "CopyComputeReordering "
           "pass in conjunction with the LowerToTIR() pass.";
    return CopyComputeReorderingMutator()(f);
  };
  return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
                                                 "tir.contrib.ethos-u.CopyComputeReordering", {});
}

TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering")
.set_body_typed(CopyComputeReordering);

} // namespace ethosu
} // namespace contrib
} // namespace tir
Expand Down
307 changes: 307 additions & 0 deletions tests/python/contrib/test_ethosu/test_copy_compute_reordering.py

Large diffs are not rendered by default.

125 changes: 65 additions & 60 deletions tests/python/contrib/test_ethosu/test_encode_constants.py

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1423344),
("ethos-u65-256", MOBILENET_V2_URL, 2185584),
("ethos-u55-256", MOBILENET_V1_URL, 1423344),
("ethos-u55-256", MOBILENET_V2_URL, 2185584),
("ethos-u55-128", MOBILENET_V2_URL, 2185584),
("ethos-u55-64", MOBILENET_V2_URL, 2185584),
("ethos-u55-32", MOBILENET_V2_URL, 2185584),
("ethos-u65-256", MOBILENET_V1_URL, 1892704),
("ethos-u65-256", MOBILENET_V2_URL, 2257984),
("ethos-u55-256", MOBILENET_V1_URL, 1892704),
("ethos-u55-256", MOBILENET_V2_URL, 2257984),
("ethos-u55-128", MOBILENET_V2_URL, 2257984),
("ethos-u55-64", MOBILENET_V2_URL, 2257984),
("ethos-u55-32", MOBILENET_V2_URL, 2258000),
],
)
def test_networks_without_usmp(accel_type, model_url, workspace_size):
Expand All @@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1205872),
("ethos-u55-256", MOBILENET_V2_URL, 1507152),
("ethos-u65-256", MOBILENET_V1_URL, 1206880),
("ethos-u55-256", MOBILENET_V2_URL, 1509408),
],
)
def test_networks_with_usmp(accel_type, model_url, workspace_size):
Expand Down
42 changes: 21 additions & 21 deletions tests/python/contrib/test_ethosu/test_replace_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,16 @@ class ReferenceModule:
def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
buffer = T.buffer_decl([80], "uint8")
buffer_1 = T.buffer_decl([304], "uint8")
buffer1 = T.buffer_decl([304], "uint8")
buffer2 = T.buffer_decl([80], "uint8")
T.preflattened_buffer(placeholder_3, [1, 16, 16, 32], dtype="int8", data=placeholder_3.data)
T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 8], dtype="int8", data=ethosu_write_1.data)
# body
placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 304, placeholder_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer[0], 80, placeholder_d_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, 12, placeholder_d_global[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, 12, p2[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down Expand Up @@ -80,23 +80,23 @@ class WeightStream:
def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
buffer = T.buffer_decl([416], "uint8")
buffer_1 = T.buffer_decl([112], "uint8")
buffer_2 = T.buffer_decl([272], "uint8")
buffer_3 = T.buffer_decl([64], "uint8")
buffer1 = T.buffer_decl([416], "uint8")
buffer2 = T.buffer_decl([112], "uint8")
buffer3 = T.buffer_decl([272], "uint8")
buffer4 = T.buffer_decl([64], "uint8")
T.preflattened_buffer(placeholder_5, [1, 16, 16, 32], dtype="int8", data=placeholder_5.data)
T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data)
# body
placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data)
placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data)
T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, 12, placeholder_d_global_unrolled_iter_0[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, 12, placeholder_d_global_unrolled_iter_1[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
p1 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
p3 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True})
p4 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 416, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 112, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 272, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 64, p4[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 416, 12, p2[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p3[0], 272, 12, p4[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down
42 changes: 20 additions & 22 deletions tests/python/contrib/test_ethosu/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,29 +180,27 @@ def test_schedule_cache_reads():
@tvm.script.ir_module
class DiamondGraphTir:
@T.prim_func
def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None:
def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None:
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data)
T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data)

weight_buffer = T.buffer_decl([2608], "uint8")
bias_buffer = T.buffer_decl([240], "uint8")
weight_buffer2 = T.buffer_decl([736], "uint8")
bias_buffer2 = T.buffer_decl([240], "uint8")

weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data)
bias_global = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})
featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})

T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data)
T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data)
buffer1 = T.buffer_decl([2608], "uint8")
buffer2 = T.buffer_decl([240], "uint8")
buffer3 = T.buffer_decl([736], "uint8")
buffer4 = T.buffer_decl([240], "uint8")
p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True})
p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, 12, p2[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, 12, p4[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
__tvm_meta__ = None
# fmt: on

Expand Down

0 comments on commit 88c0ea3

Please sign in to comment.