diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index 707f6b6ccefb..4020e4b78ea7 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -89,6 +89,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod) mod = ethosu_passes.HoistAllocates()(mod) + mod = ethosu_passes.CopyComputeReordering()(mod) disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False) if not disable_storage_rewrite: mod = tvm.tir.transform.StorageRewrite()(mod) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index 5c143815ae1f..22f349d4eeab 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -815,3 +815,19 @@ def HoistAllocates() -> tvm.IRModule: The new module with hoisted allocate nodes. """ return _ffi_api.HoistAllocates() + + +def CopyComputeReordering() -> tvm.IRModule: + """ + Reorders copy and compute nodes in such a way that independent DMA copies, + and computes happen in parallel. + Copies to buffers with local scope are not reordered, indeed they copy LUT + into the SHRAM which already happens in parallel with copying weights into + the weights encoder. + + Returns + ------- + tvm.IRModule + The new module with copy and compute nodes reordered. + """ + return _ffi_api.CopyComputeReordering() diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 45161499f5be..41c21cf4902e 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -110,6 +110,80 @@ tvm::transform::Pass HoistAllocates() { TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates); +/*! + * \brief Reorders copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * Copies to buffers with local scope are not reordered, indeed they copy LUT + * into the SHRAM which already happens in parallel with copying weights into + * the weights encoder. + */ +class CopyComputeReorderingMutator : public StmtExprMutator { + public: + CopyComputeReorderingMutator() {} + + PrimFunc operator()(PrimFunc main_func) { + auto n{main_func.CopyOnWrite()}; + n->body = this->VisitStmt(main_func->body); + return GetRef(n); + } + + private: + Stmt VisitStmt_(const SeqStmtNode* op) override { + if (op->size() <= 1) { + return StmtExprMutator::VisitStmt_(op); + } + + auto seq_stmt{GetRef(op)}; + std::vector new_seq(seq_stmt->size()); + std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin()); + bool previous_stmt_is_copy{true}; // Do not move the first stmt if it is a copy + + for (size_t i{}; i < seq_stmt->size(); ++i) { + Stmt stmt{seq_stmt[i]}; + auto eval_node{stmt.as()}; + ICHECK(eval_node) << "Expected statement to be an evaluate node, but was " + << stmt->GetTypeKey(); + auto call_node{eval_node->value.as()}; + ICHECK(call_node) << "Expected expression to be a call node, but was " + << eval_node->value->GetTypeKey(); + auto args{call_node->args}; + bool stmt_is_copy{args[0].as()->value == "ethosu_copy"}; + bool stmt_is_global_copy{stmt_is_copy && + args[3].as()->buffer.scope() == "global"}; + if (!previous_stmt_is_copy && stmt_is_global_copy) { + std::swap(new_seq[i], new_seq[i - 1]); + } else { + previous_stmt_is_copy = stmt_is_copy; + } + } + + auto n{CopyOnWrite(op)}; + n->seq = std::move(new_seq); + return Stmt{n}; + } +}; + +/*! + * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * + * \return tvm::transform::Pass + */ +tvm::transform::Pass CopyComputeReordering() { + auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) { + ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main")) + << "Expected a single primitive function called 'main'. Please run the " + "CopyComputeReordering " + "pass in conjunction with the LowerToTIR() pass."; + return CopyComputeReorderingMutator()(f); + }; + return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, + "tir.contrib.ethos-u.CopyComputeReordering", {}); +} + +TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering") + .set_body_typed(CopyComputeReordering); + } // namespace ethosu } // namespace contrib } // namespace tir diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py new file mode 100644 index 000000000000..4f241272a2f3 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm.script import tir as T +from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering + + +def test_all_operators_with_weights(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_all_operators_without_weights(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([36], "int8") + buffer2 = T.buffer_decl([9], "int8") + # body + p1 = T.allocate([96], "int8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([36], "int8") + buffer2 = T.buffer_decl([9], "int8") + # body + p1 = T.allocate([96], "int8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_operators_with_and_without_weights(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_copy_to_buffer_with_local_scope(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_multiple_prim_funcs(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(): + T.evaluate(0) + + @T.prim_func + def abc(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering()(InputModule) + + +def test_no_main_prim_func(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def abs(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering()(InputModule) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 277986eb7184..eb60094d8da8 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -37,33 +37,34 @@ class WeightStreamOnly: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([128], "uint8") - buffer_1 = T.buffer_decl([32], "uint8") - buffer_2 = T.buffer_decl([112], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([112], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([112], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([128], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([112], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([112], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([112], "uint8") + buffer8 = T.buffer_decl([32], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - p1_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1_global_1 = T.buffer_decl([112], dtype="uint8", data=p1_global.data) - p2_global_1 = T.buffer_decl([32], dtype="uint8", data=p2_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, p1_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, p2_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global[0], 128, 12, p2_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + buffer9 = T.buffer_decl([112], "uint8", data=p1.data) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, buffer9[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, 12, p2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -114,19 +115,21 @@ class RereadWeights: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([304], "uint8") - buffer_1 = T.buffer_decl([80], "uint8") + buffer1 = T.buffer_decl([304], "uint8") + buffer2 = T.buffer_decl([80], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 1, 8, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, 12, placeholder_d_global[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 1, 8, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, 12, placeholder_d_global[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 1, 8, 1, 1, 1, 1, 1, 1, p1[0], 304, 12, p2[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 1, 8, 1, 1, 1, 1, 1, 1, p3[0], 304, 12, p4[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -236,35 +239,37 @@ class MixedRead: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([592], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - buffer_2 = T.buffer_decl([80], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([80], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([80], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") - buffer_8 = T.buffer_decl([80], "uint8") - buffer_9 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([80], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([80], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([80], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([80], "uint8") + buffer8 = T.buffer_decl([32], "uint8") + buffer9 = T.buffer_decl([592], "uint8") + buffer10 = T.buffer_decl([160], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, 12, buffer_1[0], 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_8[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_9[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, 12, buffer10[0], 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, 12, p2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, 12, p2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index e04cabe79d2f..f64263ca0623 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -43,13 +43,13 @@ @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1423344), - ("ethos-u65-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-256", MOBILENET_V1_URL, 1423344), - ("ethos-u55-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-128", MOBILENET_V2_URL, 2185584), - ("ethos-u55-64", MOBILENET_V2_URL, 2185584), - ("ethos-u55-32", MOBILENET_V2_URL, 2185584), + ("ethos-u65-256", MOBILENET_V1_URL, 1892704), + ("ethos-u65-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-256", MOBILENET_V1_URL, 1892704), + ("ethos-u55-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-128", MOBILENET_V2_URL, 2257984), + ("ethos-u55-64", MOBILENET_V2_URL, 2257984), + ("ethos-u55-32", MOBILENET_V2_URL, 2258000), ], ) def test_networks_without_usmp(accel_type, model_url, workspace_size): @@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size): @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1205872), - ("ethos-u55-256", MOBILENET_V2_URL, 1507152), + ("ethos-u65-256", MOBILENET_V1_URL, 1206880), + ("ethos-u55-256", MOBILENET_V2_URL, 1509408), ], ) def test_networks_with_usmp(accel_type, model_url, workspace_size): diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 62bea662e7d8..bf4e28c2a069 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -34,16 +34,16 @@ class ReferenceModule: def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([80], "uint8") - buffer_1 = T.buffer_decl([304], "uint8") + buffer1 = T.buffer_decl([304], "uint8") + buffer2 = T.buffer_decl([80], "uint8") T.preflattened_buffer(placeholder_3, [1, 16, 16, 32], dtype="int8", data=placeholder_3.data) T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 8], dtype="int8", data=ethosu_write_1.data) # body - placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True}) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, 12, placeholder_d_global[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True}) + p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, 12, p2[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -80,23 +80,23 @@ class WeightStream: def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([416], "uint8") - buffer_1 = T.buffer_decl([112], "uint8") - buffer_2 = T.buffer_decl([272], "uint8") - buffer_3 = T.buffer_decl([64], "uint8") + buffer1 = T.buffer_decl([416], "uint8") + buffer2 = T.buffer_decl([112], "uint8") + buffer3 = T.buffer_decl([272], "uint8") + buffer4 = T.buffer_decl([64], "uint8") T.preflattened_buffer(placeholder_5, [1, 16, 16, 32], dtype="int8", data=placeholder_5.data) T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data) # body - placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data) - placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, 12, placeholder_d_global_unrolled_iter_0[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, 12, placeholder_d_global_unrolled_iter_1[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True}) + p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True}) + p3 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True}) + p4 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 416, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 272, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 64, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 416, 12, p2[0], 112, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p3[0], 272, 12, p4[0], 64, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index bc0232fc99c6..9b23e5eb3ca8 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -180,29 +180,27 @@ def test_schedule_cache_reads(): @tvm.script.ir_module class DiamondGraphTir: @T.prim_func - def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None: + def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data) - T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data) - - weight_buffer = T.buffer_decl([2608], "uint8") - bias_buffer = T.buffer_decl([240], "uint8") - weight_buffer2 = T.buffer_decl([736], "uint8") - bias_buffer2 = T.buffer_decl([240], "uint8") - - weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) - weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data) - bias_global = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) - featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - - T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, 12, bias_global[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) + T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data) + T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data) + buffer1 = T.buffer_decl([2608], "uint8") + buffer2 = T.buffer_decl([240], "uint8") + buffer3 = T.buffer_decl([736], "uint8") + buffer4 = T.buffer_decl([240], "uint8") + p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, 12, p2[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, 12, p4[0], 240, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on