From 1125de41e108ee72f809ea41692ed30d79c77b24 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Tue, 31 May 2022 13:53:59 +0100 Subject: [PATCH 1/3] [microNPU] Reorder copies and computes based on the cycle count If the cascader is enabled and the ops in TIR have the cycle count annotation, enabling the reorder_by_cycles option will reorder to copies and computes based on a cycle count. If reorder_by_cycles is enabled, max_copy_movements is ignored. This pass is currently not part of the TIR pipeline since it assumes that weights and bias of a compute op are merged into one constant (which is WIP). --- .../backend/contrib/ethosu/tir/passes.py | 22 +- src/tir/contrib/ethosu/passes.cc | 193 ++++++++++-- .../test_copy_compute_reordering.py | 283 ++++++++++++++++++ 3 files changed, 475 insertions(+), 23 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index c0b017e703ce..77d2fff230df 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -916,14 +916,20 @@ def HoistAllocates() -> tvm.IRModule: return _ffi_api.HoistAllocates() -def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRModule: +def CopyComputeReordering( + max_copy_movements: Optional[int] = None, reorder_by_cycles: Optional[bool] = None +) -> tvm.IRModule: """ - Reorders copy and compute nodes in such a way that independent DMA copies, + Reorders copy and compute nodes in such a way that independent DMA copies and computes happen in parallel. - Copies to buffers with local scope are not reordered, indeed they copy LUT - into the SHRAM which already happens in parallel with copying weights into + Copies to buffers with local scope are not reordered since they copy LUT + into the SHRAM and that already happens in parallel with copying weights into the weights encoder. + If reorder_by_cycles is set, we use the cycle hint to decide the reordering. If it is not set, + we move the copies up by a fixed number of movements, either by max_copy_movements if it is + specified, or by default value of 1. + Parameters ---------- max_copy_movements: Optional[int] @@ -932,12 +938,18 @@ def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRMod tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements is used if provided, otherwise the default value will be 1. + reorder_by_cycles: Optional[bool] + Whether to reorder the computes and copies based on the cycle hint. + If None, the pass context option + tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles + is used if provided, otherwise the default value will be False. + Returns ------- tvm.IRModule The new module with copy and compute nodes reordered. """ - return _ffi_api.CopyComputeReordering(max_copy_movements) + return _ffi_api.CopyComputeReordering(max_copy_movements, reorder_by_cycles) def MergeConstants(const_dict): diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index b662e9dfd025..8a5d1ef18225 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -41,6 +41,13 @@ constexpr const char* kCopyComputeReorderingMaxCopyMovements = "tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements"; TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingMaxCopyMovements, Integer); +/*! + * \brief Whether to reorder copies and computes based on cycle count. 
+ */ +constexpr const char* kCopyComputeReorderingReorderByCycles = + "tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles"; +TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingReorderByCycles, Bool); + namespace tir { namespace contrib { namespace ethosu { @@ -180,16 +187,16 @@ tvm::transform::Pass HoistAllocates() { TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates); /*! - * \brief Reorders copy and compute nodes in such a way that independent DMA copies, + * \brief Reorders copy and compute nodes in such a way that independent DMA copies * and computes happen in parallel. - * Copies to buffers with local scope are not reordered, indeed they copy LUT - * into the SHRAM which already happens in parallel with copying weights into + * Copies to buffers with local scope are not reordered since they copy LUT + * into the SHRAM and that already happens in parallel with copying weights into * the weights encoder. */ class CopyComputeReorderingMutator : public StmtExprMutator { public: - explicit CopyComputeReorderingMutator(int max_copy_movements) - : _max_copy_movements{max_copy_movements} {} + explicit CopyComputeReorderingMutator(int max_copy_movements, bool reorder_by_cycles) + : _max_copy_movements{max_copy_movements}, _reorder_by_cycles{reorder_by_cycles} {} PrimFunc operator()(PrimFunc main_func) { if (_max_copy_movements > 0) { @@ -201,6 +208,13 @@ class CopyComputeReorderingMutator : public StmtExprMutator { } private: + // A structure to hold a compute op with the corresponding weights/bias copy and LUT copy + struct OpWithCopies { + Stmt compute_op{}; + Stmt global_copy{}; + Stmt local_copy{}; + }; + Stmt VisitStmt_(const SeqStmtNode* op) override { if (op->size() <= 1) { return StmtExprMutator::VisitStmt_(op); @@ -210,13 +224,105 @@ class CopyComputeReorderingMutator : public StmtExprMutator { std::vector new_seq(seq_stmt->size()); std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin()); - // Each copy statement to a buffer with global scope is moved up - // at most `_max_copy_movements` times. - for (size_t index = 0; index < new_seq.size(); ++index) { - if (GetStmtType(new_seq[index]) == StmtType::global_copy) { - int lower = std::max(0, static_cast(index) - _max_copy_movements); - for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) { - std::swap(new_seq[i - 1], new_seq[i]); + // Reorder the copies and computes based on the cycle count + if (_reorder_by_cycles) { + // We can't hide the first copy, so ignore it for the purpose of hiding copies + Stmt first_copy{}; + if (stmt_is_global_copy(new_seq[0]) || + (stmt_is_local_copy(new_seq[0]) && stmt_is_global_copy(new_seq[1]))) { + auto copy_position = stmt_is_global_copy(new_seq[0]) ? 
0 : 1; + first_copy = new_seq[copy_position]; + new_seq.erase(new_seq.begin() + copy_position); + } + + // Build up a list of cells with the compute op and the copy ops that directly preceed it + std::vector ops{}; + for (size_t idx = 0; idx < new_seq.size(); ++idx) { + if (stmt_is_compute_op(new_seq[idx])) { + OpWithCopies new_op; + new_op.compute_op = new_seq[idx]; + if (idx > 0) { + auto prev_op = new_seq[idx - 1]; + if (!stmt_is_compute_op(prev_op)) { + if (stmt_is_local_copy(prev_op)) { + new_op.local_copy = prev_op; + } else { + new_op.global_copy = prev_op; + } + if (idx > 1) { + auto prev_prev_op = new_seq[idx - 2]; + if (!stmt_is_compute_op(prev_prev_op)) { + if (stmt_is_local_copy(prev_prev_op)) { + new_op.local_copy = prev_prev_op; + } else { + new_op.global_copy = prev_prev_op; + } + } + } + } + } + ops.push_back(new_op); + } + } + + // Move the global copies up by one. If in general the computes take longer than the copies, + // that should be good enough + for (size_t idx = 1; idx < ops.size(); ++idx) { + if (ops[idx].global_copy.as()) { + ops[idx - 1].global_copy = ops[idx].global_copy; + ops[idx].global_copy = {}; + } + } + + // If there are long copies, try to hide them further + for (size_t idx = ops.size() - 1; idx > 0; --idx) { + if (ops[idx].global_copy.as()) { + // Check whether the copy is hidden + int64_t copy_cycles{get_cycles(ops[idx].global_copy)}; + int64_t compute_cycles{get_cycles(ops[idx].compute_op)}; + bool is_hidden = compute_cycles >= copy_cycles; + + // If the previous compute op is not already hiding another copy, move the copy back, so + // that it would be hidden by multiple computes + while (!is_hidden && !ops[idx - 1].global_copy.as() && (idx > 0)) { + int64_t new_compute_cycles{get_cycles(ops[idx - 1].compute_op)}; + ops[idx - 1].global_copy = ops[idx].global_copy; + ops[idx].global_copy = {}; + compute_cycles += new_compute_cycles; + is_hidden = compute_cycles >= copy_cycles; + --idx; + } + } + } + + // Reconstruct the op sequence from the vector of OpWithCopies + new_seq.clear(); + if (first_copy.as()) { + new_seq.push_back(first_copy); + } + for (auto& op : ops) { + if (op.global_copy.as()) { + new_seq.push_back(op.global_copy); + } + if (op.local_copy.as()) { + new_seq.push_back(op.local_copy); + } + if (op.compute_op.as()) { + new_seq.push_back(op.compute_op); + } + } + + } + // Reorder the copies and computes by a fixed number of movements + else { + // Each copy statement to a buffer with global scope is moved up + // at most `_max_copy_movements` times. 
+ for (size_t index = 0; index < new_seq.size(); ++index) { + if (GetStmtType(new_seq[index]) == StmtType::global_copy) { + int lower = std::max(0, static_cast(index) - _max_copy_movements); + for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) { + std::swap(new_seq[i - 1], new_seq[i]); + } } } } @@ -226,28 +332,79 @@ class CopyComputeReorderingMutator : public StmtExprMutator { return Stmt{seq_stmt_node}; } + int64_t get_cycles(const Stmt& stmt) { + auto attr_node{stmt.as()}; + ICHECK(attr_node) << "The cycle count attribute is missing"; + return attr_node->value.as()->value; + } + + tvm::runtime::Array get_stmt_args(const Stmt& stmt) { + Stmt eval_stmt = stmt; + if (const auto* attr_stmt = eval_stmt.as()) { + eval_stmt = attr_stmt->body; + } + + auto eval_node{eval_stmt.as()}; + ICHECK(eval_node) << "Expected statement to be an evaluate node, but was " + << eval_stmt->GetTypeKey(); + auto call_node{eval_node->value.as()}; + ICHECK(call_node) << "Expected expression to be a call node, but was " + << eval_node->value->GetTypeKey(); + return call_node->args; + } + + bool stmt_is_global_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy" && + args[3].as()->buffer.scope() == "global"; + } + + bool stmt_is_local_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy" && + args[3].as()->buffer.scope() == "local"; + } + + bool stmt_is_copy(const Stmt& stmt) { + return stmt_is_global_copy(stmt) || stmt_is_local_copy(stmt); + } + + bool stmt_is_compute_op(const Stmt& stmt) { return !stmt_is_copy(stmt); } + /*! The maximum number of movements allowed for a copy. */ int _max_copy_movements; + /*! Whether we use the cycle hint to determine the reordering. */ + bool _reorder_by_cycles; }; /*! - * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies, - * and computes happen in parallel. + * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies + * and computes happen in parallel. If reorder_by_cycles is set, we will ignore the + * max_copy_movements value. * - * \param max_copy_movements: The maximum number of movements allowed for a copy. + * \param max_copy_movements: The maximum number of movements allowed for a copy. * If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements * is used if provided, otherwise the default value will be 1. + * + * \param reorder_by_cycles: Whether to reorder copies and computes by cycles. + * If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles + * is used if provided, otherwise the default value will be False. If the value is True, + * max_copy_movements will be ignored. * \return tvm::transform::Pass */ -tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements) { +tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements, + Optional reorder_by_cycles) { auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) { ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main")) << "Expected a single primitive function called 'main'. 
Please run the " "CopyComputeReordering " "pass in conjunction with the LowerToTIR() pass."; - auto value = max_copy_movements.value_or( + + auto copy_movements = max_copy_movements.value_or( ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value()); - return CopyComputeReorderingMutator(value.IntValue())(f); + auto reorder = reorder_by_cycles.value_or( + ctx->GetConfig(kCopyComputeReorderingReorderByCycles, Bool(false)).value()); + return CopyComputeReorderingMutator(copy_movements, reorder)(f); }; return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, "tir.contrib.ethos-u.CopyComputeReordering", {}); diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py index eebaa3b816b4..d6f0c178e56e 100644 --- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -468,5 +468,288 @@ def main() -> None: tvm.ir.assert_structural_equal(test_mod, reference_mod, True) +def test_reordering_based_on_cycles(): + # fmt: off + @tvm.script.ir_module + class ModuleBefore: + @T.prim_func + def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded: T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"], placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6: T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"], ethosu_write: T.Buffer[(572,), "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32") + nn = T.var("int32") + nn_1 = T.var("int32") + nn_2 = T.var("int32") + nn_3 = T.var("int32") + nn_4 = T.var("int32") + nn_5 = T.var("int32") + nn_6 = T.var("int32") + nn_7 = T.var("int32") + nn_8 = T.var("int32") + nn_9 = T.var("int32") + T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8", data=placeholder.data) + T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4], dtype="int8") + T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1], dtype="int8") + T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4], dtype="int8") + T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1], dtype="int8") + T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4], dtype="int8") + T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8", data=ethosu_write.data) + # body + placeholder_d_d_global = T.allocate([288], "uint8", "global") + ethosu_write_2 = T.allocate([256], "int8", "global") + placeholder_d_d_global_2 = T.allocate([128], "uint8", "global") + ethosu_write_3 = T.allocate([256], "int8", "global") + placeholder_d_d_global_4 = T.allocate([288], "uint8", "global") + ethosu_write_4 = T.allocate([256], "int8", "global") + ethosu_write_5 = T.allocate([256], "int8", "global") + ethosu_write_6 = T.allocate([324], "int8", "global") + placeholder_d_global = T.allocate([128], "uint8", "global") + ethosu_write_7 = T.allocate([324], "int8", "global") + ethosu_write_8 = T.allocate([484], "int8", "global") + ethosu_write_9 = T.allocate([484], "int8", "global") + ethosu_write_10 = T.allocate([484], "int8", "global") + placeholder_global = T.allocate([144], "uint8", "global") + with 
T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 576): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle")) + with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle")) + with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 192): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 300): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 576): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 500): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 36, 4, 
1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle")) + with T.attr(T.iter_var(nn_6, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + with T.attr(T.iter_var(nn_7, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + with T.attr(T.iter_var(nn_8, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 768): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle")) + T.attr(T.iter_var(nn_9, None, "DataPar", ""), "pragma_compute_cycles_hint", 504) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 44, 4, 1, "int8", 13, 11, 4, 13, 0, 11, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 44, 4, 1, 3, 1, 1, 1, 1, 1, placeholder_global[0], 96, T.int8(-1), T.int8(-1), 12, placeholder_global[96], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 14, 12, 8, dtype="handle")) + + + + @tvm.script.ir_module + class ModuleAfter: + @T.prim_func + def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded: T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"], placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6: T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"], ethosu_write: T.Buffer[(572,), "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32") + nn = T.var("int32") + nn_1 = T.var("int32") + nn_2 = T.var("int32") + nn_3 = T.var("int32") + nn_4 = T.var("int32") + nn_5 = T.var("int32") + nn_6 = T.var("int32") + nn_7 = T.var("int32") + nn_8 = T.var("int32") + nn_9 = T.var("int32") + T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8", data=placeholder.data) + T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4], dtype="int8", data=placeholder_encoded.data) + T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1], dtype="int8", 
data=placeholder_encoded_2.data) + T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4], dtype="int8", data=placeholder_encoded_4.data) + T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1], dtype="int8", data=placeholder_encoded_6.data) + T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4], dtype="int8", data=placeholder_encoded_8.data) + T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8", data=ethosu_write.data) + # body + placeholder_d_d_global = T.allocate([288], "uint8", "global") + ethosu_write_2 = T.allocate([256], "int8", "global") + placeholder_d_d_global_2 = T.allocate([128], "uint8", "global") + ethosu_write_3 = T.allocate([256], "int8", "global") + placeholder_d_d_global_4 = T.allocate([288], "uint8", "global") + ethosu_write_4 = T.allocate([256], "int8", "global") + ethosu_write_5 = T.allocate([256], "int8", "global") + ethosu_write_6 = T.allocate([324], "int8", "global") + placeholder_d_global = T.allocate([128], "uint8", "global") + ethosu_write_7 = T.allocate([324], "int8", "global") + ethosu_write_8 = T.allocate([484], "int8", "global") + ethosu_write_9 = T.allocate([484], "int8", "global") + ethosu_write_10 = T.allocate([484], "int8", "global") + placeholder_global = T.allocate([144], "uint8", "global") + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 576): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle")) + with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle")) + with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 576): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 320): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, 
"NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 192): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle")) + with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 300): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle")) + with T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 500): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 36, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle")) + with T.attr(T.iter_var(nn_6, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 768): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle")) + with T.attr(T.iter_var(nn_7, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + with T.attr(T.iter_var(nn_8, None, "DataPar", ""), "pragma_compute_cycles_hint", 432): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle")) + T.attr(T.iter_var(nn_9, None, "DataPar", ""), "pragma_compute_cycles_hint", 504) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 44, 4, 1, "int8", 13, 11, 4, 13, 0, 11, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 44, 4, 1, 3, 1, 1, 1, 1, 1, placeholder_global[0], 96, T.int8(-1), T.int8(-1), 12, placeholder_global[96], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 14, 12, 8, dtype="handle")) + # fmt: on + + test_mod = 
CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore) + reference_mod = ModuleAfter + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_reordering_based_on_cycles_luts_present(): + # fmt: off + @tvm.script.ir_module + class ModuleBefore: + @T.prim_func + def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256, "uint8"], placeholder_encoded_2: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_4: T.Buffer[112, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[2496, "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32") + nn = T.var("int32") + nn_1 = T.var("int32") + nn_2 = T.var("int32") + nn_3 = T.var("int32") + nn_4 = T.var("int32") + nn_5 = T.var("int32") + T.preflattened_buffer(placeholder, [1, 55, 55, 3], dtype="int8", data=placeholder.data) + T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 3], dtype="int8") + T.preflattened_buffer(placeholder_encoded_2, [4, 2, 3, 1], dtype="int8") + T.preflattened_buffer(placeholder_1, [256], dtype="int8", data=placeholder_1.data) + T.preflattened_buffer(placeholder_encoded_4, [4, 2, 3, 1], dtype="int8") + T.preflattened_buffer(placeholder_2, [256], dtype="int8", data=placeholder_2.data) + T.preflattened_buffer(placeholder_3, [256], dtype="int8", data=placeholder_3.data) + T.preflattened_buffer(ethosu_write, [1, 26, 24, 4], dtype="int8", data=ethosu_write.data) + # body + placeholder_d_d_global = T.allocate([256], "uint8", "global") + ethosu_write_2 = T.allocate([12544], "int8", "global") + placeholder_local = T.allocate([256], "int8", "local") + placeholder_d_global = T.allocate([112], "uint8", "global") + ethosu_write_3 = T.allocate([9984], "int8", "global") + ethosu_write_4 = T.allocate([9984], "int8", "global") + ethosu_write_5 = T.allocate([9984], "int8", "global") + placeholder_d_local = T.allocate([256], "int8", "local") + placeholder_global = T.allocate([112], "uint8", "global") + ethosu_write_6 = T.allocate([9984], "int8", "global") + placeholder_d_local_1 = T.allocate([256], "int8", "local") + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1728): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 256, placeholder_d_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 9920): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 55, 55, 3, 55, 0, 55, placeholder[0], 0, 0, 0, T.float32(0.0027450970374047756), -128, "NHWC", 165, 3, 1, "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 208, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[208], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 4, 16, 16, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 112, placeholder_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn_1, None, 
"DataPar", ""), "pragma_compute_cycles_hint", 330): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(0.0078157493844628334), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 64, 0, placeholder_d_global[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle")) + with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle")) + with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 112, placeholder_global[0], dtype="handle")) + with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(0.002753810491412878), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_global[0], 64, 0, placeholder_global[64], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_local_1[0], dtype="handle")) + T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5232) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 96, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 5, 24, 8, dtype="handle")) + + + + + @tvm.script.ir_module + class ModuleAfter: + @T.prim_func + def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256, "uint8"], placeholder_encoded_2: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_4: T.Buffer[112, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[2496, "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32") + ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32") + nn = T.var("int32") + nn_1 = T.var("int32") + nn_2 = T.var("int32") + nn_3 = T.var("int32") + nn_4 = T.var("int32") + nn_5 = 
T.var("int32") + T.preflattened_buffer(placeholder, [1, 55, 55, 3], dtype="int8", data=placeholder.data) + T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 3], dtype="int8", data=placeholder_encoded.data) + T.preflattened_buffer(placeholder_encoded_2, [4, 2, 3, 1], dtype="int8", data=placeholder_encoded_2.data) + T.preflattened_buffer(placeholder_1, [256], dtype="int8", data=placeholder_1.data) + T.preflattened_buffer(placeholder_encoded_4, [4, 2, 3, 1], dtype="int8", data=placeholder_encoded_4.data) + T.preflattened_buffer(placeholder_2, [256], dtype="int8", data=placeholder_2.data) + T.preflattened_buffer(placeholder_3, [256], dtype="int8", data=placeholder_3.data) + T.preflattened_buffer(ethosu_write, [1, 26, 24, 4], dtype="int8", data=ethosu_write.data) + # body + placeholder_d_d_global = T.allocate([256], "uint8", "global") + ethosu_write_2 = T.allocate([12544], "int8", "global") + placeholder_local = T.allocate([256], "int8", "local") + placeholder_d_global = T.allocate([112], "uint8", "global") + ethosu_write_3 = T.allocate([9984], "int8", "global") + ethosu_write_4 = T.allocate([9984], "int8", "global") + ethosu_write_5 = T.allocate([9984], "int8", "global") + placeholder_d_local = T.allocate([256], "int8", "local") + placeholder_global = T.allocate([112], "uint8", "global") + ethosu_write_6 = T.allocate([9984], "int8", "global") + placeholder_d_local_1 = T.allocate([256], "int8", "local") + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1728): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 256, placeholder_d_d_global[0], dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 112, placeholder_d_global[0], dtype="handle")) + with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 9920): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 55, 55, 3, 55, 0, 55, placeholder[0], 0, 0, 0, T.float32(0.0027450970374047756), -128, "NHWC", 165, 3, 1, "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 208, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[208], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 4, 16, 16, dtype="handle")) + with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500): + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 112, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle")) + with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 330): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(0.0078157493844628334), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 64, 0, placeholder_d_global[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle")) + with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, 
"NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle")) + with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458): + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle")) + with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464): + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(0.002753810491412878), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_global[0], 64, 0, placeholder_global[64], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_local_1[0], dtype="handle")) + T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5232) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 96, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 5, 24, 8, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore) + reference_mod = ModuleAfter + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + if __name__ == "__main__": pytest.main([__file__]) From 0d0c361659fd37ad30dc797d192a5cc2f23cd7be Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Mon, 6 Jun 2022 16:28:03 +0100 Subject: [PATCH 2/3] Linting... --- src/tir/contrib/ethosu/passes.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 8a5d1ef18225..8ae6681ad0b5 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -311,10 +311,7 @@ class CopyComputeReorderingMutator : public StmtExprMutator { new_seq.push_back(op.compute_op); } } - - } - // Reorder the copies and computes by a fixed number of movements - else { + } else { // Each copy statement to a buffer with global scope is moved up // at most `_max_copy_movements` times. 
for (size_t index = 0; index < new_seq.size(); ++index) { From dd29f9f9ef0f05fa40eb7a570ac31feef0753a70 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Thu, 14 Jul 2022 12:42:59 +0100 Subject: [PATCH 3/3] Rebase and respond to comments --- .../backend/contrib/ethosu/tir/compiler.py | 1 + .../backend/contrib/ethosu/tir/passes.py | 19 +- src/tir/contrib/ethosu/passes.cc | 50 +-- .../test_copy_compute_reordering.py | 293 +++++++----------- 4 files changed, 136 insertions(+), 227 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index 85c6df4c7d0c..aaac59ad4a52 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod) mod = ethosu_passes.HoistAllocates()(mod) + mod = tvm.tir.transform.RemoveNoOp()(mod) # MergeConstant pass currently does not support striped schedules. # It requires further investigation. if not util.is_striping_enabled(): diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index 77d2fff230df..cc94c6e816cd 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -926,9 +926,22 @@ def CopyComputeReordering( into the SHRAM and that already happens in parallel with copying weights into the weights encoder. - If reorder_by_cycles is set, we use the cycle hint to decide the reordering. If it is not set, - we move the copies up by a fixed number of movements, either by max_copy_movements if it is - specified, or by default value of 1. + If reorder_by_cycles is set, we use the compute_cycles_hint to decide the reordering. If it is + not set, we move the copies up by a fixed number of movements, either by max_copy_movements if + it is specified, or by default value of 1. + + If reordering based on the cycle count is enabled, we try to achieve further copy latency + hiding with a two step algorithm: + (1) Move all the global copies (i.e. copies that copy a constant into SRAM for conv2d or + depthwise_conv2d) above a preceding compute op. If in general the computes take longer than + copies, this should be enough to hide the copy latencies. + (2) If there are some global copies that take longer than the computes, we might be able to + hide them further by moving them further up in a graph since in general there are more compute + ops than copy ops in a graph (as only conv2d and depthwise_conv2d have constants associated + with them). The algortithm checks whether a copy is hidden and if it is not, it checks if a + preceding compute op has a preceding copy and if it doesn't it moves the copy that we try to + hide further up. It keeps moving the copy until it can't move it any further or until the + latency is hidden. 
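    As an illustration of the two-step heuristic described above, the following is a minimal standalone sketch that applies the same idea to a toy list of compute ops annotated with cycle counts. The OpWithCopy container, the reorder_by_cycles helper and the cycle numbers are hypothetical stand-ins for the OpWithCopies cells and pragma_compute_cycles_hint values handled by the pass; they are illustrative only and not code from this patch.

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class OpWithCopy:
        """A compute op plus the cycle count of the global copy feeding it, if any."""

        compute_cycles: int
        copy_cycles: Optional[int] = None  # None: no global copy attached to this op


    def reorder_by_cycles(ops: List[OpWithCopy]) -> List[OpWithCopy]:
        # Step 1: move every global copy up past one preceding compute op.
        for idx in range(1, len(ops)):
            if ops[idx].copy_cycles is not None and ops[idx - 1].copy_cycles is None:
                ops[idx - 1].copy_cycles = ops[idx].copy_cycles
                ops[idx].copy_cycles = None
        # Step 2: a copy that is still longer than the compute hiding it keeps moving
        # up past computes that do not already hide another copy, accumulating their
        # cycles, until it is hidden or cannot move any further.
        idx = len(ops) - 1
        while idx > 0:
            if ops[idx].copy_cycles is not None:
                copy_cycles = ops[idx].copy_cycles
                compute_cycles = ops[idx].compute_cycles
                while (
                    compute_cycles < copy_cycles
                    and idx > 0
                    and ops[idx - 1].copy_cycles is None
                ):
                    compute_cycles += ops[idx - 1].compute_cycles
                    ops[idx - 1].copy_cycles = ops[idx].copy_cycles
                    ops[idx].copy_cycles = None
                    idx -= 1
            idx -= 1
        return ops


    # The 1536-cycle copy is longer than any single compute, so step 2 moves it up
    # until two computes (800 + 900 >= 1536) are available to hide it.
    print(
        reorder_by_cycles(
            [
                OpWithCopy(compute_cycles=800),
                OpWithCopy(compute_cycles=900),
                OpWithCopy(compute_cycles=400, copy_cycles=1536),
            ]
        )
    )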
Parameters ---------- diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 8ae6681ad0b5..2f6fa8f3ea33 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -278,14 +278,14 @@ class CopyComputeReorderingMutator : public StmtExprMutator { for (size_t idx = ops.size() - 1; idx > 0; --idx) { if (ops[idx].global_copy.as()) { // Check whether the copy is hidden - int64_t copy_cycles{get_cycles(ops[idx].global_copy)}; - int64_t compute_cycles{get_cycles(ops[idx].compute_op)}; + int64_t copy_cycles{GetStmtCycles(ops[idx].global_copy)}; + int64_t compute_cycles{GetStmtCycles(ops[idx].compute_op)}; bool is_hidden = compute_cycles >= copy_cycles; // If the previous compute op is not already hiding another copy, move the copy back, so // that it would be hidden by multiple computes while (!is_hidden && !ops[idx - 1].global_copy.as() && (idx > 0)) { - int64_t new_compute_cycles{get_cycles(ops[idx - 1].compute_op)}; + int64_t new_compute_cycles{GetStmtCycles(ops[idx - 1].compute_op)}; ops[idx - 1].global_copy = ops[idx].global_copy; ops[idx].global_copy = {}; compute_cycles += new_compute_cycles; @@ -317,7 +317,8 @@ class CopyComputeReorderingMutator : public StmtExprMutator { for (size_t index = 0; index < new_seq.size(); ++index) { if (GetStmtType(new_seq[index]) == StmtType::global_copy) { int lower = std::max(0, static_cast(index) - _max_copy_movements); - for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) { + for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); + --i) { std::swap(new_seq[i - 1], new_seq[i]); } } @@ -329,44 +330,11 @@ class CopyComputeReorderingMutator : public StmtExprMutator { return Stmt{seq_stmt_node}; } - int64_t get_cycles(const Stmt& stmt) { - auto attr_node{stmt.as()}; - ICHECK(attr_node) << "The cycle count attribute is missing"; - return attr_node->value.as()->value; - } - - tvm::runtime::Array get_stmt_args(const Stmt& stmt) { - Stmt eval_stmt = stmt; - if (const auto* attr_stmt = eval_stmt.as()) { - eval_stmt = attr_stmt->body; - } - - auto eval_node{eval_stmt.as()}; - ICHECK(eval_node) << "Expected statement to be an evaluate node, but was " - << eval_stmt->GetTypeKey(); - auto call_node{eval_node->value.as()}; - ICHECK(call_node) << "Expected expression to be a call node, but was " - << eval_node->value->GetTypeKey(); - return call_node->args; - } + bool stmt_is_global_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::global_copy; } - bool stmt_is_global_copy(const Stmt& stmt) { - auto args{get_stmt_args(stmt)}; - return args[0].as()->value == "ethosu_copy" && - args[3].as()->buffer.scope() == "global"; - } - - bool stmt_is_local_copy(const Stmt& stmt) { - auto args{get_stmt_args(stmt)}; - return args[0].as()->value == "ethosu_copy" && - args[3].as()->buffer.scope() == "local"; - } - - bool stmt_is_copy(const Stmt& stmt) { - return stmt_is_global_copy(stmt) || stmt_is_local_copy(stmt); - } + bool stmt_is_local_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::local_copy; } - bool stmt_is_compute_op(const Stmt& stmt) { return !stmt_is_copy(stmt); } + bool stmt_is_compute_op(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::compute; } /*! The maximum number of movements allowed for a copy. 
*/ int _max_copy_movements; @@ -401,7 +369,7 @@ tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements, ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value()); auto reorder = reorder_by_cycles.value_or( ctx->GetConfig(kCopyComputeReorderingReorderByCycles, Bool(false)).value()); - return CopyComputeReorderingMutator(copy_movements, reorder)(f); + return CopyComputeReorderingMutator(copy_movements.IntValue(), reorder)(f); }; return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, "tir.contrib.ethos-u.CopyComputeReordering", {}); diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py index d6f0c178e56e..f348fd7f5a77 100644 --- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -473,152 +473,97 @@ def test_reordering_based_on_cycles(): @tvm.script.ir_module class ModuleBefore: @T.prim_func - def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded: T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"], placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6: T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"], ethosu_write: T.Buffer[(572,), "int8"]) -> None: + def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_encoded_3: T.Buffer[112, "uint8"], ethosu_write: T.Buffer[43672, "int8"]) -> None: # function attr dict - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True}) ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32") ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32") ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32") ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32") - ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32") nn = T.var("int32") nn_1 = T.var("int32") nn_2 = T.var("int32") nn_3 = T.var("int32") nn_4 = T.var("int32") nn_5 = T.var("int32") - nn_6 = T.var("int32") - nn_7 = T.var("int32") - nn_8 = T.var("int32") - nn_9 = T.var("int32") - T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8", data=placeholder.data) - T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4], dtype="int8") - T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1], dtype="int8") - T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4], dtype="int8") - T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1], dtype="int8") - T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4], dtype="int8") - T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8", data=ethosu_write.data) # body - placeholder_d_d_global = T.allocate([288], "uint8", "global") - ethosu_write_2 = T.allocate([256], "int8", "global") - placeholder_d_d_global_2 = T.allocate([128], "uint8", "global") - ethosu_write_3 = T.allocate([256], "int8", "global") - placeholder_d_d_global_4 = T.allocate([288], "uint8", "global") - ethosu_write_4 = T.allocate([256], "int8", "global") - ethosu_write_5 = T.allocate([256], "int8", "global") - ethosu_write_6 = T.allocate([324], "int8", "global") - placeholder_d_global = T.allocate([128], "uint8", "global") - ethosu_write_7 = T.allocate([324], "int8", "global") - ethosu_write_8 = T.allocate([484], "int8", "global") 
-            ethosu_write_9 = T.allocate([484], "int8", "global")
-            ethosu_write_10 = T.allocate([484], "int8", "global")
-            placeholder_global = T.allocate([144], "uint8", "global")
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 576):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 192):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 300):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 576):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 500):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 36, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_6, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_7, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_8, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 768):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle"))
-            T.attr(T.iter_var(nn_9, None, "DataPar", ""), "pragma_compute_cycles_hint", 504)
-            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 44, 4, 1, "int8", 13, 11, 4, 13, 0, 11, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 44, 4, 1, 3, 1, 1, 1, 1, 1, placeholder_global[0], 96, T.int8(-1), T.int8(-1), 12, placeholder_global[96], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 14, 12, 8, dtype="handle"))
-
+            placeholder_d_global = T.allocate([208], "uint8", "global")
+            placeholder_d_global_1 = T.allocate([112], "uint8", "global")
+            placeholder_d_global_2 = T.allocate([96], "uint8", "global")
+            placeholder_d_global_3 = T.allocate([112], "uint8", "global")
+            ethosu_write_1 = T.allocate([195168], "int8", "global")
+            ethosu_write_2 = T.allocate([184800], "int8", "global")
+            ethosu_write_3 = T.allocate([174688], "int8", "global")
+            ethosu_write_4 = T.allocate([174688], "int8", "global")
+            ethosu_write_5 = T.allocate([174688], "int8", "global")
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
+            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250):
+                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_global_1[0], dtype="handle"))
+            with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 467):
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_1[0], 64, 0, placeholder_d_global_1[64], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global_2[0], dtype="handle"))
+            with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 441):
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_2[0], 48, 0, placeholder_d_global_2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+            with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 112, placeholder_d_global_3[0], dtype="handle"))
+            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 22340)
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write[0], 0, 0, 0, T.float32(0.0057619437575340271), -128, "NHWC", 424, 4, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_3[0], 64, 0, placeholder_d_global_3[64], 48, 1, 2, 1, 2, "NONE", 0, 0, "TFL", "NONE", 14, 18, 8, dtype="handle"))
     @tvm.script.ir_module
     class ModuleAfter:
         @T.prim_func
-        def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded: T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"], placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6: T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"], ethosu_write: T.Buffer[(572,), "int8"]) -> None:
+        def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_encoded_3: T.Buffer[112, "uint8"], ethosu_write: T.Buffer[43672, "int8"]) -> None:
             # function attr dict
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
             ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32")
-            ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32")
             nn = T.var("int32")
             nn_1 = T.var("int32")
             nn_2 = T.var("int32")
             nn_3 = T.var("int32")
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
-            nn_6 = T.var("int32")
-            nn_7 = T.var("int32")
-            nn_8 = T.var("int32")
-            nn_9 = T.var("int32")
-            T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4], dtype="int8", data=placeholder_encoded.data)
-            T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1], dtype="int8", data=placeholder_encoded_2.data)
-            T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4], dtype="int8", data=placeholder_encoded_4.data)
-            T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1], dtype="int8", data=placeholder_encoded_6.data)
-            T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4], dtype="int8", data=placeholder_encoded_8.data)
-            T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8", data=ethosu_write.data)
             # body
-            placeholder_d_d_global = T.allocate([288], "uint8", "global")
-            ethosu_write_2 = T.allocate([256], "int8", "global")
-            placeholder_d_d_global_2 = T.allocate([128], "uint8", "global")
-            ethosu_write_3 = T.allocate([256], "int8", "global")
-            placeholder_d_d_global_4 = T.allocate([288], "uint8", "global")
-            ethosu_write_4 = T.allocate([256], "int8", "global")
-            ethosu_write_5 = T.allocate([256], "int8", "global")
-            ethosu_write_6 = T.allocate([324], "int8", "global")
-            placeholder_d_global = T.allocate([128], "uint8", "global")
-            ethosu_write_7 = T.allocate([324], "int8", "global")
-            ethosu_write_8 = T.allocate([484], "int8", "global")
-            ethosu_write_9 = T.allocate([484], "int8", "global")
-            ethosu_write_10 = T.allocate([484], "int8", "global")
-            placeholder_global = T.allocate([144], "uint8", "global")
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 576):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle"))
-            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 2304):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 576):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 320):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1), T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 192):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 300):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 500):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11, "NHWC", 36, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_6, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 768):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn_7, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            with T.attr(T.iter_var(nn_8, None, "DataPar", ""), "pragma_compute_cycles_hint", 432):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12, 8, dtype="handle"))
-            T.attr(T.iter_var(nn_9, None, "DataPar", ""), "pragma_compute_cycles_hint", 504)
-            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 44, 4, 1, "int8", 13, 11, 4, 13, 0, 11, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 44, 4, 1, 3, 1, 1, 1, 1, 1, placeholder_global[0], 96, T.int8(-1), T.int8(-1), 12, placeholder_global[96], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 14, 12, 8, dtype="handle"))
+            placeholder_d_global = T.allocate([208], "uint8", "global")
+            placeholder_d_global_1 = T.allocate([112], "uint8", "global")
+            placeholder_d_global_2 = T.allocate([96], "uint8", "global")
+            placeholder_d_global_3 = T.allocate([112], "uint8", "global")
+            ethosu_write_1 = T.allocate([195168], "int8", "global")
+            ethosu_write_2 = T.allocate([184800], "int8", "global")
+            ethosu_write_3 = T.allocate([174688], "int8", "global")
+            ethosu_write_4 = T.allocate([174688], "int8", "global")
+            ethosu_write_5 = T.allocate([174688], "int8", "global")
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_global_1[0], dtype="handle"))
+            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250):
+                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global_2[0], dtype="handle"))
+            with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 467):
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_1[0], 64, 0, placeholder_d_global_1[64], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 112, placeholder_d_global_3[0], dtype="handle"))
+            with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 441):
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_2[0], 48, 0, placeholder_d_global_2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+            with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 22340)
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write[0], 0, 0, 0, T.float32(0.0057619437575340271), -128, "NHWC", 424, 4, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_3[0], 64, 0, placeholder_d_global_3[64], 48, 1, 2, 1, 2, "NONE", 0, 0, "TFL", "NONE", 14, 18, 8, dtype="handle"))
     # fmt: on
 
     test_mod = CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore)
@@ -631,9 +576,9 @@ def test_reordering_based_on_cycles_luts_present():
     @tvm.script.ir_module
     class ModuleBefore:
         @T.prim_func
-        def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256, "uint8"], placeholder_encoded_2: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_4: T.Buffer[112, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[2496, "int8"]) -> None:
+        def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[46200, "int8"]) -> None:
             # function attr dict
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
             ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
@@ -643,57 +588,47 @@ def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256,
             nn_3 = T.var("int32")
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
-            T.preflattened_buffer(placeholder, [1, 55, 55, 3], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 3], dtype="int8")
-            T.preflattened_buffer(placeholder_encoded_2, [4, 2, 3, 1], dtype="int8")
-            T.preflattened_buffer(placeholder_1, [256], dtype="int8", data=placeholder_1.data)
-            T.preflattened_buffer(placeholder_encoded_4, [4, 2, 3, 1], dtype="int8")
-            T.preflattened_buffer(placeholder_2, [256], dtype="int8", data=placeholder_2.data)
-            T.preflattened_buffer(placeholder_3, [256], dtype="int8", data=placeholder_3.data)
-            T.preflattened_buffer(ethosu_write, [1, 26, 24, 4], dtype="int8", data=ethosu_write.data)
             # body
-            placeholder_d_d_global = T.allocate([256], "uint8", "global")
-            ethosu_write_2 = T.allocate([12544], "int8", "global")
+            placeholder_d_d_global = T.allocate([208], "uint8", "global")
+            placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
+            placeholder_d_global = T.allocate([96], "uint8", "global")
+            ethosu_write_1 = T.allocate([195168], "int8", "global")
             placeholder_local = T.allocate([256], "int8", "local")
-            placeholder_d_global = T.allocate([112], "uint8", "global")
-            ethosu_write_3 = T.allocate([9984], "int8", "global")
-            ethosu_write_4 = T.allocate([9984], "int8", "global")
-            ethosu_write_5 = T.allocate([9984], "int8", "global")
+            ethosu_write_2 = T.allocate([184800], "int8", "global")
+            ethosu_write_3 = T.allocate([184800], "int8", "global")
+            ethosu_write_4 = T.allocate([184800], "int8", "global")
             placeholder_d_local = T.allocate([256], "int8", "local")
-            placeholder_global = T.allocate([112], "uint8", "global")
-            ethosu_write_6 = T.allocate([9984], "int8", "global")
-            placeholder_d_local_1 = T.allocate([256], "int8", "local")
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1728):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 256, placeholder_d_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 9920):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 55, 55, 3, 55, 0, 55, placeholder[0], 0, 0, 0, T.float32(0.0027450970374047756), -128, "NHWC", 165, 3, 1, "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 208, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[208], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 4, 16, 16, dtype="handle"))
+            ethosu_write_5 = T.allocate([184800], "int8", "global")
+            placeholder_d_d_local = T.allocate([256], "int8", "local")
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
+            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668):
+                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 112, placeholder_d_global[0], dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_d_global_1[0], dtype="handle"))
             with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 330):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(0.0078157493844628334), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 64, 0, placeholder_d_global[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_d_global_1[0], 64, 0, placeholder_d_d_global_1[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
             with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
            with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 112, placeholder_global[0], dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(0.002753810491412878), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_global[0], 64, 0, placeholder_global[64], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle"))
-            T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_local_1[0], dtype="handle"))
-            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5232)
-            T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 96, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 5, 24, 8, dtype="handle"))
-
-
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(0.00381289585493505), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 48, 0, placeholder_d_global[48], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_d_local[0], dtype="handle"))
+            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5253)
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 440, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 4, 64, 8, dtype="handle"))
     @tvm.script.ir_module
     class ModuleAfter:
         @T.prim_func
-        def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256, "uint8"], placeholder_encoded_2: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_4: T.Buffer[112, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[2496, "int8"]) -> None:
+        def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[46200, "int8"]) -> None:
             # function attr dict
-            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
             ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
             ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
@@ -703,47 +638,39 @@ def main(placeholder: T.Buffer[9075, "int8"], placeholder_encoded: T.Buffer[256,
             nn_3 = T.var("int32")
             nn_4 = T.var("int32")
             nn_5 = T.var("int32")
-            T.preflattened_buffer(placeholder, [1, 55, 55, 3], dtype="int8", data=placeholder.data)
-            T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 3], dtype="int8", data=placeholder_encoded.data)
-            T.preflattened_buffer(placeholder_encoded_2, [4, 2, 3, 1], dtype="int8", data=placeholder_encoded_2.data)
-            T.preflattened_buffer(placeholder_1, [256], dtype="int8", data=placeholder_1.data)
-            T.preflattened_buffer(placeholder_encoded_4, [4, 2, 3, 1], dtype="int8", data=placeholder_encoded_4.data)
-            T.preflattened_buffer(placeholder_2, [256], dtype="int8", data=placeholder_2.data)
-            T.preflattened_buffer(placeholder_3, [256], dtype="int8", data=placeholder_3.data)
-            T.preflattened_buffer(ethosu_write, [1, 26, 24, 4], dtype="int8", data=ethosu_write.data)
             # body
-            placeholder_d_d_global = T.allocate([256], "uint8", "global")
-            ethosu_write_2 = T.allocate([12544], "int8", "global")
+            placeholder_d_d_global = T.allocate([208], "uint8", "global")
+            placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
+            placeholder_d_global = T.allocate([96], "uint8", "global")
+            ethosu_write_1 = T.allocate([195168], "int8", "global")
            placeholder_local = T.allocate([256], "int8", "local")
-            placeholder_d_global = T.allocate([112], "uint8", "global")
-            ethosu_write_3 = T.allocate([9984], "int8", "global")
-            ethosu_write_4 = T.allocate([9984], "int8", "global")
-            ethosu_write_5 = T.allocate([9984], "int8", "global")
+            ethosu_write_2 = T.allocate([184800], "int8", "global")
+            ethosu_write_3 = T.allocate([184800], "int8", "global")
+            ethosu_write_4 = T.allocate([184800], "int8", "global")
             placeholder_d_local = T.allocate([256], "int8", "local")
-            placeholder_global = T.allocate([112], "uint8", "global")
-            ethosu_write_6 = T.allocate([9984], "int8", "global")
-            placeholder_d_local_1 = T.allocate([256], "int8", "local")
-            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1728):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 256, placeholder_d_d_global[0], dtype="handle"))
+            ethosu_write_5 = T.allocate([184800], "int8", "global")
+            placeholder_d_d_local = T.allocate([256], "int8", "local")
+            with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 112, placeholder_d_global[0], dtype="handle"))
-            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 9920):
-                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 55, 55, 3, 55, 0, 55, placeholder[0], 0, 0, 0, T.float32(0.0027450970374047756), -128, "NHWC", 165, 3, 1, "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 208, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[208], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 4, 16, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_d_global_1[0], dtype="handle"))
+            with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668):
+                T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="handle"))
             with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500):
-                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_4[0], 112, placeholder_global[0], dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global[0], dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle"))
             with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 330):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 28, 28, 4, 28, 0, 28, ethosu_write_2[0], 0, 0, 0, T.float32(0.0095788920298218727), -128, "NHCWB16", 448, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(0.0078157493844628334), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 64, 0, placeholder_d_global[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_d_global_1[0], 64, 0, placeholder_d_d_global_1[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
             with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
             with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458):
-                T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 24, 16, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
             T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle"))
             with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464):
-                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_5[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(0.002753810491412878), -128, "NHCWB16", 384, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_global[0], 64, 0, placeholder_global[64], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 5, 12, 16, dtype="handle"))
-            T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_local_1[0], dtype="handle"))
-            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5232)
-            T.evaluate(T.call_extern("ethosu_pooling", "int8", 26, 24, 4, 26, 0, 24, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 384, 16, 1, "int8", 26, 24, 4, 26, 0, 24, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 96, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 5, 24, 8, dtype="handle"))
+                T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(0.00381289585493505), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 48, 0, placeholder_d_global[48], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_d_local[0], dtype="handle"))
+            T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5253)
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 440, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 4, 64, 8, dtype="handle"))
     # fmt: on
 
     test_mod = CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore)