From 90bec8c33fd36302a567bf07171b29fcafdc74ce Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 27 Sep 2019 11:20:22 -0700 Subject: [PATCH] [ARITH] cleanup the indexmod/div on python side --- python/tvm/autotvm/task/task.py | 4 ++- python/tvm/expr.py | 20 +++++------ src/pass/rewrite_unsafe_select.cc | 6 ++-- tests/python/relay/test_op_level3.py | 14 ++++---- tests/python/relay/test_op_level5.py | 3 +- .../unittest/test_autotvm_flop_calculator.py | 6 ++-- tests/python/unittest/test_codegen_cuda.py | 5 +-- tests/python/unittest/test_ir_builder.py | 5 +-- tests/python/unittest/test_lang_buffer.py | 28 +++++++-------- .../test_pass_rewrite_unsafe_select.py | 2 +- .../unittest/test_schedule_tensorize.py | 9 ++--- topi/python/topi/arm_cpu/bitserial_conv2d.py | 14 ++++++-- topi/python/topi/arm_cpu/conv2d.py | 35 ++++++++++++------- topi/python/topi/cuda/nms.py | 15 +++++--- topi/python/topi/cuda/rcnn/proposal.py | 17 +++++---- topi/python/topi/cuda/sort.py | 21 ++++++----- topi/python/topi/nn/bitserial_conv2d.py | 18 +++++----- topi/python/topi/nn/sparse.py | 5 ++- topi/python/topi/util.py | 9 +++-- topi/python/topi/vision/ssd/multibox.py | 4 +-- topi/python/topi/x86/conv2d_avx_1x1.py | 15 ++++++-- vta/python/vta/ir_pass.py | 33 +++++++++-------- 22 files changed, 177 insertions(+), 111 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 901183f46948..e0db27574898 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -350,7 +350,9 @@ def _count_flop(exp): return _count_flop(exp.value) if isinstance(exp, expr.Var): return 0 - if isinstance(exp, (expr.Add, expr.Sub, expr.Mul, expr.Div, expr.Mod, + if isinstance(exp, (expr.Add, expr.Sub, expr.Mul, + expr.Div, expr.Mod, + expr.FloorDiv, expr.FloorMod, expr.Max, expr.Min, expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, expr.And, expr.Or, expr.Not)): diff --git a/python/tvm/expr.py b/python/tvm/expr.py index a8bd651d6469..5b7c60d819bd 100644 --- a/python/tvm/expr.py +++ b/python/tvm/expr.py @@ -72,23 +72,23 @@ def __rmul__(self, other): return _generic.multiply(other, self) def __div__(self, other): - # if _dtype_is_int(self) and _dtype_is_int(other): - # raise div_ambiguity_error() + if _dtype_is_int(self) and _dtype_is_int(other): + raise div_ambiguity_error() return _generic.divide(self, other) def __rdiv__(self, other): - # if _dtype_is_int(self) and _dtype_is_int(other): - # raise div_ambiguity_error() + if _dtype_is_int(self) and _dtype_is_int(other): + raise div_ambiguity_error() return _generic.divide(other, self) def __truediv__(self, other): - # if _dtype_is_int(self) and _dtype_is_int(other): - # raise div_ambiguity_error() + if _dtype_is_int(self) and _dtype_is_int(other): + raise div_ambiguity_error() return _generic.divide(self, other) def __rtruediv__(self, other): - # if _dtype_is_int(self) and _dtype_is_int(other): - # raise div_ambiguity_error() + if _dtype_is_int(self) and _dtype_is_int(other): + raise div_ambiguity_error() return _generic.divide(other, self) def __floordiv__(self, other): @@ -100,8 +100,8 @@ def __rfloordiv__(self, other): return _generic.divide(other, self) def __mod__(self, other): - # raise div_ambiguity_error() - return _make._OpMod(self, other) + raise div_ambiguity_error() + # return _make._OpMod(self, other) def __neg__(self): neg_one = _api_internal._const(-1, self.dtype) diff --git a/src/pass/rewrite_unsafe_select.cc b/src/pass/rewrite_unsafe_select.cc index 871efcae615d..62db0b414be1 100644 --- a/src/pass/rewrite_unsafe_select.cc +++ b/src/pass/rewrite_unsafe_select.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -64,6 +64,8 @@ class UnsafeExprDetector : public ExprFunctor { bool VisitExpr_(const Mul* op) final { return BinaryOp(op); } bool VisitExpr_(const Div* op) final { return BinaryOp(op); } bool VisitExpr_(const Mod* op) final { return BinaryOp(op); } + bool VisitExpr_(const FloorDiv* op) final { return BinaryOp(op); } + bool VisitExpr_(const FloorMod* op) final { return BinaryOp(op); } bool VisitExpr_(const Min* op) final { return BinaryOp(op); } bool VisitExpr_(const Max* op) final { return BinaryOp(op); } bool VisitExpr_(const EQ* op) final { return BinaryOp(op); } diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 424462fbe0c4..2d92489328af 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -373,6 +373,8 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None): yy = run_infer_type(y.astuple()) assert yy.checked_type == ret_type + idxd = tvm.indexdiv + d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") axis = tvm.var("axis") verify_split((5, 5, 2, 2), 5, @@ -393,15 +395,15 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None): axis=0) verify_split((d1, d2, d3, d4), 4, relay.ty.TupleType(tvm.convert([ - relay.ty.TensorType((d1, d2, d3/4, d4), "float32"), - relay.ty.TensorType((d1, d2, d3/4, d4), "float32"), - relay.ty.TensorType((d1, d2, d3/4, d4), "float32"), - relay.ty.TensorType((d1, d2, d3/4, d4), "float32")])), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), + relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32")])), axis=2) verify_split((d1, d2, d3, d4), 2, relay.ty.TupleType(tvm.convert([ - relay.ty.TensorType((d1/2, d2, d3, d4), "float32"), - relay.ty.TensorType((d1/2, d2, d3, d4), "float32")])), + relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), + relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32")])), axis=0) verify_split((d1, d2, d3, d4), (2, 4, 7), relay.ty.TupleType(tvm.convert([ diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index f4ac673cf378..8c107351c81a 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -487,8 +487,9 @@ def verify_yolo_reorg(shape, stride, out_shape): assert zz.checked_type == relay.ty.TensorType(out_shape, "float32") n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") + idxd = tvm.indexdiv verify_yolo_reorg((n, c, 20, 20), 10, (n, c*10*10, 2, 2)) - verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, h/2, w/2)) + verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, idxd(h, 2), idxd(w, 2))) def test_yolo_reorg(): def verify_yolo_reorg(shape, stride): diff --git a/tests/python/unittest/test_autotvm_flop_calculator.py b/tests/python/unittest/test_autotvm_flop_calculator.py index 54ade9a05267..5cafd02c45bf 100644 --- a/tests/python/unittest/test_autotvm_flop_calculator.py +++ b/tests/python/unittest/test_autotvm_flop_calculator.py @@ -60,14 +60,14 @@ def test_pack_gemm(): k = tvm.reduce_axis((0, L)) bn = 4 - fld = tvm.floordiv - flm = tvm.floormod + idxd = tvm.indexdiv + idxm = tvm.indexmod A_pack = tvm.compute((N // bn, L, bn), lambda i, j, k: A[i * bn + k][j]) B_pack = tvm.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j]) C_pack = tvm.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj: tvm.sum(A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k])) - C = tvm.compute((N, M), lambda i, j: C_pack[fld(i, bn)][fld(j, bn)][flm(i, bn)][flm(j, bn)]) + C = tvm.compute((N, M), lambda i, j: C_pack[idxd(i, bn)][idxd(j, bn)][idxm(i, bn)][idxm(j, bn)]) s = tvm.create_schedule([C.op]) assert compute_flop(s) == 2 * N * L * M diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index 63aaf2146ca8..aa3a5374ce48 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -37,7 +37,7 @@ def check_cuda(dtype, n, lanes): print("skip because gpu does not support int8") return A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) - B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B') + B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B') s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, bx) @@ -165,9 +165,10 @@ def test_cuda_shuffle(): print("skip because cuda is not enabled..") return + idxm = tvm.indexmod a = tvm.placeholder((64, ), 'int32') b = tvm.placeholder((64, ), 'int32') - c = tvm.compute((64, ), lambda x: a[x] + b[x - (x % 4) + (3 - x % 4)]) + c = tvm.compute((64, ), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))]) sch = tvm.create_schedule(c.op) x = c.op.axis[0] xo, xi = sch[c].split(x, 4) diff --git a/tests/python/unittest/test_ir_builder.py b/tests/python/unittest/test_ir_builder.py index ef58174d4474..c910c62424f0 100644 --- a/tests/python/unittest/test_ir_builder.py +++ b/tests/python/unittest/test_ir_builder.py @@ -109,14 +109,15 @@ def test_gpu(): dtype = "float32" A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') - fld = tvm.floordiv + idxd = tvm.indexdiv + def test_device_ir(A, B, C): n = A.shape[0] max_threads = 32 ib = tvm.ir_builder.create() bx = tvm.thread_axis("blockIdx.x") tx = tvm.thread_axis("threadIdx.x") - ib.scope_attr(bx, "thread_extent", fld(n+max_threads-1, max_threads)) + ib.scope_attr(bx, "thread_extent", idxd(n+max_threads-1, max_threads)) ib.scope_attr(tx, "thread_extent", max_threads) idx = bx.var * max_threads + tx.var Aptr = ib.buffer_ptr(A) diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py index 9ad8b62821cf..32c17452269e 100644 --- a/tests/python/unittest/test_lang_buffer.py +++ b/tests/python/unittest/test_lang_buffer.py @@ -94,31 +94,31 @@ def test_buffer_index_merge_mult_mod(): def assert_simplified_equal(index_simplified, index_direct): assert tvm.ir_pass.Equal(index_simplified, index_direct),\ "index_simplified=%s, index_direct=%s" %(index_simplified, index_direct) - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxd = tvm.indexdiv + idxm = tvm.indexmod # Test Case1 index_simplified = A_stride.vload( - (idxdiv(idxmod(k0, k1), s), idxmod(idxmod(k0, k1), s) + idxdiv(k0, k1) * k1)) + (idxd(idxm(k0, k1), s), idxm(idxm(k0, k1), s) + idxd(k0, k1) * k1)) index_direct = A_stride.vload((0, k0)) assert_simplified_equal(index_simplified, index_direct) # Test Case2 - index_simplified = A.vload((idxdiv(idxmod(k0, idxdiv(k1, s)), n), - idxmod(idxmod(k0, idxdiv(k1, s)), n) + idxmod(k0, k1))) - index_direct = A.vload((0, idxmod(k0, k1) + idxmod(k0, idxdiv(k1, s)))) + index_simplified = A.vload((idxd(idxm(k0, idxd(k1, s)), n), + idxm(idxm(k0, idxd(k1, s)), n) + idxm(k0, k1))) + index_direct = A.vload((0, idxm(k0, k1) + idxm(k0, idxd(k1, s)))) assert_simplified_equal(index_simplified, index_direct) # Test Case3 - index_simplified = A.vload((idxdiv((idxdiv(k0, idxdiv(k1, s)) * idxdiv(k1, s)), n) + - idxdiv(idxmod(k0, idxdiv(k1, s)), n), - idxmod((idxdiv(k0, idxdiv(k1, s)) * idxdiv(k1, s)), n) + - idxmod(idxmod(k0, idxdiv(k1, s)), n))) + index_simplified = A.vload((idxd((idxd(k0, idxd(k1, s)) * idxd(k1, s)), n) + + idxd(idxm(k0, idxd(k1, s)), n), + idxm((idxd(k0, idxd(k1, s)) * idxd(k1, s)), n) + + idxm(idxm(k0, idxd(k1, s)), n))) index_direct = A.vload((0, k0)) assert_simplified_equal(index_simplified, index_direct) # Test Case4 (not able to simplify) - index_simplified = A.vload((idxdiv(idxmod(k0, idxdiv(k1, s)), n), - idxmod(idxmod(k0, idxdiv(k1, n)), n) + idxmod(k0, k1))) - index_direct = A.vload((0, idxdiv(idxmod(k0, idxdiv(k1, s)), n) * n + - (idxmod(idxmod(k0, idxdiv(k1, n)), n) + idxmod(k0, k1)))) + index_simplified = A.vload((idxd(idxm(k0, idxd(k1, s)), n), + idxm(idxm(k0, idxd(k1, n)), n) + idxm(k0, k1))) + index_direct = A.vload((0, idxd(idxm(k0, idxd(k1, s)), n) * n + + (idxm(idxm(k0, idxd(k1, n)), n) + idxm(k0, k1)))) assert_simplified_equal(index_simplified, index_direct) diff --git a/tests/python/unittest/test_pass_rewrite_unsafe_select.py b/tests/python/unittest/test_pass_rewrite_unsafe_select.py index b2d73ec00ce8..4c42899be62a 100644 --- a/tests/python/unittest/test_pass_rewrite_unsafe_select.py +++ b/tests/python/unittest/test_pass_rewrite_unsafe_select.py @@ -28,7 +28,7 @@ def test_rewrite_Select(): tvm.expr.Select(i > 1, A[i-1], 1.0) > 0.0, A[i], 0.1) zz = tvm.ir_pass.RewriteUnsafeSelect(tvm.make.Evaluate(z)).value - a = tvm.expr.Select(i>10, y, z) + a = tvm.expr.Select(tvm.floordiv(i, 4) > 10, y, z) aa = tvm.ir_pass.RewriteUnsafeSelect(tvm.make.Evaluate(a)).value assert yy.name == "tvm_if_then_else" assert zz.name == "tvm_if_then_else" diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py index 4bad959c2453..59adf0cc7e99 100644 --- a/tests/python/unittest/test_schedule_tensorize.py +++ b/tests/python/unittest/test_schedule_tensorize.py @@ -221,14 +221,15 @@ def check_rfactor_no_reset_multi_reduction(factor, rfactor): # This tests whether algorithm and intrinsics expressions are simplified # as much as possible first and then checked for equality. See Issue #696 def test_tensorize_op(): - tdiv = tvm.truncdiv - tmod = tvm.truncmod + idxd = tvm.indexdiv + idxm = tvm.indexmod + def op_intrin(): bh = 9 bw = 9 x = tvm.placeholder((5, 5), name='A') y = tvm.compute((bh, bw), - lambda i, j: x[tdiv(j,3) + tmod(i,3), tmod(j,3)+ tdiv(i,3)]) + lambda i, j: x[idxd(j,3) + idxm(i,3), idxm(j,3)+ idxd(i,3)]) def intrin_func(ins, outs): xx, = ins @@ -239,7 +240,7 @@ def intrin_func(ins, outs): return tvm.decl_tensor_intrin(y.op, intrin_func) A = tvm.placeholder((5, 5), name='A') - B = tvm.compute((9,9), lambda i, j: A[tdiv(j,3) + tmod(i,3), tmod(j,3) + tdiv(i,3)]) + B = tvm.compute((9,9), lambda i, j: A[idxd(j,3) + idxm(i,3), idxm(j,3) + idxd(i,3)]) bt = op_intrin() s = tvm.create_schedule(B.op) diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index 072c187ee294..9b8360dd1427 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -70,6 +70,9 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh OW = (PAD_W - KW) // WSTR + 1 oshape = (1, OH, OW, CO) + idxd = tvm.indexdiv + idxm = tvm.indexmod + # Pad input channels of weights and data when it is not a multiple of 8 if CI_packed % 8 != 0: CI_PAD = CI_packed % 8 @@ -106,7 +109,8 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC, len(kernel.shape) == 4) - if kernel_vec.shape[-1] % 8 != 0 and CI_PAD != 0: + idxm = tvm.indexmod + if idxm(kernel_vec.shape[-1], 8) != 0 and CI_PAD != 0: kernel_vec = pad(kernel_vec, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, CI_PAD]) N, H, W, IB, CI = data_q.shape @@ -147,8 +151,12 @@ def _unipolar_conv(n, h, w, co, vh, vw, vc): else: conv_vec = tvm.compute(ovshape, _bipolar_conv, name='conv_vec', tag='bipolar') - conv = tvm.compute(oshape, lambda n, h, w, co: - conv_vec[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), + + conv = tvm.compute(oshape, + lambda n, h, w, co: + conv_vec[n, + idxd(h, VH), idxd(w, VW), idxd(co, VC), + idxm(h, VH), idxm(w, VW), idxm(co, VC)].astype(out_dtype), name='conv', tag='spatial_bitserial_conv_nhwc') return conv diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 73a97d2bb33c..f5cbbf0f7bad 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -171,6 +171,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + idxd = tvm.indexdiv + idxm = tvm.indexmod + r = KW m = tile_size alpha = m + r - 1 @@ -190,10 +193,11 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt VK = cfg['tile_k'].size[-1] # pack input tile - input_tile = tvm.compute((C, P // VP, alpha, alpha, VP), + input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP), lambda c, b, eps, nu, bb: - data_pad[(b*VP+bb) // (nH*nW)][c][(b*VP+bb) // nW % nH * m + eps] - [(b*VP+bb) % nW * m + nu], + data_pad[idxd(b*VP + bb, nH*nW), c, + idxm(idxd(b*VP + bb, nW), nH) * m + eps, + idxm(b*VP + bb, nW) * m + nu], name='d') # transform kernel @@ -202,22 +206,22 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt else: r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') - U = tvm.compute((alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk: + U = tvm.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk: tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') - V = tvm.compute((alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb: + V = tvm.compute((alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb: tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V') # batch gemm c = tvm.reduce_axis((0, C), name='c') M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b: - tvm.sum(U[eps][nu][k // VK][c][k % VK] * - V[eps][nu][b // VP][c][b % VP], axis=c), name='M') + tvm.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * + V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c), name='M') # inverse transform r_eps = tvm.reduce_axis((0, alpha), 'r_eps') @@ -228,7 +232,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt # unpack output output = tvm.compute((N, K, H, W), lambda n, k, h, w: - Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m], + Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), + idxm(h, m), idxm(w, m)], name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd @@ -517,6 +522,8 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) + idxd = tvm.indexdiv + if groups == 1: # query config of this workload workload = autotvm.task.args_to_workload( @@ -535,7 +542,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): # Store the same config for the altered operator (workload) new_data = data - new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype) + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d) dispatch_ctx.update(target, new_workload, cfg) @@ -553,7 +560,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): weight = F.nn.contrib_conv2d_winograd_weight_transform(copy_inputs[1], tile_size=tile_size) weight = F.reshape(weight, - newshape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI)) + newshape=(KH + tile_size - 1, + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) weight = F.transpose(weight, axes=[0, 1, 2, 4, 3]) copy_inputs[1] = weight @@ -561,7 +570,9 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): # Store the same config for the altered operator (workload) new_data = data - new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC), + new_weight = tvm.placeholder((KH + tile_size - 1, + KH + tile_size -1, + idxd(CO, VC), CI, VC), kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_weight, strides, padding, dilation, @@ -612,7 +623,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): # Store the same config for the altered operator (workload) new_data = data CO, M, KH, KW = get_const_tuple(kernel.shape) - new_kernel = tvm.placeholder((CO // VC, M, KH, KW, VC), dtype=kernel.dtype) + new_kernel = tvm.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 6ff8a79d3630..33fc7249802b 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -243,14 +243,16 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx): ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx new_range = num_anchors // elem_per_thread + 1 + idxd = tvm.indexdiv + idxm = tvm.indexmod # Scan: Downsweep: with ib. if_scope(tid < batch_size * num_anchors): - i = tid // num_anchors # number of batches - j = tid % num_anchors # number of anchors + i = idxd(tid, num_anchors) # number of batches + j = idxm(tid, num_anchors) # number of anchors with ib.if_scope(j < elem_per_thread): idx[tid] = idx_in[tid] with ib.else_scope(): - idx[tid] = idx_in[tid] + partial[i * new_range + j // elem_per_thread - 1] + idx[tid] = idx_in[tid] + partial[i * new_range + idxd(j, elem_per_thread) - 1] return ib.get() @@ -303,9 +305,12 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out): ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx + idxd = tvm.indexdiv + idxm = tvm.indexmod + with ib.if_scope(tid < batch_size * num_anchors): - i = tid // num_anchors - j = tid % num_anchors + i = idxd(tid, num_anchors) + j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length with ib.if_scope(flag[tid] > 0): with ib.for_range(0, elem_length) as k: diff --git a/topi/python/topi/cuda/rcnn/proposal.py b/topi/python/topi/cuda/rcnn/proposal.py index 06226d1b40b9..11c16f7270c2 100644 --- a/topi/python/topi/cuda/rcnn/proposal.py +++ b/topi/python/topi/cuda/rcnn/proposal.py @@ -79,15 +79,18 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r p_im_info = ib.buffer_ptr(im_info_buf) p_out = ib.buffer_ptr(out_buf) + idxm = tvm.indexmod + idxd = tvm.indexdiv + with ib.if_scope(tid < batch * height * width): - w = tid % width - h = (tid // width) % height - b = tid // width // height + w = idxm(tid, width) + h = idxm(idxd(tid, width), height) + b = idxd(idxd(tid, width), height) for k in range(num_anchors): out_index = tid * num_anchors + k - ratio = ratios[k // len(scales)] - scale = scales[k % len(scales)] + ratio = ratios[idxd(k, len(scales))] + scale = scales[idxm(k, len(scales))] anchor = generate_anchor(ratio, scale, feature_stride) im_height = p_im_info[b * 3] im_width = p_im_info[b * 3 + 1] @@ -163,6 +166,8 @@ def argsort_ir(data_buf, out_index_buf): temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") + idxm = tvm.indexmod + with ib.for_range(0, batch, for_type="unroll") as b: start = b * num_bbox for i in range(2): @@ -170,7 +175,7 @@ def argsort_ir(data_buf, out_index_buf): with ib.if_scope(bbox_id < num_bbox): index_out[start + bbox_id] = bbox_id with ib.for_range(0, num_bbox) as k: - offset = start + 2 * tid + (k % 2) + offset = start + 2 * tid + idxm(k, 2) with ib.if_scope( tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])): temp_data[0] = p_data[offset] diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py index c45465e31624..b02c14b47e60 100644 --- a/topi/python/topi/cuda/sort.py +++ b/topi/python/topi/cuda/sort.py @@ -115,6 +115,8 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): ib.emit(tvm.make.Call(None, 'tvm_storage_sync', tvm.convert(['shared']), tvm.expr.Call.Intrinsic, None, 0)) + idxd = tvm.indexdiv + idxm = tvm.indexmod with ib.for_range(0, axis_mul_before) as i: with ib.for_range(0, axis_mul_after) as j: @@ -122,13 +124,13 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): base_idx = i * shape[axis] * axis_mul_after + j # OddEvenTransposeSort with ib.for_range(0, current_sort_num) as k: - with ib.if_scope(tid < (current_sort_num + 1) // 2): - offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after + with ib.if_scope(tid < idxd(current_sort_num + 1, 2)): + offset = base_idx + (2 * tid + idxm(k, 2)) * axis_mul_after if is_ascend: - cond = tvm.all(2 * tid + (k % 2) + 1 < current_sort_num, + cond = tvm.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, values_out[offset] > values_out[offset + axis_mul_after]) else: - cond = tvm.all(2 * tid + (k % 2) + 1 < current_sort_num, + cond = tvm.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, values_out[offset] < values_out[offset + axis_mul_after]) with ib.if_scope(cond): temp_data[0] = values_out[offset] @@ -199,6 +201,9 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend) + idxd = tvm.indexdiv + idxm = tvm.indexmod + with ib.for_range(0, axis_mul_before) as i: with ib.for_range(0, axis_mul_after) as j: current_sort_num = valid_count[i * axis_mul_after + j] @@ -207,10 +212,10 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): output[base_idx + tid * axis_mul_after] = tid # OddEvenTransposeSort with ib.for_range(0, current_sort_num) as k: - with ib.if_scope(tid < (current_sort_num + 1) // 2): - offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after + with ib.if_scope(tid < idxd(current_sort_num + 1, 2)): + offset = base_idx + (2 * tid + idxm(k, 2)) * axis_mul_after with ib.if_scope(tvm.all(is_ascend == 1, \ - 2 * tid + (k % 2) + 1 < current_sort_num, \ + 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ data[offset] > data[offset + axis_mul_after])): temp_data[0] = data[offset] data[offset] = data[offset + axis_mul_after] @@ -219,7 +224,7 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): output[offset] = output[offset + axis_mul_after] output[offset + axis_mul_after] = temp_index[0] with ib.if_scope(tvm.all(is_ascend == 0, \ - 2 * tid + (k % 2) + 1 < current_sort_num, \ + 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ data[offset] < data[offset + axis_mul_after])): temp_data[0] = data[offset] data[offset] = data[offset + axis_mul_after] diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 2faabf2bbf89..932c141450ac 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -313,13 +313,14 @@ def _conv(n, co, h, w, vh, vw, vc): axis=[ci, dh, dw, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv_out') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxd = tvm.indexdiv + idxm = tvm.indexmod return tvm.compute( oshape, lambda n, co, h, w: - conv[n][idxdiv(co, VC)][idxdiv(h, VH)][idxdiv( - w, VW)][idxmod(h, VH)][idxmod(w, VW)][idxmod(co, VC)], + conv[n, + idxd(co, VC), idxd(h, VH), idxd(w, VW), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], name='conv_vec', tag='spatial_bitserial_conv_nchw') @autotvm.register_topi_compute(bitserial_conv2d_nhwc, 'cpu', 'direct') @@ -419,12 +420,13 @@ def _conv(n, h, w, co, vh, vw, vc): conv = tvm.compute(ovshape, _conv, name='conv') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxd = tvm.indexdiv + idxm = tvm.indexmod return tvm.compute( oshape, lambda n, h, w, co: - conv[n][idxdiv(h, VH)][idxdiv(w, VW)][idxdiv( - co, VC)][idxmod(h, VH)][idxmod(w, VW)][idxmod(co, VC)], + conv[n, + idxd(h, VH), idxd(w, VW), idxd(co, VC), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], name='output_unpack', tag='spatial_bitserial_conv_nhwc') @tvm.target.generic_func diff --git a/topi/python/topi/nn/sparse.py b/topi/python/topi/nn/sparse.py index 11116b2e6d2c..584126ea2015 100644 --- a/topi/python/topi/nn/sparse.py +++ b/topi/python/topi/nn/sparse.py @@ -94,12 +94,15 @@ def _compute_block(i, nb_j, j): x_val = data[i, bs_c * block_j + c] return tvm.sum(block_ij_val * x_val, axis=[elem_idx, c]) + idxd = tvm.indexdiv + idxm = tvm.indexmod + bsrmm_block = tvm.compute( (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_bsrmm_block") return tvm.compute( (m, num_blocks * bs_r), - lambda m, n: bsrmm_block[m, n // bs_r, n % bs_r], + lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)], tag="sparse_dense_bsrmm") @tvm.target.generic_func diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index 6de916c8a106..1bf3a102a88f 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -232,10 +232,12 @@ def unravel_index(idx, shape): indices : tuple of int or tvm.expr.IntImm Corresponding coordinate of the 1D index """ + idxd = tvm.indexdiv + idxm = tvm.indexmod indices = [] for i in range(len(shape) - 1, -1, -1): - indices.append(idx % shape[i]) - idx = idx // shape[i] + indices.append(idxm(idx, shape[i])) + idx = idxd(idx, shape[i]) indices = indices[::-1] return indices @@ -257,12 +259,13 @@ def const_matrix(matrix, name="const_matrix"): """ row, col = matrix.shape dtype = str(matrix.dtype) + idxm = tvm.indexmod def select_array(i, j): now = tvm.const(0.0, dtype) for ii in range(row): for jj in range(col): - now = tvm.expr.Select(tvm.all(i % row == ii, j % col == jj), + now = tvm.expr.Select(tvm.all(idxm(i, row) == ii, idxm(j, col) == jj), tvm.const(matrix[ii][jj], dtype), now) return now diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index ca1b4a9eb268..135315b3f086 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -73,10 +73,10 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): center_w = (j + offset_w) * steps_w for k in const_range(num_sizes + num_ratios - 1): if k < num_sizes: - w = sizes[k] * in_height / in_width / 2.0 + w = float32(sizes[k] * in_height) / in_width / 2.0 h = sizes[k] / 2.0 else: - w = sizes[0] * in_height / in_width \ + w = float32(sizes[0] * in_height) / in_width \ * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 count = i * in_width * (num_sizes + num_ratios - 1) \ diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 3d0978cc94a5..6e36e93b9806 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -309,8 +309,15 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o # packing the Filter to let memory access be consecutive for AVX512 intrinsic # Done in pre-compute stage - packw_shape = (kernel_h, kernel_w, num_filter/16, 16*(channel/4), 4) - PackW = tvm.compute(packw_shape, lambda a, b, c, d, e: Filter[a][b][c*16+d%16][d/16*4+e], + idxd = tvm.indexdiv + idxm = tvm.indexmod + + packw_shape = (kernel_h, kernel_w, idxd(num_filter, 16), 16 * idxd(channel, 4), 4) + PackW = tvm.compute(packw_shape, + lambda a, b, c, d, e: + Filter[a, b, + c*16 + idxm(d, 16), + idxd(d, 16) * 4 + e], name="packed_filter") rc = tvm.reduce_axis((0, in_channel), name='rc') @@ -321,7 +328,9 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o lambda nn, yy, xx, ff: tvm.sum( PaddedInput[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * - PackW[ry, rx, ff/16, (rc/4)*16+ff%16, rc%4].astype(out_dtype), axis=[ry, rx, rc]), + PackW[ry, rx, idxd(ff, 16), + idxd(rc, 4) * 16 + idxm(ff, 16), + idxm(rc, 4)].astype(out_dtype), axis=[ry, rx, rc]), name="Conv2d_1x1_Output_int8", tag="conv2d_nhwc_pack_int8") return Output diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 06a1975b0008..12ef7daac731 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -335,6 +335,9 @@ def inject_dma_intrin(stmt_in): Transformed statement """ env = get_env() + idxd = tvm.indexdiv + idxm = tvm.indexmod + def _check_compact(buf): ndim = len(buf.shape) size = tvm.const(1, buf.shape[0].dtype) @@ -369,7 +372,7 @@ def _fold_buffer_dim(buf, scope, elem_block): x_size = 1 x_stride = buf.strides[ndim - base] next_base = base - if not util.equal_const_int(x_stride % elem_block, 0): + if not util.equal_const_int(idxm(x_stride, elem_block), 0): raise RuntimeError( "scope %s need to have block=%d, shape=%s, strides=%s" % ( scope, elem_block, buf.shape, buf.strides)) @@ -394,7 +397,7 @@ def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): raise RuntimeError("Expect buffer type to be %s instead of %s" % (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides - if not util.equal_const_int(buf.elem_offset % elem_block, 0): + if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) if allow_fold: shape, strides = _fold_buffer_dim(buf, scope, elem_block) @@ -421,7 +424,7 @@ def raise_error(): x_size = 1 x_stride = 1 y_size = 1 - return x_size, y_size, x_stride, buf.elem_offset / elem_block + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) if not util.equal_const_int(strides[-2] - elem_block, 0): raise_error() @@ -429,15 +432,15 @@ def raise_error(): x_size = shape[-2] x_stride = shape[-2] y_size = 1 - return x_size, y_size, x_stride, buf.elem_offset / elem_block - if not util.equal_const_int(strides[-3] % elem_block, 0): + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) + if not util.equal_const_int(idxm(strides[-3], elem_block), 0): raise_error() if ndim == 3: x_size = shape[-2] - x_stride = strides[-3] / elem_block + x_stride = idxd(strides[-3], elem_block) y_size = shape[-3] - return x_size, y_size, x_stride, buf.elem_offset / elem_block + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) else: if not util.equal_const_int(strides[-1], 1): @@ -451,7 +454,7 @@ def raise_error(): x_size = 1 x_stride = 1 y_size = 1 - return x_size, y_size, x_stride, buf.elem_offset / elem_block + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) if not util.equal_const_int(strides[-3], elem_block): raise_error() @@ -459,15 +462,15 @@ def raise_error(): x_size = shape[-3] x_stride = shape[-3] y_size = 1 - return x_size, y_size, x_stride, buf.elem_offset / elem_block - if not util.equal_const_int(strides[-4] % elem_block, 0): + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) + if not util.equal_const_int(idxm(strides[-4], elem_block), 0): raise_error() if ndim == 4: x_size = shape[-3] - x_stride = strides[-4] / elem_block + x_stride = idxd(strides[-4], elem_block) y_size = shape[-4] - return x_size, y_size, x_stride, buf.elem_offset / elem_block + return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) raise_error() @@ -765,6 +768,8 @@ def inject_alu_intrin(stmt_in): Transformed statement """ env = get_env() + idxm = tvm.indexmod + def _do_fold(stmt): def _equal(x, y): return tvm.ir_pass.Equal(tvm.ir_pass.Simplify(x - y), 0) @@ -910,10 +915,10 @@ def _flatten_loop(src_coeff, dst_coeff, extents): assert len(extents) != 0 assert tvm.ir_pass.Equal( tvm.ir_pass.Simplify( - src_coeff[-1] % (env.BATCH * env.BLOCK_OUT)), 0) + idxm(src_coeff[-1], env.BATCH * env.BLOCK_OUT)), 0) assert tvm.ir_pass.Equal( tvm.ir_pass.Simplify( - dst_coeff[-1] % (env.BATCH * env.BLOCK_OUT)), 0) + idxm(dst_coeff[-1], env.BATCH * env.BLOCK_OUT)), 0) assert tvm.ir_pass.Equal(src_coeff[-2], 1) assert tvm.ir_pass.Equal(dst_coeff[-2], 1) if env.BATCH > 1: