diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 7b81c55f1111..80959c61af41 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -386,6 +386,7 @@ def lower(sch, stmt = ir_pass.RemoveNoOp(stmt) if not cfg.disable_select_rewriting: stmt = ir_pass.RewriteUnsafeSelect(stmt) + stmt = ir_pass.CanonicalSimplify(stmt) for f in lower_phase3: stmt = f(stmt) # Instrument BoundCheckers diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index d3a727f9389f..8a84846146aa 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -174,4 +174,4 @@ def find_all(op): for out in outputs: find_all(out) - return lower(s, inputs, simple_mode=True) + return lower(s, inputs + [x.output(0) for x in outputs], simple_mode=True) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 299a914b15f5..a1d2299ba7aa 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -53,7 +53,8 @@ def add_debug(stmt): debug_flag) return tvm.make.stmt_seq(debug, stmt) - pass_list = [(1, ptr_alias.lower_ptr_alias), + pass_list = [(0, ir_pass.inject_conv2d_transpose_skip), + (1, ptr_alias.lower_ptr_alias), (1, ir_pass.inject_dma_intrin), (1, ir_pass.inject_skip_copy), (1, ir_pass.annotate_alu_coproc_scope), diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 9800cc6472b3..a530c34b536e 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -272,6 +272,135 @@ def _do_fold(stmt): stmt_in, _do_fold, None, ["AttrStmt"]) +def show_dir(x): + print(type(x), x) + for key in dir(x): + print(key, getattr(x, key)) + + +def _get_gemm_intrin_buffer(): + env = get_env() + wgt_lanes = env.WGT_ELEM_BITS // env.WGT_WIDTH + assert wgt_lanes == env.BLOCK_OUT * env.BLOCK_IN + wgt_shape = (env.BLOCK_OUT, env.BLOCK_IN) + assert wgt_shape[0] * wgt_shape[1] == wgt_lanes + inp_lanes = env.INP_ELEM_BITS // env.INP_WIDTH + assert inp_lanes == env.BATCH * env.BLOCK_IN + inp_shape = (env.BATCH, env.BLOCK_IN) + assert inp_shape[0] * inp_shape[1] == inp_lanes + out_lanes = env.ACC_ELEM_BITS // env.ACC_WIDTH + assert out_lanes == env.BATCH * env.BLOCK_OUT + out_shape = (env.BATCH, env.BLOCK_OUT) + assert out_shape[0] * out_shape[1] == out_lanes + wgt = tvm.placeholder((wgt_shape[0], wgt_shape[1]), + dtype="int%d" % env.WGT_WIDTH, + name=env.wgt_scope) + inp = tvm.placeholder((inp_shape[0], inp_shape[1]), + dtype="int%d" % env.INP_WIDTH, + name=env.inp_scope) + k = tvm.reduce_axis((0, wgt_shape[1]), name="k") + out_dtype = "int%d" % env.ACC_WIDTH + out = tvm.compute((out_shape[0], out_shape[1]), + lambda i, j: tvm.sum(inp[i, k].astype(out_dtype) * + wgt[j, k].astype(out_dtype), + axis=[k]), + name="out") + wgt_layout = tvm.decl_buffer( + wgt.shape, wgt.dtype, env.wgt_scope, + scope=env.wgt_scope, offset_factor=wgt_lanes, data_alignment=wgt_lanes) + inp_layout = tvm.decl_buffer( + inp.shape, inp.dtype, env.inp_scope, + scope=env.inp_scope, offset_factor=inp_lanes, data_alignment=inp_lanes) + out_layout = tvm.decl_buffer( + out.shape, out.dtype, env.acc_scope, + scope=env.acc_scope, offset_factor=out_lanes, data_alignment=out_lanes) + + return wgt_layout, inp_layout, out_layout + + +def inject_conv2d_transpose_skip(stmt_in): + env = get_env() + dwgt, dinp, dout = _get_gemm_intrin_buffer() + + calls = [] + selects = [] + + def _find_basics(op): + if isinstance(op, tvm.expr.Call): + calls.append(op) + elif isinstance(op, tvm.expr.Select): + selects.append(op) + + def _do_fold(op): + if _match_pragma(op, "conv2d_transpose_gemm"): + is_init = ".init" in str(op) + tvm.ir_pass.PostOrderVisit(op, _find_basics) + + if is_init: + # create inner most block + irb = tvm.ir_builder.create() + dev = env.dev + irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) + irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) + irb.emit(tvm.call_extern("int32", "VTAUopPush", + 0, 1, + dout.access_ptr("rw", "int32"), + 0, 0, + 0, 0, 0)) + inner = irb.get() + args = op.body.body.args + res_tensor = op.body.body.func.output(0) + tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, 16) + inner = tvm.make.AttrStmt( + [dout, res_tensor], 'buffer_bind_scope', + tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + return inner + else: + conv_call, data_call, kernel_call = calls[-3:] + pad_data_tensor, kernel_tensor, res_tensor = (data_call.func.output(0), + kernel_call.func.output(0), conv_call.func.output(0)) + + if selects: + condition = selects[0].condition + else: + condition = tvm.const(1, 'int') + + # create inner most block + irb = tvm.ir_builder.create() + with irb.if_scope(condition): + dev = env.dev + irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) + irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) + irb.emit(tvm.call_extern("int32", "VTAUopPush", + 0, 0, + dout.access_ptr("rw", "int32"), + dinp.access_ptr("r", "int32"), + dwgt.access_ptr("r", "int32"), + 0, 0, 0)) + inner = irb.get() + + args = conv_call.args + tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, 16) + inner = tvm.make.AttrStmt( + [dout, res_tensor], 'buffer_bind_scope', + tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + args = kernel_call.args + tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 16, 0, 16) + inner = tvm.make.AttrStmt( + [dwgt, kernel_tensor], 'buffer_bind_scope', + tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + args = data_call.args + tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, 16) + inner = tvm.make.AttrStmt( + [dinp, pad_data_tensor], 'buffer_bind_scope', + tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + return inner + return None + ret = tvm.ir_pass.IRTransform( + stmt_in, _do_fold, None, ["AttrStmt"]) + return ret + + def inject_coproc_sync(stmt_in): """Pass to inject skip copy stmt, used in debug. diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 6c07c64f27d7..fbe4d04dca7e 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -4,5 +4,7 @@ from . import arm_conv2d from .bitpack import bitpack +from .vta_dense import packed_dense, schedule_packed_dense from .vta_conv2d import packed_conv2d, schedule_packed_conv2d from .vta_group_conv2d import packed_group_conv2d, schedule_packed_group_conv2d +from .vta_conv2d_transpose import packed_conv2d_transpose, schedule_packed_conv2d_transpose diff --git a/vta/python/vta/top/arm_conv2d.py b/vta/python/vta/top/arm_conv2d.py index 012c16b098ed..634348a87cfe 100644 --- a/vta/python/vta/top/arm_conv2d.py +++ b/vta/python/vta/top/arm_conv2d.py @@ -5,7 +5,6 @@ from topi.nn import conv2d, conv2d_alter_layout from topi import generic - @conv2d.register(["vtacpu", "vta"]) def compute(*args, **kwargs): with tvm.target.arm_cpu("vtacpu"): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index d8ea163c794e..91bb8d2ef19f 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -13,7 +13,7 @@ from ..environment import get_env from ..ptr_alias import reinterpret from .vta_group_conv2d import packed_group_conv2d, schedule_packed_group_conv2d - +from .vta_conv2d_transpose import packed_conv2d_transpose, schedule_packed_conv2d_transpose Workload = namedtuple("Conv2DWorkload", ['batch', 'height', 'width', 'in_filter', 'out_filter', @@ -156,6 +156,7 @@ def _get_data_movement_byte(schedule, layer): return [fil_sched[xfer_size.index(min(xfer_size))]] return fil_sched + def packed_conv2d(data, kernel, padding, @@ -308,6 +309,42 @@ def schedule_conv2d(attrs, outs, target): return _nn.schedule_conv2d(attrs, outs, target) +@reg.register_compute("conv2d_transpose", level=15) +def compute_conv2d_transpose(attrs, inputs, out): + """ 2D convolution algorithm. + """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + print(inputs) + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + return packed_conv2d_transpose(inputs[0], inputs[1], + padding, strides, + out_dtype=out_dtype) + return _nn.compute_conv2d_transpose(attrs, inputs, out) + + +@reg.register_schedule("conv2d_transpose", level=15) +def schedule_conv2d_transpose(attrs, outs, target): + """ 2D convolution schedule. + """ + layout = attrs["layout"] + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + return schedule_packed_conv2d_transpose(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + return _nn.schedule_conv2d_transpose(attrs, outs, target) + def _get_workload(data, pad_data, kernel, output): """ Get the workload structure. """ diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py new file mode 100644 index 000000000000..d53d69d02f2c --- /dev/null +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -0,0 +1,217 @@ +import logging +from collections import namedtuple + +import tvm +import topi +from topi.nn.util import get_pad_tuple +from topi.util import get_const_int, get_const_tuple +from tvm.contrib.util import get_lower_ir + +from ..environment import get_env + + +Workload = namedtuple("Conv2DTransposeWorkload", + ('batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride')) + +Schedule = namedtuple("Conv2DTransposeSchedule", + ('b_factor', 'oc_factor', 'ic_factor', 'h_factor', 'w_factor', + 'oc_nthread', 'h_nthread', 'debug_sync')) + + +def find_schedules(layer, vt_only=False, best_only=False): + return [Schedule(1, 1, 1, 2, 4, 1, 1, False)] + + +def packed_conv2d_transpose(data, + kernel, + padding, + strides, + out_dtype="int32"): + batch, in_c, in_h, in_w, B_BATCH, B_CI = get_const_tuple(data.shape) + out_c, _, filter_h, filter_w, B_CO, B_CI = get_const_tuple(kernel.shape) + stride_h, stride_w = strides + + # padding stage + fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w)) + bpad_top = filter_h - 1 - fpad_top + bpad_bottom = filter_h - 1 - fpad_bottom + bpad_left = filter_w - 1 - fpad_left + bpad_right = filter_w - 1 - fpad_right + + # padding stage + FirstPad = topi.nn.pad(data, + [0, 0, (bpad_top + stride_h - 1) // stride_h, + (bpad_left + stride_w - 1) // stride_w, 0, 0], + [0, 0, (bpad_bottom + stride_h - 1) // stride_h, + (bpad_right + stride_w - 1) // stride_w, 0, 0], + name='pad_data') + border_h = (stride_h - bpad_top % stride_h) % stride_h # remove extra padding introduced by dilatation + border_w = (stride_w - bpad_left % stride_w) % stride_w + + # dilation stage + data = FirstPad + strides = [1, 1, stride_h, stride_w, 1, 1] + n = len(data.shape) + + def _dilate(*indices): + not_zero = [] + index_tuple = [] + for i in range(n): + if not topi.util.equal_const_int(strides[i], 1): + index_tuple.append(indices[i] // strides[i]) + not_zero.append((indices[i] % strides[i]).equal(0)) + else: + index_tuple.append(indices[i]) + if not_zero: + not_zero = tvm.all(*not_zero) + return tvm.select(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype)) + return data(*index_tuple) + + # convolution stage + out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h + out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + dc = tvm.reduce_axis((0, in_c), name='dc') + dh = tvm.reduce_axis((0, filter_h), name='dh') + dw = tvm.reduce_axis((0, filter_w), name='dw') + dci = tvm.reduce_axis((0, B_CI), name='dci') + + Output = tvm.compute( + (batch, out_c, out_h, out_w, B_BATCH, B_CO), + lambda b, c, h, w, b_n, b_co: tvm.sum( + _dilate(b, dc, h + dh + border_h, w + dw + border_w, b_n, dci).astype(out_dtype) * + kernel[c, dc, dh, dw, b_co, dci].astype(out_dtype), + axis=[dc, dh, dw, dci]), + tag="packed_conv2d_transpose", + name='res', + attrs={"workload": (n, in_h, in_w, in_c, out_c, filter_h, filter_w, + padding[0], padding[1], stride_h, stride_w)}) + + return Output + +global_plan = None + +def set_global_plan(plan): + global global_plan + global_plan = plan + +def schedule_packed_conv2d_transpose(outs): + assert len(outs) == 1 + output = outs[0] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + assert output.dtype == "int8" + assert output.op.input_tensors[0].dtype == "int32" + # + #return tvm.create_schedule(output.op) + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + assert op.tag == "packed_conv2d_transpose" + conv2d_res.append(op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + + data, kernel = conv2d_stage.op.input_tensors + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + temp = data.op.input_tensors[0] + pad_data = data + data = temp + else: + pad_data = None + + wrkld = Workload(*conv2d_stage.op.attrs['workload']) + plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] + logging.info("Trying to find plan for %s", wrkld) + env = get_env() + + load_inp = load_wgt = load_out = store_out = env.dma_copy + alu = env.alu + gemm = env.gemm + + # schedule1 + s = tvm.create_schedule(output.op) + + # setup pad + if pad_data is not None: + cdata = pad_data + s[pad_data].set_scope(env.inp_scope) + else: + cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) + ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) + s[conv2d_stage].set_scope(env.acc_scope) + # cache read input + cache_read_ewise = [] + + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + # set ewise scope + for op in ewise_ops: + s[op].set_scope(env.acc_scope) + s[op].pragma(s[op].op.axis[0], alu) + + # tile + oc_factor = (plan.oc_factor if plan.oc_factor else 1) + h_factor = (plan.h_factor if plan.h_factor else 1) + w_factor = (plan.w_factor if plan.w_factor else 1) + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + x_co0, x_co1 = s[output].split(x_co, factor=oc_factor) + x_i0, x_i1 = s[output].split(x_i, factor=h_factor) + x_j0, x_j1 = s[output].split(x_j, factor=w_factor) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + + # set all compute scopes + s[conv2d_stage].compute_at(s[output], store_pt) + for op in ewise_ops: + s[op].compute_at(s[output], store_pt) + + for tensor in cache_read_ewise: + s[tensor].compute_at(s[output], store_pt) + s[tensor].pragma(s[tensor].op.axis[0], load_out) + + # virtual threading along output channel axes + if plan.oc_nthread > 1: + _, v_t = s[output].split(x_co0, factor=plan.oc_nthread) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + # virtual threading along spatial rows + if plan.h_nthread > 1: + _, v_t = s[output].split(x_i0, factor=plan.h_nthread) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis + k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis + s[conv2d_stage].reorder(x_bo, k_o, d_j, d_i, x_co, x_i, x_j, x_bi, x_ci, k_i) + + for axis in [d_j, d_i, x_i, x_j]: + s[conv2d_stage].unroll(axis) + + ic_factor = plan.ic_factor or 1 + if ic_factor: + k_o, _ = s[conv2d_stage].split(k_o, factor=ic_factor) + s[cdata].compute_at(s[conv2d_stage], k_o) + s[ckernel].compute_at(s[conv2d_stage], k_o) + + # Use VTA instructions + s[cdata].pragma(s[cdata].op.axis[0], load_inp) + s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt) + s[conv2d_stage].pragma(x_bi, "conv2d_transpose_gemm") + s[output].pragma(x_co1, store_out) + + return s diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py new file mode 100644 index 000000000000..a190e4e979ac --- /dev/null +++ b/vta/python/vta/top/vta_dense.py @@ -0,0 +1,155 @@ +import logging +from collections import namedtuple + +import tvm +import topi +from topi.util import get_const_int, get_const_tuple + +from ..environment import get_env + +Workload = namedtuple("DenseWorkload", + ('batch', 'in_dim', 'out_dim')) + +Schedule = namedtuple("GroupConv2DSchedule", ('factor', )) + + +def find_schedules(layer, vt_only=False, best_only=False): + return [Schedule(0, 0, 1, 0, 0, 0, 0, False)] + + +def packed_dense(data, + weight, + out_dtype="int32"): + """ Packed conv2d function.""" + env = get_env() + + N, IN, B_BATCH, B_CI = get_const_tuple(data.shape) + OUT, IN, B_OUT, B_IN = get_const_tuple(weight.shape) + + oshape = (N, OUT, B_BATCH, B_OUT) + + ko = tvm.reduce_axis((0, IN), name='ko') + ki = tvm.reduce_axis((0, env.BLOCK_IN), name='ki') + + out = tvm.compute( + oshape, + lambda n, o, b_n, b_out: tvm.sum(data[n, ko, b_n, ki].astype(out_dtype) * + weight[o, ko, b_out, ki].astype(out_dtype), + axis=[ko, ki]), + name="res", tag="packed_dense", + attrs={'workload': (N, IN * B_CI, OUT * B_OUT)}) + return out + + +def schedule_packed_dense(outs): + """ Schedule the packed conv2d. + """ + assert len(outs) == 1 + output = outs[0] + return tvm.create_schedule(output.op) + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + assert op.tag == "packed_group_conv2d" + conv2d_res.append(op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + + data, kernel = conv2d_stage.op.input_tensors + if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + temp = data.op.input_tensors[0] + pad_data = data + data = temp + else: + pad_data = None + wrkld = _get_workload(data, pad_data, kernel, output) + plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] + logging.info("Trying to find plan for %s", wrkld) + env = get_env() + + load_inp = load_wgt = load_out = store_out = env.dma_copy + alu = env.alu + gemm = env.gemm + + # schedule1 + oshape = topi.util.get_const_tuple(output.shape) + s = tvm.create_schedule(output.op) + + # setup pad + if pad_data is not None: + cdata = pad_data + s[pad_data].set_scope(env.inp_scope) + else: + cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) + ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) + s[conv2d_stage].set_scope(env.acc_scope) + # cache read input + cache_read_ewise = [] + + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + # set ewise scope + for op in ewise_ops: + s[op].set_scope(env.acc_scope) + s[op].pragma(s[op].op.axis[0], alu) + + # tile + oc_factor = (plan.oc_factor if plan.oc_factor else 1) + h_factor = (plan.h_factor if plan.h_factor else 1) + w_factor = (plan.w_factor if plan.w_factor else 1) + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + x_co0, x_co1 = s[output].split(x_co, factor=oc_factor) + x_i0, x_i1 = s[output].split(x_i, factor=h_factor) + x_j0, x_j1 = s[output].split(x_j, factor=w_factor) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + + # set all compute scopes + s[conv2d_stage].compute_at(s[output], store_pt) + for op in ewise_ops: + s[op].compute_at(s[output], store_pt) + + for tensor in cache_read_ewise: + s[tensor].compute_at(s[output], store_pt) + s[tensor].pragma(s[tensor].op.axis[0], load_out) + + # virtual threading along output channel axes + if plan.oc_nthread > 1: + _, v_t = s[output].split(x_co0, factor=plan.oc_nthread) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + # virtual threading along spatial rows + if plan.h_nthread > 1: + _, v_t = s[output].split(x_i0, factor=plan.h_nthread) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis + k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis + s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) + + if plan.ic_factor: + k_o, _ = s[conv2d_stage].split(k_o, factor=plan.ic_factor) + s[cdata].compute_at(s[conv2d_stage], k_o) + s[ckernel].compute_at(s[conv2d_stage], k_o) + + # Use VTA instructions + s[cdata].pragma(s[cdata].op.axis[0], load_inp) + s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt) + s[conv2d_stage].tensorize(x_bi, gemm) + s[output].pragma(x_co1, store_out) + + return s diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py index e6891233a18d..c883b154f1c8 100644 --- a/vta/python/vta/top/vta_group_conv2d.py +++ b/vta/python/vta/top/vta_group_conv2d.py @@ -3,8 +3,6 @@ import tvm import topi - - from topi.util import get_const_int, get_const_tuple from tvm.contrib.util import get_lower_ir diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py new file mode 100644 index 000000000000..e338dc8fc720 --- /dev/null +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -0,0 +1,151 @@ +"""Testing if we can generate code in topi style""" + +import tvm +from tvm import autotvm +from tvm.contrib import util +from tvm.contrib.pickle_memoize import memoize +import topi +import topi.testing +import vta +import vta.testing +import numpy as np + +Workload = vta.top.vta_conv2d_transpose.Workload +Schedule = vta.top.vta_conv2d_transpose.Schedule + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + +def test_vta_conv2d_transpose(): + def run_vta_conv2d_transpose(env, remote, name, wl, profile=True): + assert wl.batch % env.BATCH == 0 + assert wl.in_filter % env.BLOCK_IN == 0 + assert wl.out_filter % env.BLOCK_OUT == 0 + + data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, + wl.height, wl.width, env.BATCH, env.BLOCK_IN) + kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN, + wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) + bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + + fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + + res_conv = vta.top.packed_conv2d_transpose( + data, kernel, (wl.hpad, wl.wpad), (wl.hstride, wl.wstride)) + res = topi.right_shift(res_conv, 8) + res = topi.add(res, bias) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + # To compute number of ops, use a x2 factor for FMA + num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \ + wl.out_filter * wl.in_filter / (wl.hstride * wl.wstride) + + a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel) + data_dtype = data.dtype + kernel_dtype = kernel.dtype + acc_dtype = env.acc_dtype + stride = (wl.hstride, wl.wstride) + padding = (wl.hpad, wl.wpad) + + @memoize("vta.tests.test_conv2d_transpose") + def get_ref_data(): + a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) + w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) + a_np = np.abs(a_np) + w_np = np.abs(w_np) + b_np = topi.testing.conv2d_transpose_nchw_python( + a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) + return a_np, w_np, b_np + + def verify(s, check_correctness): + mod = vta.build(s, [data, kernel, bias, res], "ext_dev", + env.target_host, name="conv2d_transpose") + temp = util.tempdir() + + mod.save(temp.relpath("conv2d_transpose.o")) + remote.upload(temp.relpath("conv2d_transpose.o")) + f = remote.load_module("conv2d_transpose.o") + # verify + ctx = remote.ext_dev(0) + # Data in original format + data_orig, kernel_orig, res_ref = get_ref_data() + bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") + bias_orig = np.abs(bias_orig) + + data_packed = data_orig.reshape( + wl.batch//env.BATCH, env.BATCH, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) + kernel_packed = kernel_orig.reshape( + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, + wl.hkernel, wl.wkernel).transpose((2, 0, 4, 5, 3, 1)) + kernel_flipped = np.flip(kernel_packed, [2, 3]) + + bias_packed = bias_orig.reshape( + 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) + res_shape = topi.util.get_const_tuple(res.shape) + + res_np = np.zeros(res_shape).astype(res.dtype) + data_arr = tvm.nd.array(data_packed, ctx) + kernel_arr = tvm.nd.array(kernel_flipped, ctx) + bias_arr = tvm.nd.array(bias_packed, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d_transpose", ctx, number=5) + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + res_unpack = res_arr.asnumpy().transpose( + (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) + if check_correctness: + assert wl.hpad == wl.wpad + stride = (wl.hstride, wl.wstride) + padding = (wl.hpad, wl.wpad) + res_ref = res_ref >> 8 + res_ref += bias_orig.reshape(wl.out_filter, 1, 1) + res_ref = np.clip(res_ref, 0, 127).astype("int8") + np.testing.assert_allclose(res_unpack, res_ref) + return cost + + def conv2d_transpose_normal(print_ir): + print("----- Conv2d Transpose End-to-End Test-------") + with vta.build_config(): + s = vta.top.schedule_packed_conv2d_transpose([res]) + if print_ir: + print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) + cost = verify(s, True) + gops = (num_ops / cost.mean) / float(10 ** 9) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) + + conv2d_transpose_normal(False) + + def _run(env, remote): + tasks = [ + # mobilenet + ('DCGAN.CT1', Workload(1, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2)), + ('DCGAN.CT2', Workload(1, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2)), + ('DCGAN.CT3', Workload(1, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2)), + ('DCGAN.CT4', Workload(1, 32, 32, 128, env.BLOCK_IN, 4, 4, 1, 1, 2, 2)), + ] + + for tsk in tasks: + print(tsk) + name, wkl = tsk + run_vta_conv2d_transpose(env, remote, name, wkl) + vta.testing.run(_run) + +if __name__ == "__main__": + test_vta_conv2d_transpose() diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py new file mode 100644 index 000000000000..5f4d8e8e4765 --- /dev/null +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -0,0 +1,129 @@ +"""Testing if we can generate code in topi style""" + +import tvm +from tvm import autotvm +from tvm.contrib import util +from tvm.contrib.pickle_memoize import memoize +import topi +import topi.testing +import vta +import vta.testing +import numpy as np + +Workload = vta.top.vta_dense.Workload + + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + +def test_vta_dense(): + def run_vta_dense(env, remote, name, wl, profile=True): + data_shape = (wl.batch//env.BATCH, wl.in_dim//env.BLOCK_IN, + env.BATCH, env.BLOCK_IN) + weight_shape = (wl.out_dim//env.BLOCK_OUT, wl.in_dim//env.BLOCK_IN, + env.BLOCK_OUT, env.BLOCK_IN) + bias_shape = (wl.batch//env.BATCH, wl.out_dim//env.BLOCK_OUT, + env.BATCH, env.BLOCK_OUT) + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + weight = tvm.placeholder(weight_shape, name="kernel", dtype=env.wgt_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + data_dtype = data.dtype + weight_dtype = weight.dtype + + res = vta.top.packed_dense(data, weight) + res = topi.right_shift(res, 8) + res = topi.add(res, bias) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + # To compute number of ops, use a x2 factor for FMA + num_ops = 2 * wl.batch * wl.in_dim * wl.out_dim + a_shape = (wl.batch, wl.in_dim) + w_shape = (wl.out_dim, wl.in_dim) + acc_dtype = env.acc_dtype + + @memoize("vta.tests.test_dense") + def get_ref_data(): + a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) + w_np = (np.random.uniform(size=w_shape) * 4).astype(weight_dtype) + a_np = np.abs(a_np) + w_np = np.abs(w_np) + b_np = np.dot(a_np.astype(acc_dtype), w_np.astype(acc_dtype).T).astype(acc_dtype) + return a_np, w_np, b_np + + def verify(s, check_correctness): + mod = vta.build(s, [data, weight, bias, res], "ext_dev", + env.target_host, name="dense") + temp = util.tempdir() + + mod.save(temp.relpath("dense.o")) + remote.upload(temp.relpath("dense.o")) + f = remote.load_module("dense.o") + # verify + ctx = remote.ext_dev(0) + # Data in original format + data_orig, id_card_opriginal, res_ref = get_ref_data() + bias_orig = (np.random.uniform(size=(wl.out_dim,)) * 4).astype("int32") + bias_orig = np.ones_like(bias_orig) + + data_packed = data_orig.reshape( + wl.batch//env.BATCH, env.BATCH, + wl.in_dim//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + weight_packed = id_card_opriginal.reshape( + wl.out_dim//env.BLOCK_OUT, env.BLOCK_OUT, + wl.in_dim//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + bias_packed = bias_orig.reshape( + 1, wl.out_dim // env.BLOCK_OUT, 1, env.BLOCK_OUT) + res_shape = topi.util.get_const_tuple(res.shape) + + res_np = np.zeros(res_shape).astype(res.dtype) + data_arr = tvm.nd.array(data_packed, ctx) + weight_arr = tvm.nd.array(weight_packed, ctx) + bias_arr = tvm.nd.array(bias_packed, ctx) + res_arr = tvm.nd.array(res_np, ctx) + + time_f = f.time_evaluator("dense", ctx, number=5) + cost = time_f(data_arr, weight_arr, bias_arr, res_arr) + res_unpack = res_arr.asnumpy().transpose( + (0, 2, 1, 3)).reshape(wl.batch, wl.out_dim) + if check_correctness: + res_ref = res_ref >> 8 + res_ref += bias_orig.reshape(1, wl.out_dim) + res_ref = np.clip(res_ref, 0, 127).astype("int8") + np.testing.assert_allclose(res_unpack, res_ref) + return cost + + def dense_normal(print_ir): + print("----- dense End-to-End Test-------") + with vta.build_config(): + s = vta.top.schedule_packed_dense([res]) + if print_ir: + print(vta.lower(s, [data, weight, bias, res], simple_mode=True)) + cost = verify(s, True) + gops = (num_ops / cost.mean) / float(10 ** 9) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) + + dense_normal(False) + + def _run(env, remote): + tasks = [ + ('dense.DEN1', Workload(1, 1024, 1024)), + ('dense.DEN2', Workload(1, 512, 512)), + ] + + for tsk in tasks: + name, wkl = tsk + run_vta_dense(env, remote, name, wkl) + + vta.testing.run(_run) + +if __name__ == "__main__": + test_vta_dense() diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py similarity index 97% rename from vta/tests/python/integration/test_benchmark_topi_group_conv.py rename to vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 0b16c41350c0..59c6e262f0af 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -68,7 +68,7 @@ def run_vta_group_conv2d(env, remote, name, wl, profile=True): padding = (wl.hpad, wl.wpad) groups = wl.groups - @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") + @memoize("vta.tests.test_group_conv2d") def get_ref_data(): a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) @@ -115,9 +115,6 @@ def verify(s, check_correctness): res_unpack = res_arr.asnumpy().transpose( (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = (wl.hpad, wl.wpad) res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, 127).astype("int8")