# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals, too-many-branches
"""Convolution in python"""
import numpy as np
import scipy.signal


def group_conv2d_nchw_python(a_np, w_np, stride, padding, groups):
    """Grouped convolution operator in NCHW layout (numpy reference).

    Parameters
    ----------
    a_np : numpy.ndarray
        4-D with shape [batch, in_channel, in_height, in_width]

    w_np : numpy.ndarray
        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str or a list/tuple of two ints
        Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width].
        Any string other than 'VALID' is treated as 'SAME', which pads a total
        of (kernel - 1) per spatial axis; when that total is odd the extra
        row/column goes to the top/left.

    groups : int
        Number of channel groups. Must divide num_filter, and
        groups * w_np.shape[1] must equal in_channel.

    Returns
    -------
    b_np : np.ndarray
        4-D float64 array with shape [batch, out_channel, out_height, out_width]
    """
    batch, in_channel, in_height, in_width = a_np.shape
    num_filter, ci_g, kernel_h, kernel_w = w_np.shape

    assert groups > 0, "groups must be positive"
    assert ci_g * groups == in_channel, \
        "in_channel must equal groups * w_np.shape[1]"
    assert num_filter % groups == 0, "num_filter must be divisible by groups"

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    # pad_h / pad_w hold the TOTAL padding along each spatial axis.
    if isinstance(padding, int):
        pad_h = pad_w = padding * 2
    elif isinstance(padding, (list, tuple)):
        pad_h, pad_w = padding[0] * 2, padding[1] * 2
    else:
        pad_h = 0 if padding == 'VALID' else kernel_h - 1
        pad_w = 0 if padding == 'VALID' else kernel_w - 1
    pad_top = int(np.ceil(float(pad_h) / 2))
    pad_left = int(np.ceil(float(pad_w) / 2))

    # compute the output shape
    out_channel = num_filter
    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
    b_np = np.zeros((batch, out_channel, out_height, out_width))

    # Pad the whole input once. The destination window is addressed with
    # explicit end indices: a negative-stop slice such as [pad_top:-pad_bottom]
    # collapses to an empty slice when the trailing pad is 0 (odd total
    # padding, e.g. 'SAME' with an even kernel size).
    if pad_h > 0 or pad_w > 0:
        a_pad = np.zeros((batch, in_channel, in_height + pad_h, in_width + pad_w))
        a_pad[:, :, pad_top:pad_top + in_height, pad_left:pad_left + in_width] = a_np
    else:
        a_pad = a_np

    # group computation: filter f only sees the ci_g input channels of its group
    filters_per_group = out_channel // groups
    for n in range(batch):
        for f in range(out_channel):
            base = f // filters_per_group * ci_g
            for c in range(ci_g):
                # convolve2d flips the kernel; rotating it by 180 degrees first
                # turns the operation into the cross-correlation used by conv2d.
                out = scipy.signal.convolve2d(
                    a_pad[n, base + c], np.rot90(w_np[f, c], 2), mode='valid')
                b_np[n, f] += out[::stride_h, ::stride_w]
    return b_np
# Convolution workload table.
# NOTE(review): `Workload`, `SpatialPack` and `Im2ColPack` are not defined in
# the visible part of this file -- presumably imported from topi; confirm.
# NOTE(review): the positional fields look like (in_dtype, out_dtype, height,
# width, in_filter, out_filter, hkernel, wkernel, hpad, wpad, hstride,
# wstride) -- TODO confirm against the Workload definition.
# The table has three sections of 13, 10 and 10 entries.
_WORKLOADS = [
    # resnet 18
    Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
    Workload('int8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
    Workload('int8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
    Workload('int8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
    Workload('int8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
    Workload('int8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
    Workload('int8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
    Workload('int8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
    Workload('int8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
    Workload('int8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
    Workload('int8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
    Workload('int8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),

    # mobilenet float32
    Workload('float32', 'float32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
    Workload('float32', 'float32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
    Workload('float32', 'float32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),

    # mobilenet int8
    # NOTE(review): the first entry of this section is float32 and duplicates
    # the first "mobilenet float32" entry -- confirm this is intentional.
    Workload('float32', 'float32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
    Workload('int8', 'int32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
    Workload('int8', 'int32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),
]

# Schedule table with the same section sizes (13, 10, 10) as _WORKLOADS.
# NOTE(review): presumably entry i here is the schedule used for entry i of
# _WORKLOADS -- verify against the code that looks these tables up.
_SCHEDULES = [
    # float32 imagenet
    # NOTE(review): this section lines up with the "resnet 18" workloads
    # above, most of which are int8 -- confirm the heading.
    SpatialPack(1, 8, 4, 1, 4, True),
    SpatialPack(1, 8, 4, 1, 4, True),
    SpatialPack(1, 7, 4, 2, 4, True),
    SpatialPack(1, 4, 8, 4, 1, True),
    SpatialPack(1, 4, 4, 1, 16, False),
    SpatialPack(1, 4, 8, 4, 8, False),
    SpatialPack(1, 7, 4, 3, 8, True),
    SpatialPack(1, 2, 8, 1, 8, True),
    SpatialPack(2, 1, 16, 1, 4, True),
    SpatialPack(1, 7, 4, 1, 1, True),
    Im2ColPack(7, 4, 1, 16, True),
    Im2ColPack(7, 4, 1, 8, False),
    Im2ColPack(7, 4, 1, 16, False),

    # float32 mobilenet
    SpatialPack(2, 2, 4, 28, 1, True),
    SpatialPack(1, 4, 8, 14, 1, False),
    SpatialPack(1, 2, 16, 8, 1, True),
    SpatialPack(1, 4, 8, 8, 8, True),
    SpatialPack(2, 2, 8, 1, 1, False),
    SpatialPack(1, 4, 8, 4, 8, False),
    SpatialPack(2, 2, 8, 1, 4, False),
    SpatialPack(2, 2, 8, 1, 8, False),
    Im2ColPack(7, 4, 1, 16, False),
    Im2ColPack(7, 4, 1, 4, True),

    # int8 mobilenet (same values as the float32 mobilenet section)
    SpatialPack(2, 2, 4, 28, 1, True),
    SpatialPack(1, 4, 8, 14, 1, False),
    SpatialPack(1, 2, 16, 8, 1, True),
    SpatialPack(1, 4, 8, 8, 8, True),
    SpatialPack(2, 2, 8, 1, 1, False),
    SpatialPack(1, 4, 8, 4, 8, False),
    SpatialPack(2, 2, 8, 1, 4, False),
    SpatialPack(2, 2, 8, 1, 8, False),
    Im2ColPack(7, 4, 1, 16, False),
    Im2ColPack(7, 4, 1, 4, True),
]
"""Packed group conv2d compute definition and schedule for VTA."""
import logging
from collections import namedtuple

import tvm
import topi


from topi.util import get_const_int, get_const_tuple
from tvm.contrib.util import get_lower_ir

from ..environment import get_env

Workload = namedtuple("GroupConv2DWorkload",
                      ('batch', 'height', 'width', 'in_filter', 'out_filter', 'groups',
                       'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'))

Schedule = namedtuple("GroupConv2DSchedule",
                      ('b_factor', 'oc_factor', 'ic_factor', 'h_factor', 'w_factor',
                       'oc_nthread', 'h_nthread', 'debug_sync'))


def find_schedules(layer, vt_only=False, best_only=False):
    """Return the candidate schedules for a workload.

    Placeholder implementation: all arguments are currently ignored and a
    single fixed plan is returned. Factors of 0 are interpreted by
    schedule_packed_group_conv2d as "do not tile" (factor 1), and thread
    counts of 0 disable virtual threading.

    Parameters
    ----------
    layer : Workload
        The workload to schedule (unused).
    vt_only : bool
        Unused.
    best_only : bool
        Unused.

    Returns
    -------
    plans : list of Schedule
        A one-element list.
    """
    return [Schedule(b_factor=0, oc_factor=0, ic_factor=1, h_factor=0, w_factor=0,
                     oc_nthread=0, h_nthread=0, debug_sync=False)]


def _get_workload(data, pad_data, kernel, output):
    """Reconstruct the Workload tuple from the tensors of a packed group conv2d.

    Parameters
    ----------
    data : tvm.Tensor
        6-D unpadded input, unpacked below as (batch, channel, height, width,
        batch_block, channel_block).
    pad_data : tvm.Tensor or None
        The padded input stage, or None when no padding stage exists.
    kernel : tvm.Tensor
        6-D kernel, unpacked below as (out_channel, in_channel_per_group,
        kernel_h, kernel_w, out_block, in_block).
    output : tvm.Tensor
        6-D output with the same axis order as data.

    Returns
    -------
    wkl : Workload
        Channel counts are scaled by their block sizes; padding and strides
        are inferred from the shapes.
    """
    o_shape = get_const_tuple(output.shape)
    d_shape = get_const_tuple(data.shape)
    k_shape = get_const_tuple(kernel.shape)
    o_b, o_c, o_h, o_w, ob_blk, o_blk = o_shape
    i_b, i_c, i_h, i_w, ib_blk, i_blk = d_shape
    k_o, k_i, k_h, k_w, ko_blk, ki_blk = k_shape
    # For now we need to assume that input channel blocking is the same
    # as the output channel blocking
    assert o_blk == i_blk
    assert ob_blk == ib_blk
    # Make sure that dimensions match
    assert o_b == i_b
    assert o_blk == ko_blk
    assert i_blk == ki_blk
    assert k_o == o_c
    # The group count is implied by the kernel's per-group input channels.
    # Checking divisibility by k_i (not by the derived quotient) guarantees
    # i_c == groups * k_i and that groups is at least 1.
    assert k_i > 0 and i_c >= k_i and i_c % k_i == 0
    groups = i_c // k_i
    assert o_c % groups == 0

    # Scale the channel size
    i_c *= i_blk
    o_c *= o_blk
    if pad_data is not None:
        p_shape = get_const_tuple(pad_data.shape)
        h_pad = (p_shape[2] - d_shape[2]) // 2
        w_pad = (p_shape[3] - d_shape[3]) // 2
    else:
        h_pad, w_pad = 0, 0
    # The stride cannot be inferred from a single output row/column (and the
    # formula would divide by zero), so report 1 in that case.
    h_str = (i_h + h_pad*2 - k_h) // (o_h - 1) if o_h > 1 else 1
    w_str = (i_w + w_pad*2 - k_w) // (o_w - 1) if o_w > 1 else 1
    return Workload(i_b, i_h, i_w, i_c, o_c, groups, k_h, k_w, h_pad, w_pad, h_str, w_str)


def packed_group_conv2d(data,
                        kernel,
                        padding,
                        strides,
                        group,
                        out_dtype="int32"):
    """Packed group conv2d compute definition.

    Parameters
    ----------
    data : tvm.Tensor
        6-D int8 tensor, unpacked below as (N, CI, IH, IW, B_BATCH, B_CI).
    kernel : tvm.Tensor
        6-D int8 tensor, unpacked below as (CO, CI_G, KH, KW, B_CO, B_CI),
        where CI_G is the number of input channel blocks per group.
    padding : tuple of two ints
        (pad_height, pad_width), applied on both sides of each spatial axis.
    strides : tuple of two ints
        (stride_height, stride_width).
    group : int
        Number of groups; group * CI_G must equal CI and CO must be divisible
        by group.
    out_dtype : str
        Accumulation / output dtype.

    Returns
    -------
    out : tvm.Tensor
        6-D tensor of shape (N, CO, OH, OW, B_BATCH, B_CO), tagged
        "packed_group_conv2d".
    """
    assert len(data.shape) == 6
    assert len(kernel.shape) == 6
    assert data.dtype == "int8", data.dtype
    assert kernel.dtype == "int8", kernel.dtype

    N, CI, IH, IW, B_BATCH, B_CI = get_const_tuple(data.shape)
    CO, CI_G, KH, KW, B_CO, K_B_CI = get_const_tuple(kernel.shape)
    # The innermost reduction runs over the input-channel block of both
    # tensors, so the two block sizes have to agree.
    assert K_B_CI == B_CI
    PAD_H, PAD_W = padding
    STR_H, STR_W = strides

    assert group > 0
    assert group * CI_G == CI
    assert CO % group == 0

    # Pad whenever either spatial axis needs it: the output extent below
    # always accounts for both PAD_H and PAD_W.
    if PAD_H or PAD_W:
        pad_data = topi.nn.pad(data, [0, 0, PAD_H, PAD_W, 0, 0], name="pad_data")
    else:
        pad_data = data

    OH = (IH + 2 * PAD_H - KH) // STR_H + 1
    OW = (IW + 2 * PAD_W - KW) // STR_W + 1

    oshape = (N, CO, OH, OW, B_BATCH, B_CO)

    kh = tvm.reduce_axis((0, KH), name='d_i')
    kw = tvm.reduce_axis((0, KW), name='d_j')
    ci_o = tvm.reduce_axis((0, CI_G), name='k_o')
    ci_i = tvm.reduce_axis((0, B_CI), name='k_ten')

    # Output channel block `co` belongs to group co // (CO // group); that
    # group's input channel blocks start at group_index * CI_G.
    out = tvm.compute(
        oshape,
        lambda n, co, h, w, b_n, b_co: tvm.sum(
            pad_data[n, co // (CO // group) * CI_G + ci_o, h * STR_H + kh,
                     w * STR_W + kw, b_n, ci_i].astype(out_dtype) *
            kernel[co, ci_o, kh, kw, b_co, ci_i].astype(out_dtype),
            axis=[ci_o, kh, kw, ci_i]),
        name="res", tag="packed_group_conv2d")
    return out


def schedule_packed_group_conv2d(outs):
    """Schedule the packed group conv2d.

    Parameters
    ----------
    outs : list of tvm.Tensor
        Exactly one int8 output tensor whose producer chain consists of
        broadcast/elementwise ops ending in a single op tagged
        "packed_group_conv2d".

    Returns
    -------
    s : tvm.Schedule
        The schedule for the output op.
    """
    assert len(outs) == 1
    output = outs[0]
    ewise_inputs = []
    ewise_ops = []
    conv2d_res = []
    assert output.dtype == "int8"
    assert output.op.input_tensors[0].dtype == "int32"

    def _traverse(op):
        """Walk producers, collecting elementwise ops, their placeholder
        inputs, and the conv2d op itself."""
        if topi.tag.is_broadcast(op.tag):
            if not op.same_as(output.op):
                ewise_ops.append(op)
            for tensor in op.input_tensors:
                if isinstance(tensor.op, tvm.tensor.PlaceholderOp):
                    ewise_inputs.append((op, tensor))
                else:
                    _traverse(tensor.op)
        else:
            assert op.tag == "packed_group_conv2d"
            conv2d_res.append(op)

    _traverse(output.op)
    assert len(conv2d_res) == 1
    conv2d_stage = conv2d_res[0].output(0)

    # Separate the padding stage (if any) from the original input tensor.
    data, kernel = conv2d_stage.op.input_tensors
    if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
        temp = data.op.input_tensors[0]
        pad_data = data
        data = temp
    else:
        pad_data = None
    wrkld = _get_workload(data, pad_data, kernel, output)
    logging.info("Trying to find plan for %s", wrkld)
    plan = find_schedules(wrkld, vt_only=True, best_only=True)[0]
    env = get_env()

    load_inp = load_wgt = load_out = store_out = env.dma_copy
    alu = env.alu
    gemm = env.gemm

    # schedule1
    s = tvm.create_schedule(output.op)

    # setup pad
    if pad_data is not None:
        cdata = pad_data
        s[pad_data].set_scope(env.inp_scope)
    else:
        cdata = s.cache_read(data, env.inp_scope, [conv2d_stage])
    ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage])
    s[conv2d_stage].set_scope(env.acc_scope)
    # cache read input
    cache_read_ewise = []

    for consumer, tensor in ewise_inputs:
        cache_read_ewise.append(
            s.cache_read(tensor, env.acc_scope, [consumer]))
    # set ewise scope
    for op in ewise_ops:
        s[op].set_scope(env.acc_scope)
        s[op].pragma(s[op].op.axis[0], alu)

    # tile (a plan factor of 0 means "no tiling", i.e. factor 1)
    oc_factor = (plan.oc_factor if plan.oc_factor else 1)
    h_factor = (plan.h_factor if plan.h_factor else 1)
    w_factor = (plan.w_factor if plan.w_factor else 1)

    x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis
    x_co0, x_co1 = s[output].split(x_co, factor=oc_factor)
    x_i0, x_i1 = s[output].split(x_i, factor=h_factor)
    x_j0, x_j1 = s[output].split(x_j, factor=w_factor)
    s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
    store_pt = x_j0

    # set all compute scopes
    s[conv2d_stage].compute_at(s[output], store_pt)
    for op in ewise_ops:
        s[op].compute_at(s[output], store_pt)

    for tensor in cache_read_ewise:
        s[tensor].compute_at(s[output], store_pt)
        s[tensor].pragma(s[tensor].op.axis[0], load_out)

    # virtual threading along output channel axes
    if plan.oc_nthread > 1:
        _, v_t = s[output].split(x_co0, factor=plan.oc_nthread)
        s[output].reorder(v_t, x_bo)
        s[output].bind(v_t, tvm.thread_axis("cthread"))

    # virtual threading along spatial rows
    if plan.h_nthread > 1:
        _, v_t = s[output].split(x_i0, factor=plan.h_nthread)
        s[output].reorder(v_t, x_bo)
        s[output].bind(v_t, tvm.thread_axis("cthread"))

    # Reduce axes come back in the order given to tvm.sum in
    # packed_group_conv2d: (ci_o, kh, kw, ci_i).
    x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis
    k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis
    s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i)

    if plan.ic_factor:
        k_o, _ = s[conv2d_stage].split(k_o, factor=plan.ic_factor)
        s[cdata].compute_at(s[conv2d_stage], k_o)
        s[ckernel].compute_at(s[conv2d_stage], k_o)

    # Use VTA instructions
    s[cdata].pragma(s[cdata].op.axis[0], load_inp)
    s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt)
    s[conv2d_stage].tensorize(x_bi, gemm)
    s[output].pragma(x_co1, store_out)

    return s
"""Testing if we can generate code in topi style"""

import tvm
from tvm import autotvm
from tvm.contrib import util
from tvm.contrib.pickle_memoize import memoize
import topi
import topi.testing
import vta
import vta.testing
import numpy as np

Workload = vta.top.vta_group_conv2d.Workload


@tvm.tag_scope(tag=topi.tag.ELEMWISE)
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages.

    The first stage ("clipA") applies the upper bound a_max, the second
    stage ("clipB") applies the lower bound a_min. Both constants use
    x's dtype.
    """
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
    x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
    return x


def test_vta_group_conv2d():
    """Build, run and check packed group conv2d on a list of workloads."""
    def run_vta_group_conv2d(env, remote, name, wl, profile=True):
        """Run one workload `wl` end to end on `remote`.

        `name` and `profile` are accepted but not used in this function.
        """
        # The workload must split evenly into groups and into the
        # BATCH / BLOCK_IN / BLOCK_OUT block sizes read from `env`.
        assert wl.in_filter % wl.groups == 0
        assert wl.out_filter % wl.groups == 0
        assert wl.in_filter % (wl.groups * env.BLOCK_IN) == 0
        assert wl.batch % env.BATCH == 0
        assert wl.in_filter % env.BLOCK_IN == 0
        assert wl.out_filter % env.BLOCK_OUT == 0

        batch_size = wl.batch
        # input channels seen by each group
        CI_G = wl.in_filter // wl.groups

        # Packed 6-D shapes: (outer batch/channel, outer channel, H, W,
        # inner batch/channel block, inner channel block).
        data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter//env.BLOCK_OUT, CI_G//env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (batch_size//env.BATCH, wl.out_filter//env.BLOCK_OUT,
                      1, 1, env.BATCH, env.BLOCK_OUT)

        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
        data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
        bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

        # conv -> shift right by 8 -> add bias -> clip to [0, 127] -> int8
        res_conv = vta.top.packed_group_conv2d(
            data, kernel, (wl.hpad, wl.wpad), (wl.hstride, wl.wstride), wl.groups)
        res = topi.right_shift(res_conv, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

        # To compute number of ops, use a x2 factor for FMA
        num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * \
            wl.out_filter * wl.in_filter // wl.groups

        # Unpacked NCHW shapes used for the numpy reference.
        a_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
        data_dtype = data.dtype
        kernel_dtype = kernel.dtype
        acc_dtype = env.acc_dtype
        stride = (wl.hstride, wl.wstride)
        padding = (wl.hpad, wl.wpad)
        groups = wl.groups

        # NOTE(review): the memoize key below names the conv2d/nhwc test,
        # not this group conv2d test -- looks copy-pasted; confirm it is
        # meant to be shared.
        @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
        def get_ref_data():
            """Generate random non-negative inputs (values scaled by 4 before
            the cast) and the numpy reference convolution result."""
            a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype)
            w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype)
            a_np = np.abs(a_np)
            w_np = np.abs(w_np)
            b_np = topi.testing.group_conv2d_nchw_python(
                a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding, groups).astype(acc_dtype)
            return a_np, w_np, b_np

        def verify(s, check_correctness):
            """Build schedule `s`, upload and run it on the remote, and
            optionally compare against the numpy reference.

            Returns the result of the time evaluator call.
            """
            mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
                            env.target_host, name="group_conv2d")
            temp = util.tempdir()

            mod.save(temp.relpath("group_conv2d.o"))
            remote.upload(temp.relpath("group_conv2d.o"))
            f = remote.load_module("group_conv2d.o")
            # verify
            ctx = remote.ext_dev(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32")
            bias_orig = np.abs(bias_orig)

            # NCHW -> (N/B, B, C/Bc, Bc, H, W) -> (N/B, C/Bc, H, W, B, Bc)
            data_packed = data_orig.reshape(
                batch_size//env.BATCH, env.BATCH,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
            # Same packing for the kernel, with per-group input channels.
            kernel_packed = kernel_orig.reshape(
                wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT,
                wl.in_filter//wl.groups//env.BLOCK_IN, env.BLOCK_IN,
                wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
            # NOTE(review): bias_orig has out_filter elements, so this reshape
            # only works when env.BATCH == 1, and the leading dimension is
            # hard-coded to 1 while bias_shape uses batch_size//env.BATCH --
            # confirm larger batches are not expected here.
            bias_packed = bias_orig.reshape(
                1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)
            res_shape = topi.util.get_const_tuple(res.shape)

            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, ctx)
            kernel_arr = tvm.nd.array(kernel_packed, ctx)
            bias_arr = tvm.nd.array(bias_packed, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("group_conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            # Undo the packing: (N/B, C/Bc, H, W, B, Bc) -> NCHW
            res_unpack = res_arr.asnumpy().transpose(
                (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width)
            if check_correctness:
                assert wl.hpad == wl.wpad
                # `stride` and `padding` are assigned here but not used
                # afterwards within verify.
                stride = (wl.hstride, wl.wstride)
                padding = (wl.hpad, wl.wpad)
                # Mirror the compute graph: shift, add bias, clip, cast.
                res_ref = res_ref >> 8
                res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                np.testing.assert_allclose(res_unpack, res_ref)
            return cost

        def group_conv_normal(print_ir):
            """Schedule, optionally print the lowered IR, verify and report
            the measured throughput."""
            print("----- Group conv2d End-to-End Test-------")
            with vta.build_config():
                s = vta.top.schedule_packed_group_conv2d([res])
                if print_ir:
                    print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
            cost = verify(s, True)
            gops = (num_ops / cost.mean) / float(10 ** 9)
            print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

        group_conv_normal(False)

    def _run(env, remote):
        """Run every workload in the task list."""
        # Workload fields: batch, height, width, in_filter, out_filter, groups,
        # hkernel, wkernel, hpad, wpad, hstride, wstride.
        tasks = [
            # mobilenet
            ('mobilenet.D1', Workload(1, 112, 112, 32, 32, 2, 3, 3, 1, 1, 1, 1)),
            ('mobilenet.D2', Workload(1, 112, 112, 64, 64, 4, 3, 3, 1, 1, 2, 2)),
            ('mobilenet.D3', Workload(1, 56, 56, 64, 64, 4, 3, 3, 1, 1, 1, 1)),
            ('mobilenet.D4', Workload(1, 56, 56, 128, 128, 8, 3, 3, 1, 1, 2, 2)),
            ('mobilenet.D5', Workload(1, 28, 28, 256, 256, 8, 3, 3, 1, 1, 1, 1)),
            ('mobilenet.D6', Workload(1, 28, 28, 256, 256, 16, 3, 3, 1, 1, 2, 2)),
            ('mobilenet.D7', Workload(1, 14, 14, 256, 256, 16, 3, 3, 1, 1, 1, 1)),
            ('mobilenet.D8', Workload(1, 14, 14, 256, 256, 16, 3, 3, 1, 1, 2, 2)),
            ('mobilenet.D9', Workload(1, 7, 7, 1024, 1024, 64, 3, 3, 1, 1, 1, 1)),
        ]

        for tsk in tasks:
            print(tsk)
            name, wkl = tsk
            run_vta_group_conv2d(env, remote, name, wkl)

    vta.testing.run(_run)

if __name__ == "__main__":
    test_vta_group_conv2d()