From 6050527a078a4c4b35b82120c2b2b1deaf848b23 Mon Sep 17 00:00:00 2001
From: Leyuan Wang
Date: Fri, 6 Sep 2019 17:01:29 -0700
Subject: [PATCH] [TOPI] Intel graphics conv2d autotvm template added (#3839)

* update lint

* lint fixed

* lint updated

* lint fixed

* lint fixed

* lint fixed

* updates

* add intel graphics as a package

* remove print info

* depthwise conv2d schedule added for intel graphics

* asdf

* fix lint

* fix lint

* fix ci

* add channels
---
 python/tvm/autotvm/tophub.py                  |  15 +-
 .../nnvm_to_relay/test_alter_conv2d.py        |   2 +
 topi/python/topi/intel_graphics/conv2d.py     | 481 ++++++++++++------
 .../topi/intel_graphics/depthwise_conv2d.py   | 342 +++++++++++++
 4 files changed, 677 insertions(+), 163 deletions(-)
 create mode 100644 topi/python/topi/intel_graphics/depthwise_conv2d.py

diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 92d1be2c8f20e..aa49beaecf53a 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -36,15 +36,16 @@
 
 # the version of each package
 PACKAGE_VERSION = {
-    'arm_cpu': "v0.04",
-    'llvm': "v0.03",
+    'arm_cpu':          "v0.04",
+    'llvm':             "v0.03",
 
-    'cuda': "v0.05",
-    'rocm': "v0.03",
-    'opencl': "v0.03",
-    'mali': "v0.05",
+    'cuda':             "v0.05",
+    'rocm':             "v0.03",
+    'opencl':           "v0.03",
+    'mali':             "v0.05",
+    'intel_graphics':   "v0.01",
 
-    'vta': "v0.06",
+    'vta':              "v0.06",
 }
 
 logger = logging.getLogger('autotvm')
diff --git a/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py b/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py
index b7d21912e44ea..ed8b9cd9ed977 100644
--- a/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py
+++ b/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py
@@ -69,6 +69,7 @@ def convnet():
     targets=['cuda',
              'opencl -device=mali',
              'opencl -device=intel_graphics',
+             'llvm -device=arm_cpu',
              'llvm -device=core-avx2']
 
@@ -83,5 +84,6 @@ def convnet():
     assert not relay.analysis.alpha_equal(N, O)
 
 if __name__ == "__main__":
+    import numpy as np
     np.random.seed(42)
     test_alter_layout_conv2d()
diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index 8fcde2513705e..63cae4c7da8db 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -21,15 +21,116 @@
 
 import tvm
 
+from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, conv2d_infer_layout
+from ..nn.util import get_pad_tuple
+from ..nn.depthwise_conv2d import depthwise_conv2d_nchw
+from ..nn import pad
+from .. import tag
 from .. import generic
 from .. import util
-from .. import tag
-from ..nn import pad
-from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, _get_workload
-from ..nn.util import get_pad_tuple
-from ..util import simplify
+from ..util import simplify, get_const_tuple
 
 
+def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
+    if is_depthwise:
+        raise RuntimeError("Depthwise not supported for intel graphics.")
+    else:
+        batch_size, in_channel, height, width = get_const_tuple(data.shape)
+        out_channel, _, hkernel, _ = get_const_tuple(kernel.shape)
+        HSTR, _ = strides
+
+        ic_bn = 1
+        oc_bn, oc_bn_upper = 16, 16
+        for i in range(oc_bn_upper, 0, -1):
+            if out_channel % i == 0:
+                oc_bn = i
+                break
+
+        if HSTR == 2:
+            if out_channel + hkernel == 515:
+                block_oh = 4
+                block_ow = 4
+            else:
+                block_oh = 4
+                block_ow = 5
+        elif hkernel == 3:
+            if out_channel == 512:
+                block_oh = 2
+                block_ow = 7
+            else:
+                block_oh = 2
+                block_ow = 14
+        else:
+            block_oh = 1
+            block_ow = 16
+        cfg["tile_ic"] = SplitEntity([in_channel // ic_bn, ic_bn])
+        cfg["tile_oc"] = SplitEntity([out_channel // oc_bn, oc_bn])
+        cfg["block_oh"] = OtherOptionEntity(block_oh)
+        cfg["block_ow"] = OtherOptionEntity(block_ow)
+
+
+def _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout):
+    """Create schedule configuration from input arguments"""
+    dshape = get_const_tuple(data.shape)
+    kshape = get_const_tuple(kernel.shape)
+    if layout == 'NCHW':
+        n, ic, h, w = dshape
+        oc, _, kh, kw = kshape
+    else:
+        raise ValueError("Unsupported layout {} for the "
+                         "schedule template.".format(layout))
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    oh = (h - kh + 2 * ph) // sh + 1
+    ow = (w - kw + 2 * pw) // sw + 1
+    ic_bn_upper = 32
+    oc_bn_upper = 64
+    oc_bn_lower = min(oc, 8)
+    ic_bn_candidates, oc_bn_candidates = [], []
+    for i in range(1, ic + 1):
+        if ic % i == 0 and i <= ic_bn_upper:
+            ic_bn_candidates.append(i)
+    if not ic_bn_candidates:
+        ic_bn_candidates.append(1)
+        ic_bn_candidates.append(ic)
+
+    for i in range(1, oc + 1):
+        if oc % i == 0 and oc_bn_lower <= i <= oc_bn_upper:
+            oc_bn_candidates.append(i)
+    if not oc_bn_candidates:
+        oc_bn_candidates.append(1)
+        oc_bn_candidates.append(oc)
+
+    blk_candidates_low_limits = 5
+    blk_oh_list, blk_ow_list = [], []
+    for i, j in zip(range(oh, 0, -1), range(ow, 0, -1)):
+        if i <= 16 and oh % i == 0:
+            blk_oh_list.append(i)
+        if j <= 16 and ow % j == 0:
+            blk_ow_list.append(j)
+
+    if len(blk_oh_list) < blk_candidates_low_limits:
+        for i in range(2, oh):
+            if i not in blk_oh_list:
+                blk_oh_list.append(i)
+            if len(blk_oh_list) >= blk_candidates_low_limits:
+                break
+
+    if len(blk_ow_list) < blk_candidates_low_limits:
+        for i in range(min(ow - 1, 16), 1, -1):
+            if i not in blk_ow_list:
+                blk_ow_list.append(i)
+            if len(blk_ow_list) >= blk_candidates_low_limits:
+                break
+
+    # Create schedule config
+    cfg.define_knob("tile_ic", ic_bn_candidates)
+    cfg.define_knob("tile_oc", oc_bn_candidates)
+    cfg.define_knob("block_oh", blk_oh_list)
+    cfg.define_knob("block_ow", blk_ow_list)
+
 
 ##### SCHEDULE UTILITIES #####
 def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
@@ -53,40 +154,82 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None
     return xi, thread_z, thread_y, thread_x
 
 @conv2d_alter_layout.register(["intel_graphics"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, F):
+def _alter_conv2d_layout(attrs, inputs, tinfo, F):
+    import nnvm.symbol as sym
copy_inputs = [s for s in inputs] - - data = tinfos[0] - kernel = tinfos[1] - - import ast - padding = ast.literal_eval(str(attrs['padding'])) - stride = ast.literal_eval(str(attrs['strides'])) - - wkl = _get_workload(data, kernel, stride, padding, data.dtype) - oc_bn = 1 - kernel_shape = util.get_const_tuple(kernel.shape) - for oc_bn in range(16, 1, -1): - if kernel_shape[0] % oc_bn == 0: - break - - new_attrs = {k: attrs[k] for k in attrs.keys()} - new_attrs["kernel_layout"] = 'OIHW%do' % (oc_bn) + new_attrs = {k : attrs[k] for k in attrs.keys()} if F.__name__ == 'tvm.relay.op': # Derive channels for frontends (e.g ONNX) that miss "channel" field. new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - if F.__name__ == 'nnvm.symbol': - out = F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) - else: - out = F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) - - return out - -@conv2d_NCHWc.register(["intel_graphics"]) -def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'): + data, kernel = tinfo[0], tinfo[1] + batch_size, in_channel, height, width = get_const_tuple(data.shape) + + groups = attrs.get_int("groups") + out_channel = attrs.get_int("channels") + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + out_dtype = attrs["out_dtype"] + + layout_name = 'layout' if F == sym else 'data_layout' + layout = attrs[layout_name] + kh, kw = attrs.get_int_tuple("kernel_size") + + dtype = data.dtype + out_dtype = dtype if out_dtype in ("same", "") else out_dtype + is_depthwise = groups == in_channel and groups == out_channel + + # only optimize for NCHW + if layout != 'NCHW': + return None + if groups != 1 and not is_depthwise: + return None + + dispatch_ctx = autotvm.task.DispatchContext.current + target = tvm.target.current_target() + + # query schedule and fallback if necessary + workload = autotvm.task.args_to_workload( + [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \ + if is_depthwise else \ + autotvm.task.args_to_workload( + [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) + if is_depthwise: + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: + _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise) + + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + new_attrs[layout_name] = 'NCHW%dc' % ic_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data.dtype) + + out_channel, _, kh, kw = get_const_tuple(kernel.shape) + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + + # Store altered operator's config + new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], + new_attrs['out_layout'], out_dtype], conv2d_NCHWc) + + dispatch_ctx.update(target, new_workload, cfg) + if F == sym: + return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) + +@autotvm.register_topi_compute(conv2d_NCHWc, 'intel_graphics', 'direct') +def _decl_conv2d(cfg, data, kernel, strides, padding, dilation, + layout, out_layout, out_dtype='float32'): """Conv2D 
operator for Intel Graphics backend. Parameters @@ -111,21 +254,39 @@ def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, ou output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" - assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - - out_dtype = data.dtype - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - kernel_shape = util.get_const_tuple(kernel.shape) - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - - return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype) - -@generic.schedule_conv2d_NCHWc.register(["intel_graphics"]) -def schedule_conv2d_NCHWc(outs): + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + assert (dh, dw) == (1, 1), "Does not support dilation" + + n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) + in_channel = ic_chunk * ic_bn + num_filter = oc_chunk * oc_bn + if cfg.is_fallback: + _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), + strides, padding, out_dtype) + + return _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@conv2d_infer_layout.register("intel_graphics") +def _conv2d_infer_layout(workload, cfg): + _, data, kernel, strides, padding, dilation, layout, dtype = workload + batch_size, in_channel, in_height, in_width = data[:-1] + out_channel, _, k_height, k_width = kernel[:-1] + out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 + out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 + tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) + in_layout = "NCHW%dc" % tile_ic + out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc) + out_layout = "NCHW%dc" % tile_oc + return ((in_shape, in_layout),), ((out_shape, out_layout),) + + +@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'intel_graphics', ['direct']) +def _schedule_conv2d_NCHWc(cfg, outs): """Schedule for conv2d_nchw for Intel Graphics Parameters @@ -145,14 +306,14 @@ def schedule_conv2d_NCHWc(outs): def traverse(op): """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): + if tag.is_injective(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) - if 'conv2d' in op.tag: - _schedule_cl_spatialpack_NCHWc(s, op) + if "conv" in op.tag: + _schedule_cl_spatialpack_NCHWc(cfg, s, op) scheduled_ops.append(op) @@ -160,97 +321,101 @@ def traverse(op): return s -def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16'): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w, nv = [util.get_const_int(x) for x in kernel.shape] - num_filter = num_filter * nv +def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype='float16'): + 
batch, in_channel, in_height, in_width, vc = [util.get_const_int(x) for x in data.shape] + in_channel *= vc + num_filter, channel, kernel_h, kernel_w, ci, co = [util.get_const_int(x) for x in kernel.shape] + num_filter *= co pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - if isinstance(stride, (tuple, list)): - stride_h, stride_w = stride + ic_bn = vc + assert vc == ci + + if isinstance(strides, (tuple, list)): + stride_h, stride_w = strides else: - stride_h, stride_w = stride, stride + stride_h, stride_w = strides, strides out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel, out_height, out_width) + oshape = (batch, out_channel // co, out_height, out_width, co) rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') - block_w = 1 - block_h = 1 - if stride_h == 2: - if num_filter + kernel_h == 515: - block_h = 4 - block_w = 4 - else: - block_h = 4 - block_w = 5 - elif kernel_h == 3: - if num_filter == 512: - block_h = 2 - block_w = 7 - else: - block_h = 2 - block_w = 14 - elif kernel_h == 7 and padding == 3 and stride == 1: - block_h = 3 - block_w = 4 - else: - block_h = 1 - block_w = 16 + block_h = cfg["block_oh"].val + block_w = cfg["block_ow"].val - attrs = {'block_h': block_h, 'block_w' : block_w} c_h = out_height c_w = out_width - if not out_height % block_h == 0: + if out_height % block_h != 0: c_h = (out_height // block_h + 1) * block_h - if not out_width % block_w == 0: + if out_width % block_w != 0: c_w = (out_width // block_w + 1) * block_w - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w] - temp = pad(data, pad_before, pad_after, name="pad_temp") + cshape = (batch, out_channel // co, c_h, c_w, co) - cshape = (batch, out_channel // nv, c_h, c_w, nv) + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down + c_h - out_height, pad_right + \ + c_w - out_width, 0] + DOPAD = (pad_top != 0 or pad_left != 0 or pad_down + c_h - out_height != 0 \ + or pad_right + c_w - out_width != 0) + DOUNPACK = (c_h - out_height != 0 or c_w - out_width != 0) + if DOPAD: + temp = pad(data, pad_before, pad_after, name="pad_temp") + else: + temp = data conv = tvm.compute( cshape, - lambda nn, ff, yy, xx, vc:\ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), name='conv', attrs=attrs) + lambda nn, ff, yy, xx, ff_v: \ + tvm.sum( + temp[nn, rc//ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc%ic_bn]. 
\ + astype(out_dtype) * + kernel[ff, rc//ic_bn, ry, rx, rc%ic_bn, ff_v].astype(out_dtype), + axis=[rc, ry, rx]), tag="conv", name='conv') + + if DOUNPACK: + output = tvm.compute( + oshape, + lambda nn, ff, yy, xx, ff_v: + conv[nn][ff][yy][xx][ff_v], + name='output_unpack', tag="conv_unpack") + else: + output = conv - output = tvm.compute( - oshape, - lambda nn, ff, yy, xx: - conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag='conv2d') return output -def _schedule_cl_spatialpack_NCHWc(s, op): - output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] +def _schedule_cl_spatialpack_NCHWc(cfg, s, op): + output = op.output(0) conv = op.input_tensors[0] - temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - + if conv.op.name == "conv": + temp = s[conv].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + SCHEDULE_OUTPUT = True + else: + temp = op.input_tensors[0] + kernel = op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [output]) + conv_L = s.cache_write(output, "local") + if output.op in s.outputs: + conv = output + else: + s[output].compute_inline() + conv = s.outputs[0] + SCHEDULE_OUTPUT = False kernel_L = s.cache_read(kernel, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] - attrs = s[conv].op.attrs - OUTPUT_BLOCK_HEIGHT = attrs['block_h'] - OUTPUT_BLOCK_WIDTH = attrs['block_w'] + OUTPUT_BLOCK_HEIGHT = cfg["block_oh"].val + OUTPUT_BLOCK_WIDTH = cfg["block_ow"].val # schedule conv z_factor = 1 @@ -286,12 +451,13 @@ def _schedule_cl_spatialpack_NCHWc(s, op): s[conv_L].unroll(rx) # schedule temp - _, ci, h, w = s[temp].op.axis - tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) + if temp.op.name == "pad_temp": + _, ci, h, w, vci = s[temp].op.axis + tile_and_bind3d(s, temp, ci, h, w, 1, 16, 16) # schedule temp_W - _, ci, h, w = s[temp_W].op.axis - zo, zi = s[temp_W].split(ci, 1) + _, ci, h, w, vci = s[temp_W].op.axis + zo, zi = s[temp_W].split(vci, 1) yo, yi = s[temp_W].split(h, 1) xo, xi = s[temp_W].split(w, 16) s[temp_W].reorder(zo, yo, xo, zi, yi, xi) @@ -300,27 +466,37 @@ def _schedule_cl_spatialpack_NCHWc(s, op): s[temp_W].bind(xi, thread_x) s[temp_W].storage_align(s[temp_W].op.axis[2], 16, 0) - #schedule kernel - # schedule kernel_L - if "2_14" in s[conv].op.tag: + if OUTPUT_BLOCK_HEIGHT == 2 and OUTPUT_BLOCK_WIDTH == 14: s[kernel_L].compute_at(s[conv_L], ry) else: s[kernel_L].compute_at(s[conv_L], rx) # schedule output - if output.op in s.outputs: - out = output - else: - s[output].compute_inline() - out = s.outputs[0] - - _, co, h, w = s[out].op.axis - tile_and_bind3d(s, out, w, h, co, 4, 8, 8) - - -@conv2d.register(["intel_graphics"]) -def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'): + if SCHEDULE_OUTPUT: + if output.op in s.outputs: + out = output + else: + s[output].compute_inline() + out = s.outputs[0] + + _, co, h, w, vc = s[out].op.axis + tile_and_bind3d(s, out, w, h, vc, 4, 8, 8) + + +def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype): + """convert argument to workload""" + if len(kernel.shape) == 4: + raw_kernel = kernel + else: # the input kernel is transformed by alter_op_layout + shape = get_const_tuple(kernel.shape) + raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]), + 
dtype=kernel.dtype)
+    return ('conv2d', ) + autotvm.task.args_to_workload(
+        [data, raw_kernel, strides, padding, layout, out_dtype])
+
+@autotvm.register_topi_compute(conv2d, 'intel_graphics', 'direct')
+def decl_conv2d(cfg, data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -344,18 +520,10 @@ def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtyp
     assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu"
     assert data.dtype == kernel.dtype, "Do not support inputs with different data types now."
 
-    out_dtype = data.dtype
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    kernel_shape = util.get_const_tuple(kernel.shape)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-
-    return _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype)
+    return _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype)
 
-@generic.schedule_conv2d_nchw.register(["intel_graphics"])
-def schedule_conv2d_nchw(outs):
+@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'intel_graphics', ['direct'])
+def schedule_conv2d_nchw(cfg, outs):
     """Schedule for conv2d_nchw for Intel Graphics
 
     Parameters
@@ -378,17 +546,17 @@ def traverse(op):
             if op not in s.outputs:
                 s[op].compute_inline()
             for tensor in op.input_tensors:
-                if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
+                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
         if 'conv2d' in op.tag:
-            _schedule_cl_spatialpack(s, op)
+            _schedule_cl_spatialpack(cfg, s, op)
 
         scheduled_ops.append(op)
 
     traverse(outs[0].op)
     return s
 
-def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'):
+def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype='float16'):
     batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape]
     num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape]
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel)
@@ -429,23 +597,23 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     else:
         block_h = 1
         block_w = 16
     attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
-    if not out_width % block_w == 0:
-        c_w = (out_width // block_w + 1) * block_w
-
-    if not out_height % block_h == 0:
+    if out_height % block_h != 0:
         c_h = (out_height // block_h + 1) * block_h
 
+    if out_width % block_w != 0:
+        c_w = (out_width // block_w + 1) * block_w
+
     pad_before = [0, 0, pad_top, pad_left]
     pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
     temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     nv = 16
-    if not num_filter % nv == 0:
+    if num_filter % nv != 0:
         num_filter = (num_filter // nv + 1) * nv
 
     out_channel = num_filter
@@ -459,21 +627,23 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     conv = tvm.compute(
         cshape,
-        lambda nn, ff, yy, xx, vc:\
-            tvm.sum(
-                temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
-                kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-                axis=[rc, ry, rx]), name='conv', attrs=attrs)
+        lambda nn, ff, yy, xx, vc: \
+            tvm.sum(
+                temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
+                kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
+                axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = 
tvm.compute( oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag='conv2d') + name='output_unpack', tag='conv2d', + attrs={'workload': conv_arg_to_workload(data, kernel, stride, padding, + layout, out_dtype)}) return output -def _schedule_cl_spatialpack(s, op): +def _schedule_cl_spatialpack(cfg, s, op): output = op.output(0) _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py new file mode 100644 index 0000000000000..424cb3c5f925c --- /dev/null +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -0,0 +1,342 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Schedule for depthwise_conv2d with auto fusion""" +import tvm +from tvm import autotvm +from ..util import traverse_inline +from .. import tag +from .. import generic, nn +from ..nn.depthwise_conv2d import depthwise_conv2d_infer_layout + +# register original implementation of depthwise_conv2d_nchw since we don't need to change this part +autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['intel_graphics'], 'direct', + nn.depthwise_conv2d_nchw.fdefault) + +@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, \ + ['intel_graphics'], 'direct') +def schedule_depthwise_conv2d_nchw_intel_graphics(cfg, outs): + """Schedule for depthwise_conv2d nchw forward. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of depthwise_conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for depthwise_conv2d nchw. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'depthwise_conv2d_nchw': + pad_data = op.input_tensors[0] + kernel = op.input_tensors[1] + conv = op.output(0) + + ##### space definition begin ##### + n, f, y, x = s[conv].op.axis + cfg.define_split("tile_f", f, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_knob("auto_unroll_max_step", [0, 256, 1500]) + + target = tvm.target.current_target() + if target.target_name in ['nvptx', 'rocm']: + cfg.define_knob("unroll_explicit", [1]) + else: + cfg.define_knob("unroll_explicit", [0, 1]) + + # fallback support + if cfg.is_fallback: + ref_log = autotvm.tophub.load_reference_log( + target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') + cfg.fallback_with_reference_log(ref_log) + cfg['unroll_explicit'].val = 0 + ##### space definition end ##### + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + s[kernel].compute_inline() + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, 'local') + else: + output = s.outputs[0].output(0) + s[conv].set_scope('local') + OL = conv + + # create cache stage + AA = s.cache_read(pad_data, 'shared', [OL]) + WW = s.cache_read(kernel, 'shared', [OL]) + AL = s.cache_read(AA, 'local', [OL]) + WL = s.cache_read(WW, 'local', [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + kernel_scope, n = s[output].split(n, nparts=1) + bf = s[output].fuse(n, bf) + s[output].bind(bf, tvm.thread_axis("blockIdx.z")) + s[output].bind(by, tvm.thread_axis("blockIdx.y")) + s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + s[output].bind(vf, tvm.thread_axis("vthread")) + s[output].bind(vy, tvm.thread_axis("vthread")) + s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(tf, tvm.thread_axis("threadIdx.z")) + s[output].bind(ty, tvm.thread_axis("threadIdx.y")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + s[OL].compute_at(s[output], tx) + + # cooperative fetching + s[AA].compute_at(s[output], bx) + s[WW].compute_at(s[output], bx) + s[AL].compute_at(s[output], tx) + s[WL].compute_at(s[output], tx) + + for load in [AA, WW]: + fused = s[load].fuse(*list(s[load].op.axis)) + fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) + fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) + fused, tz = s[load].split(fused, cfg["tile_f"].size[2]) + s[load].bind(tz, tvm.thread_axis("threadIdx.z")) + s[load].bind(ty, tvm.thread_axis("threadIdx.y")) + s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + + s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + traverse_inline(s, outs[0].op, _callback) + return s + +@generic.schedule_depthwise_conv2d_nhwc.register(["intel_graphics"]) +def schedule_depthwise_conv2d_nhwc(outs): + """Schedule for depthwise_conv2d nhwc forward. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of depthwise_conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for depthwise_conv2d nhwc. 
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _schedule(temp, Filter, DepthwiseConv2d):
+        s[temp].compute_inline()
+        FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
+        if DepthwiseConv2d.op in s.outputs:
+            Output = DepthwiseConv2d
+            CL = s.cache_write(DepthwiseConv2d, "local")
+        else:
+            Output = outs[0].op.output(0)
+            s[DepthwiseConv2d].set_scope("local")
+
+        block_x = tvm.thread_axis("blockIdx.x")
+        thread_x = tvm.thread_axis("threadIdx.x")
+
+        b, h, w, c = s[Output].op.axis
+
+        # num_thread here could be 728, which is larger than cuda.max_num_threads
+        num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value
+        target = tvm.target.current_target()
+        if target and (target.target_name not in ["cuda", "nvptx"]):
+            num_thread = target.max_num_threads
+        xoc, xic = s[Output].split(c, factor=num_thread)
+        s[Output].reorder(xoc, b, h, w, xic)
+        xo, yo, _, _ = s[Output].tile(h, w, x_factor=2, y_factor=2)
+        fused = s[Output].fuse(yo, xo)
+        fused = s[Output].fuse(fused, b)
+        fused = s[Output].fuse(fused, xoc)
+
+        s[Output].bind(fused, block_x)
+        s[Output].bind(xic, thread_x)
+
+        if DepthwiseConv2d.op in s.outputs:
+            s[CL].compute_at(s[Output], xic)
+        else:
+            s[DepthwiseConv2d].compute_at(s[Output], xic)
+
+        _, _, ci, fi = s[FS].op.axis
+        s[FS].compute_at(s[Output], fused)
+        fused = s[FS].fuse(fi, ci)
+        s[FS].bind(fused, thread_x)
+
+    scheduled_ops = []
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
+                    traverse(tensor.op)
+        # schedule depthwise_conv2d
+        if OP.tag == 'depthwise_conv2d_nhwc':
+            PaddedInput = OP.input_tensors[0]
+            Filter = OP.input_tensors[1]
+            if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
+                s[Filter].compute_inline()
+            DepthwiseConv2d = OP.output(0)
+            _schedule(PaddedInput, Filter, DepthwiseConv2d)
+
+        scheduled_ops.append(OP)
+
+    traverse(outs[0].op)
+    return s
+
+
+def schedule_depthwise_conv2d_backward_input_nhwc(outs):
+    """Schedule for depthwise_conv2d nhwc backward wrt input.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of depthwise_conv2d
+        backward wrt input in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for depthwise_conv2d backward
+        wrt input with layout nhwc.
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(Padded_out_grad, In_grad): + s[Padded_out_grad].compute_inline() + + block_x = tvm.thread_axis("blockIdx.x") + thread_x = tvm.thread_axis("threadIdx.x") + _, h, w, c = In_grad.op.axis + + fused_hwc = s[In_grad].fuse(h, w, c) + xoc, xic = s[In_grad].split(fused_hwc, factor=128) + + s[In_grad].bind(xoc, block_x) + s[In_grad].bind(xic, thread_x) + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if OP.tag == 'depthwise_conv2d_backward_input_nhwc': + Padded_out_grad = OP.input_tensors[0] + Dilated_out_grad = Padded_out_grad.op.input_tensors[0] + s[Dilated_out_grad].compute_inline() + In_grad = OP.output(0) + _schedule(Padded_out_grad, In_grad) + else: + raise ValueError("Depthwise conv backward wrt input for non-NHWC is not supported.") + + traverse(outs[0].op) + return s + +def schedule_depthwise_conv2d_backward_weight_nhwc(outs): + """Schedule for depthwise_conv2d nhwc backward wrt weight. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of depthwise_conv2d + backward wrt weight in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for depthwise_conv2d backward + wrt weight with layout nhwc. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(Weight_grad): + block_x = tvm.thread_axis("blockIdx.x") + thread_y = tvm.thread_axis("threadIdx.y") + thread_x = tvm.thread_axis("threadIdx.x") + + db, dh, dw = Weight_grad.op.reduce_axis + + fused_dbdhdw = s[Weight_grad].fuse(db, dh, dw) + _, ki = s[Weight_grad].split(fused_dbdhdw, factor=8) + BF = s.rfactor(Weight_grad, ki) + + fused_fwcm = s[Weight_grad].fuse(*s[Weight_grad].op.axis) + + xo, xi = s[Weight_grad].split(fused_fwcm, factor=32) + + s[Weight_grad].bind(xi, thread_x) + s[Weight_grad].bind(xo, block_x) + + s[Weight_grad].bind(s[Weight_grad].op.reduce_axis[0], thread_y) + s[BF].compute_at(s[Weight_grad], s[Weight_grad].op.reduce_axis[0]) + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if OP.tag == 'depthwise_conv2d_backward_weight_nhwc': + Padded_in = OP.input_tensors[1] + s[Padded_in].compute_inline() + Weight_grad = OP.output(0) + _schedule(Weight_grad) + else: + raise ValueError("Depthwise conv backward wrt weight for non-NHWC is not supported.") + + traverse(outs[0].op) + return s + +@depthwise_conv2d_infer_layout.register("intel_graphics") +def _depthwise_conv2d_infer_layout(workload, _): + """Infer input/output shapes and layouts from a workload and cfg. 
+
+    Parameters
+    ----------
+    workload : tuple
+        depthwise conv2d workload
+
+    cfg : tuple
+        tvm.autotvm config
+
+    Returns
+    -------
+    Output : [tuple of tuple and str, tuple of tuple and str]
+        Input shapes and layouts, and output shapes and layouts
+    """
+    _, data, kernel, strides, padding, _, _ = workload
+    batch_size, in_channel, in_height, in_width = data[:-1]
+    filter_channel, channel_multiplier, k_height, k_width = kernel[:-1]
+    out_channel = filter_channel * channel_multiplier
+    out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1
+    out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1
+    in_shape = (batch_size, in_channel, in_height, in_width)
+    out_shape = (batch_size, out_channel, out_height, out_width)
+    in_layout = out_layout = "NCHW"
+    return ((in_shape, in_layout),), ((out_shape, out_layout),)
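
A note for reviewers: the sketch below shows one way to exercise the new template end to end, extracting conv2d tasks for the intel_graphics target and tuning them with AutoTVM. It is illustrative only; the ResNet-18 workload, the trial count, and the log file name are assumptions made for the example and are not part of this patch. The calls are the Relay/AutoTVM APIs current at the time of this change.

    import tvm
    from tvm import relay, autotvm
    from tvm.relay import testing

    # Any NCHW conv2d workload exercises the templates registered in this patch.
    mod, params = testing.resnet.get_workload(num_layers=18, batch_size=1)
    target = tvm.target.create('opencl -device=intel_graphics')

    # Extract tuning tasks; conv2d ops dispatch to the intel_graphics template.
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, timeout=4))

    for task in tasks:
        # Each task's config space comes from the knobs this patch defines
        # (tile_ic / tile_oc / block_oh / block_ow for the NCHWc template).
        tuner = autotvm.tuner.XGBTuner(task)
        tuner.tune(n_trial=20,  # illustrative; real tuning needs far more trials
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.log_to_file('intel_graphics.log')])

    # Compile using the best configurations found during tuning.
    with autotvm.apply_history_best('intel_graphics.log'):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(mod, target=target, params=params)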