diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index a803bb6bfdcd8..83e7e23503a7f 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -54,10 +54,15 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": if kernel_layout == "OIHW": + # Commenting the spatial pack as x86 NCHWc schedules perform better. + # strategy.add_implementation( + # wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + # wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + # name="conv2d_nchw_spatial_pack.arm_cpu") strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), - name="conv2d_nchw_spatial_pack.arm_cpu") + wrap_compute_conv2d(topi.x86.conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), + name="conv2d_nchw.x86") # check if winograd algorithm is applicable _, _, kh, kw = get_const_tuple(kernel.shape) pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) @@ -100,11 +105,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout) - if kernel_layout == "OIHW": - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw), - name="depthwise_conv2d_nchw.arm_cpu") + + # Commenting the spatial pack as x86 NCHWc schedules perform better. + # if kernel_layout == "OIHW": + # strategy.add_implementation( + # wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), + # wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw), + # name="depthwise_conv2d_nchw.arm_cpu") + # TODO: # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2) # Let us comment it out but not remove. @@ -115,6 +123,13 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): # wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), # name="depthwise_conv2d_nchw_spatial_pack.arm_cpu", # plevel=15) + + channel_multiplier = get_const_tuple(inputs[1].shape)[1] + if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.x86") elif layout == "NHWC": assert kernel_layout == "HWOI" logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") @@ -138,6 +153,29 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): format(layout)) return strategy +@conv2d_NCHWc_strategy.register("arm_cpu") +def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d_NCHWc x86 strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + logger.warning("Trying x86 Conv NCHWc.") + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.x86") + return strategy + +@depthwise_conv2d_NCHWc_strategy.register("cpu") +def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target): + """depthwise_conv2d x86 strategy""" + strategy = _op.OpStrategy() + logger.warning("Trying x86 DWC NCHWc.") + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc), + name="depthwise_conv2d_NCHWc.x86") + return strategy + def wrap_compute_conv2d_winograd_nnpack(topi_compute): """wrap topi compute for conv2d_winograd NNPack""" def _compute_conv2d_nnpack(attrs, inputs, out_type): diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 553239b6c4268..934739f01797b 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -59,6 +59,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype + data_tensor, kernel_tensor = tinfos + data_dtype = data_tensor.dtype + kernel_dtype = kernel_tensor.dtype + idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu": @@ -169,4 +173,104 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.conv2d(*inputs, **new_attrs) + if topi_tmpl == "conv2d_NCHWc.x86": + # we only convert conv2d_NCHW to conv2d_NCHWc for x86 + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], + new_attrs["out_layout"], out_dtype], topi_tmpl) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) + + if topi_tmpl == "conv2d_NCHWc_int8.x86": + # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) + + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + n_elems = 4 + + # convert kernel data layout from 4D to 7D + data_expr, kernel_expr = inputs + kernel_IHWO = relay.transpose(kernel_expr, axes=(1, 2, 3, 0)) + kernel_IHWOo = relay.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn)) + kernel_OHWoI = relay.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0)) + kernel_OHWoIi = relay.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn)) + kernel_OHWoIie = relay.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn//n_elems, n_elems)) + kernel_OIHWioe = relay.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6)) + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config. + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel // oc_bn, + in_channel // ic_bn, + kh, + kw, + ic_bn // n_elems, + oc_bn, + n_elems), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs) + + if topi_tmpl == "depthwise_conv2d_NCHWc.x86": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, True, data_layout) + + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + assert channel_multiplier == 1 + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config. + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) + return None diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index d875f8d6bd4b6..b4521357b4f03 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -169,7 +169,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo cfg.define_split("tile_ic", in_channel, num_outputs=2) cfg.define_split("tile_oc", num_filter, num_outputs=2) - cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64, policy="verbose") if is_kernel_1x1: cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) else: