From 3a765c792a94b01e26a01b6902d9dfc51a59e123 Mon Sep 17 00:00:00 2001 From: Wang Yao Date: Tue, 31 Mar 2020 14:36:36 -0700 Subject: [PATCH 1/5] Fix x86 conv2d and depthwise conv2d auto tuning --- topi/python/topi/x86/conv2d.py | 14 ++++++++++- topi/python/topi/x86/conv2d_avx_1x1.py | 9 ++----- topi/python/topi/x86/conv2d_avx_common.py | 9 ++----- topi/python/topi/x86/depthwise_conv2d.py | 29 +++++++++++++---------- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 81d848a4762f..d875f8d6bd4b 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -185,7 +185,19 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo # Pack data if raw 4-D data is provided. # This can only happen when autotuning. if len(data.shape) == 4: - data, kernel = _pack_data(cfg, data, kernel) + if autotvm.GLOBAL_SCOPE.in_tuning: + # Directly use modified data layout placeholder. + dshape = (n, in_channel // cfg["tile_ic"].size[-1], + ih, iw, cfg["tile_ic"].size[-1]) + data = tvm.te.placeholder(dshape, data.dtype, name="data") + kshape = (num_filter // cfg["tile_oc"].size[-1], + in_channel // cfg["tile_ic"].size[-1], + kernel_height, kernel_width, + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1]) + kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel") + else: + data, kernel = _pack_data(cfg, data, kernel) return nn.conv2d_NCHWc(data, kernel, diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 432f8b287513..3700261bb61f 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -69,17 +69,12 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + s[data_vec].vectorize(ic_block) parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) s[data_vec].parallel(parallel_axis) data_vec = data_vec.op.input_tensors[0] - if autotvm.GLOBAL_SCOPE.in_tuning: - # only in autotuning, input data of conv2d_NCHWc will be 4-D. - # skip this part during tuning to make records accurate. - # this part will be folded during Relay fold_constant pass. - s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") - s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ + if isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. # this should only be used by x86 conv2d_nchw, which is for diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index ebed14cb924a..0c38f3f79377 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -91,17 +91,12 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + s[data_vec].vectorize(ic_block) parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) s[data_vec].parallel(parallel_axis) data_vec = data_vec.op.input_tensors[0] - if autotvm.GLOBAL_SCOPE.in_tuning: - # only in autotuning, input data of conv2d_NCHWc will be 4-D. - # skip this part during tuning to make records accurate. - # this part will be folded during Relay fold_constant pass. - s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") - s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ + if isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. # this should only be used by x86 conv2d_nchw, which is for diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 5b43cededcc8..c95e47ee9066 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -43,7 +43,6 @@ def _fallback_schedule(cfg, wkl): HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 1 @@ -146,10 +145,21 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, # Pack data if raw 4-D data is provided. # This can only happen when autotuning. if len(data.shape) == 4: - data, kernel = _pack_data(cfg, data, kernel) - _, _, _, _, in_channel_block = get_const_tuple(data.shape) - out_channel_chunk, _, _, _, _, out_channel_block \ - = get_const_tuple(kernel.shape) + if autotvm.GLOBAL_SCOPE.in_tuning: + # Directly use modified data layout placeholder. + in_channel_block = cfg["tile_ic"].size[-1] + in_channel_chunk = in_channel // in_channel_block + out_channel_block = cfg["tile_oc"].size[-1] + out_channel_chunk = out_channel // out_channel_block + dshape = (batch, in_channel_chunk,in_height, in_width, in_channel_block) + data = tvm.te.placeholder(dshape, data.dtype, name="data") + kshape = (out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block) + kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel") + else: + data, kernel = _pack_data(cfg, data, kernel) + _, _, _, _, in_channel_block = get_const_tuple(data.shape) + out_channel_chunk, _, _, _, _, out_channel_block \ + = get_const_tuple(kernel.shape) # padding stage DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0) @@ -203,16 +213,9 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + s[data_vec].vectorize(ic_block) parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) s[data_vec].parallel(parallel_axis) - data_vec = data_vec.op.input_tensors[0] - - if autotvm.GLOBAL_SCOPE.in_tuning: - # only in autotuning, input data of conv2d_NCHWc will be 4-D. - # skip this part during tuning to make recrods accurate. - # this part will be folded during Relay fold_constant pass. - s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") - s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") C, O = conv_out, output CC = s.cache_write(C, 'global') From d9841515a6209d71a0a11cc6342d629ba3c21996 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 1 Apr 2020 05:26:32 +0000 Subject: [PATCH 2/5] Fix depthwise conv2d infer layout --- topi/python/topi/x86/depthwise_conv2d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index c95e47ee9066..35bb5ca8ba69 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -261,12 +261,12 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out @depthwise_conv2d_infer_layout.register("cpu") def _depthwise_conv2d_infer_layout(workload, cfg): - _, data, kernel, strides, padding, dilation, dtype = workload + _, data, kernel, strides, padding, dilation, _, _, dtype = workload batch_size, in_channel, in_height, in_width = data[1] filter_channel, channel_multiplier, k_height, k_width = kernel[1] out_channel = filter_channel * channel_multiplier - out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 - out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 + out_height = (in_height + padding[0] + padding[2] - k_height) // strides[0] + 1 + out_width = (in_width + padding[1] + padding[3] - k_width) // strides[1] + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) in_layout = "NCHW%dc" % tile_ic From 13cdf610e817633fd0c6383c41d915d16e9ce7f5 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 1 Apr 2020 17:41:11 +0000 Subject: [PATCH 3/5] Use random data instead of empty data for autotvm --- python/tvm/autotvm/measure/measure_methods.py | 5 +++-- topi/python/topi/x86/conv2d_avx_1x1.py | 1 - topi/python/topi/x86/conv2d_avx_common.py | 1 - topi/python/topi/x86/depthwise_conv2d.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 698ddbc68dd7..0cb436ea681e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -479,9 +479,10 @@ def run_through_rpc(measure_input, build_result, if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: - # create empty arrays on the remote device and copy them once. + # create random arrays on the remote device and copy them once. # This can avoid some memory issues that make the measurement results unreliable. - args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] + args = [nd.array(np.random.uniform(0.0, 255.0, size=x[0]).astype(dtype=x[1]), ctx=ctx) + for x in build_result.arg_info] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 3700261bb61f..978c4b9adb30 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -19,7 +19,6 @@ from __future__ import absolute_import as _abs import tvm from tvm import te -from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 0c38f3f79377..a88d168194fc 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name """Conv2D schedule on for Intel CPU""" import tvm -from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..generic import conv2d as conv2d_generic diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 35bb5ca8ba69..366b47fc150c 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -151,7 +151,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, in_channel_chunk = in_channel // in_channel_block out_channel_block = cfg["tile_oc"].size[-1] out_channel_chunk = out_channel // out_channel_block - dshape = (batch, in_channel_chunk,in_height, in_width, in_channel_block) + dshape = (batch, in_channel_chunk, in_height, in_width, in_channel_block) data = tvm.te.placeholder(dshape, data.dtype, name="data") kshape = (out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block) kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel") From 42dc0c98e5cbe32a908d138524694b9ac925ea35 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 1 Apr 2020 17:44:51 +0000 Subject: [PATCH 4/5] Fix pylint --- python/tvm/autotvm/measure/measure_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 0cb436ea681e..e1fec122ea94 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -481,7 +481,7 @@ def run_through_rpc(measure_input, build_result, else: # create random arrays on the remote device and copy them once. # This can avoid some memory issues that make the measurement results unreliable. - args = [nd.array(np.random.uniform(0.0, 255.0, size=x[0]).astype(dtype=x[1]), ctx=ctx) + args = [nd.array(np.random.uniform(0.0, 255.0, size=x[0]).astype(dtype=x[1]), ctx=ctx) for x in build_result.arg_info] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() From a0e73c9c3a573a119cf05b550d7795f6105fbb4f Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 2 Apr 2020 17:19:20 +0000 Subject: [PATCH 5/5] Keep empty array for now for autotvm --- python/tvm/autotvm/measure/measure_methods.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index e1fec122ea94..698ddbc68dd7 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -479,10 +479,9 @@ def run_through_rpc(measure_input, build_result, if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: - # create random arrays on the remote device and copy them once. + # create empty arrays on the remote device and copy them once. # This can avoid some memory issues that make the measurement results unreliable. - args = [nd.array(np.random.uniform(0.0, 255.0, size=x[0]).astype(dtype=x[1]), ctx=ctx) - for x in build_result.arg_info] + args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync()