diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 487cb650cb8d..eee5e4ffa52b 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -18,6 +18,7 @@
 """
 import numpy as np
 import tvm
+from tvm import autotvm
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.testing import ctx_list
@@ -174,6 +175,76 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape,
     run_test_conv2d("float32", "float32", 1, dshape, kshape,
                     padding=(1, 1), channels=10, kernel_size=(3, 3), dilation=(3, 3))
 
+def test_conv2d_winograd():
+    class WinogradFallback(autotvm.FallbackContext):
+        def _query_inside(self, target, workload):
+            key = (target, workload)
+            if key in self.memory:
+                return self.memory[key]
+            cfg = autotvm.task.space.FallbackConfigEntity()
+            cfg.template_key = 'winograd'
+            cfg.is_fallback = False
+            cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1])
+            cfg['tile_rc'] = autotvm.task.space.SplitEntity([-1, 1])
+            cfg['auto_unroll_max_step'] = autotvm.task.space.OtherOptionEntity(1500)
+            cfg['unroll_explicit'] = autotvm.task.space.OtherOptionEntity(1)
+            self.memory[key] = cfg
+            return cfg
+
+    def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape,
+                             padding=(1, 1),
+                             groups=1,
+                             dilation=(1, 1),
+                             **attrs):
+
+        x = relay.var("x", shape=dshape, dtype=dtype)
+        w = relay.var("w", shape=kshape, dtype=dtype)
+        y = relay.nn.conv2d(x, w,
+                            padding=padding,
+                            dilation=dilation,
+                            groups=groups,
+                            **attrs)
+        func = relay.Function([x, w], y)
+        mod = relay.Module()
+        mod['main'] = func
+        mod = relay.transform.InferType()(mod)
+
+        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
+        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
+        ref_res = topi.testing.conv2d_nchw_python(
+            data.astype(out_dtype), kernel.astype(out_dtype), 1, padding,
+            groups=groups)
+
+        with WinogradFallback(), relay.build_config(opt_level=3):
+            for target, ctx in ctx_list():
+                if target != 'cuda':
+                    continue
+                params = {'w': tvm.nd.array(kernel)}
+                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
+                module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
+                module.set_input('x', tvm.nd.array(data))
+                module.set_input(**params)
+                module.run()
+                op_res1 = module.get_output(0)
+                tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-3, atol=1e-3)
+
+    # normal winograd: stride 1, padding 1, kernel 3x3
+    dshape = (1, 80, 73, 73)
+    kshape = (192, 80, 3, 3)
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(1, 1), channels=192, kernel_size=(3, 3))
+    # extended winograd: stride 1, padding N, kernel 3x3
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(0, 0), channels=192, kernel_size=(3, 3))
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(2, 2), channels=192, kernel_size=(3, 3))
+    # extended winograd: stride 1, padding N, kernel NxN
+    kshape = (192, 80, 7, 7)
+    run_test_conv2d_cuda("float32", "float32", 1, dshape, kshape,
+                         padding=(2, 2), channels=192, kernel_size=(7, 7))
+
 
 def test_conv2d_transpose_infer_type():
     # symbolic in batch dimension
@@ -702,6 +773,7 @@ def test_bitpack_infer_type():
     test_conv2d_transpose_infer_type()
     test_conv2d_transpose_run()
     test_conv2d_run()
+    test_conv2d_winograd()
     test_bitserial_conv2d_infer_type()
     test_batch_flatten()
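As a quick cross-check of the shapes the new test exercises (a standalone sketch, not part of the patch; conv_out_size is a hypothetical helper), the expected output sizes follow from the standard stride-1 convolution relation, the same one winograd_cuda uses below:

# Standalone sanity sketch, not part of the patch. `conv_out_size` is a
# hypothetical helper implementing H_out = (H + 2*p - k) // s + 1.
def conv_out_size(h, pad, k, stride=1):
    return (h + 2 * pad - k) // stride + 1

H = 73  # spatial size from dshape = (1, 80, 73, 73)
assert conv_out_size(H, 1, 3) == 73  # normal winograd: pad (1, 1), 3x3 kernel
assert conv_out_size(H, 0, 3) == 71  # extended: pad (0, 0), 3x3 kernel
assert conv_out_size(H, 2, 3) == 75  # extended: pad (2, 2), 3x3 kernel
assert conv_out_size(H, 2, 7) == 71  # extended: pad (2, 2), 7x7 kernel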
     test_upsampling()
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index eb004d79fd9b..f6f8640b495a 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -55,12 +55,13 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty
         if dilation_h != 1 or dilation_w != 1:
             kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
         CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        alpha = KW + tile_size - 1
         assert HSTR == 1 and WSTR == 1 and KH == KW
     else:
         # kernel tensor is pre-transformed. this op is created by alter op layout.
         # dilation is not supported
-        _, _, CI, CO = get_const_tuple(kernel.shape)
-        KH = KW = 3
+        alpha, _, CI, CO = get_const_tuple(kernel.shape)
+        KH = KW = alpha + 1 - tile_size
         assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1
 
     HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
@@ -68,7 +69,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty
 
     r = KW
     m = tile_size
-    alpha = m + r - 1
     A, B, G = winograd_transform_matrices(m, r, out_dtype)
 
     H = (H + 2 * HPAD - KH) // HSTR + 1
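For reference, the tile arithmetic this change relies on (again a standalone sketch, not part of the patch; the helper names are hypothetical): a Winograd F(m x m, r x r) transform works on alpha x alpha tiles with alpha = m + r - 1, where m is the output tile size and r the kernel width. A pre-transformed kernel of shape (alpha, alpha, CI, CO) therefore determines the original kernel size as alpha + 1 - tile_size, which is what lets the hard-coded KH = KW = 3 be dropped:

# Standalone sanity sketch, not part of the patch; helper names are hypothetical.
def alpha_of(tile_size, r):
    # transformed tile size for Winograd F(m x m, r x r), with m = tile_size
    return tile_size + r - 1

def kernel_size_of(alpha, tile_size):
    # inverse relation used by winograd_cuda for pre-transformed kernels
    return alpha + 1 - tile_size

for tile_size in (2, 4):
    for r in (3, 5, 7):
        assert kernel_size_of(alpha_of(tile_size, r), tile_size) == r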