From 3e1187d1c092201352acc097a20347f2a38054d0 Mon Sep 17 00:00:00 2001 From: Anastasia Stulova <38433336+AnastasiaStulova@users.noreply.github.com> Date: Thu, 26 Aug 2021 10:03:06 +0100 Subject: [PATCH] [Relay][TOPI] Support of depthwise conv2d NHWC for Mali/Bifrost. (#8584) * [Relay][TOPI] Support of depthwise conv2d NHWC for Mali/Bifrost. Added initial tunable autotvm templates for depthwise conv2d with NHWC layout for Mali and Bifrost. * [Relay][TOPI] Misc fixes for depthwise conv2d Mali/Bifrost. - Fix assert for Bifrost. - Set reasonable default axis splits to avoid using tophub for NHWC. - Fixed typo: arm cpu -> Mali. * [Relay][TOPI] Fixed formatting in depthwise conv2d Mali/Bifrost. --- python/tvm/relay/op/strategy/bifrost.py | 8 + python/tvm/relay/op/strategy/mali.py | 17 +- python/tvm/topi/mali/depthwise_conv2d.py | 200 ++++++++++++------ .../topi/python/test_topi_depthwise_conv2d.py | 2 + 4 files changed, 156 insertions(+), 71 deletions(-) diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py index 8008391fe86c..ec3edab2c8b1 100644 --- a/python/tvm/relay/op/strategy/bifrost.py +++ b/python/tvm/relay/op/strategy/bifrost.py @@ -83,6 +83,14 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.bifrost", ) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + # For now just reuse general Mali strategy. 
+ strategy.add_implementation( + wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.bifrost", + ) else: raise RuntimeError( "Unsupported depthwise_conv2d layout {} for Mali(Bifrost)".format(layout) diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index d38fe0d82758..e5f4b4e58562 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -120,14 +120,17 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWOI" if not is_auto_scheduler_enabled(): - raise RuntimeError( - "depthwise_conv2d NHWC layout is not enabled for mali without auto_scheduler." + strategy.add_implementation( + wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.mali", + ) + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + naive_schedule, + name="depthwise_conv2d_nhwc.mali", ) - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - naive_schedule, - name="depthwise_conv2d_nhwc.mali", - ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) else: # group_conv2d diff --git a/python/tvm/topi/mali/depthwise_conv2d.py b/python/tvm/topi/mali/depthwise_conv2d.py index b292f694b995..98109ab4535f 100644 --- a/python/tvm/topi/mali/depthwise_conv2d.py +++ b/python/tvm/topi/mali/depthwise_conv2d.py @@ -30,7 +30,7 @@ def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dty return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) -# register customized schedule for arm cpu. +# register customized schedule for Mali. 
@autotvm.register_topi_schedule("depthwise_conv2d_nchw.mali") def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule depthwise conv2d @@ -51,86 +51,158 @@ def schedule_depthwise_conv2d_nchw(cfg, outs): outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) - def _schedule(pad_data, kernel, conv): - """schedule depthwise_conv2d""" - max_unroll = 16 - vec_size = [1, 2, 4, 8, 16] + def _callback(op): + """traverse to find op to schedule""" + # schedule depthwise_conv2d + if op.tag == "depthwise_conv2d_nchw": + pad_data = op.input_tensors[0] + kernel = op.input_tensors[1] + conv = op.output(0) + _schedule(cfg, s, pad_data, kernel, conv, "NCHW") - ##### space definition begin ##### - n, c, y, x = s[conv].op.axis - bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3) - by, ty, yi = cfg.define_split("tile_y", y, num_outputs=3) - bx, tx, xi = cfg.define_split("tile_x", x, num_outputs=3) - cfg.define_annotate("ann_spatial", [ci, yi, xi], policy="try_unroll_vec") + traverse_inline(s, outs[0].op, _callback) + return s - # fallback support - if cfg.is_fallback: - ref_log = autotvm.tophub.load_reference_log( - "mali", "rk3399", "depthwise_conv2d_nchw.mali" - ) - cfg.fallback_with_reference_log(ref_log) - ###### space definition end ###### - # schedule padding - n, c, y, x = s[pad_data].op.axis - tile_and_bind3d(s, pad_data, c, y, x, cfg["tile_c"].size[1], 1, 1) +# register original implementation of depthwise_conv2d_nhwc since we don't need to change this part +@autotvm.register_topi_compute("depthwise_conv2d_nhwc.mali") +def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - # schedule dilation - if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - # schedule conv - if conv.op not in s.outputs: - s[conv].set_scope("local") - OL = conv - output = 
 s.outputs[0].output(0) - else: - OL = s.cache_write(conv, "local") - output = conv - - n, c, y, x = s[output].op.axis - bc, tc, ci = cfg["tile_c"].apply(s, output, c) - by, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, tx, xi = cfg["tile_x"].apply(s, output, x) - - bc = s[output].fuse(n, bc) - s[output].bind(bc, te.thread_axis("blockIdx.z")) - s[output].bind(tc, te.thread_axis("threadIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - - di, dj = s[OL].op.reduce_axis - s[OL].unroll(di) - s[OL].unroll(dj) - - s[OL].compute_at(s[output], tx) - n, ci, yi, xi = s[OL].op.axis - - cfg["ann_spatial"].apply( - s, - OL, - [ci, yi, xi], - axis_lens=[cfg["tile_c"].size[2], cfg["tile_y"].size[2], cfg["tile_x"].size[2]], - max_unroll=max_unroll, - vec_size=vec_size, - cfg=cfg, - ) +# register customized schedule for Mali. +@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.mali") +def schedule_depthwise_conv2d_nhwc(cfg, outs): + """Schedule depthwise conv2d + + Parameters + ---------- + cfg: ConfigEntity + The configuration of this template + outs: Array of Tensor + The computation graph description of depthwise convolution2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for depthwise_conv2d nhwc. 
+ """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): """traverse to find op to schedule""" # schedule depthwise_conv2d - if op.tag == "depthwise_conv2d_nchw": + if op.tag == "depthwise_conv2d_nhwc": pad_data = op.input_tensors[0] kernel = op.input_tensors[1] conv = op.output(0) - _schedule(pad_data, kernel, conv) + _schedule(cfg, s, pad_data, kernel, conv, "NHWC") traverse_inline(s, outs[0].op, _callback) return s +def _schedule(cfg, s, pad_data, kernel, conv, layout): + """schedule depthwise_conv2d""" + assert layout in ("NCHW", "NHWC") + + max_unroll = 16 + vec_size = [1, 2, 4, 8, 16] + + ##### space definition begin ##### + if layout == "NCHW": + n, c, h, w = s[conv].op.axis + else: + n, h, w, c = s[conv].op.axis + + bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3) + bh, th, hi = cfg.define_split("tile_y", h, num_outputs=3) + bw, tw, wi = cfg.define_split("tile_x", w, num_outputs=3) + cfg.define_annotate("ann_spatial", [ci, hi, wi], policy="try_unroll_vec") + + # fallback support + if cfg.is_fallback: + if layout == "NCHW": + ref_log = autotvm.tophub.load_reference_log( + "mali", "rk3399", "depthwise_conv2d_nchw.mali" + ) + cfg.fallback_with_reference_log(ref_log) + else: + cfg.fallback_split("tile_c", [-1, 4, 2]) + cfg.fallback_split("tile_y", [-1, 4, 2]) + cfg.fallback_split("tile_x", [-1, 4, 2]) + ###### space definition end ###### + + # schedule padding + if layout == "NCHW": + n, c, h, w = s[pad_data].op.axis + z, y, x = c, h, w + z_factor, y_factor, x_factor = cfg["tile_c"].size[1], 1, 1 + else: + n, h, w, c = s[pad_data].op.axis + z, y, x = h, w, c + z_factor, y_factor, x_factor = 1, 1, cfg["tile_c"].size[1] + tile_and_bind3d(s, pad_data, z, y, x, z_factor, y_factor, x_factor) + + # schedule dilation + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + # schedule conv + if conv.op not in s.outputs: + 
s[conv].set_scope("local") + OL = conv + output = s.outputs[0].output(0) + else: + OL = s.cache_write(conv, "local") + output = conv + + if layout == "NCHW": + n, c, h, w = s[output].op.axis + else: + n, h, w, c = s[output].op.axis + + bc, tc, ci = cfg["tile_c"].apply(s, output, c) + bh, th, hi = cfg["tile_y"].apply(s, output, h) + bw, tw, wi = cfg["tile_x"].apply(s, output, w) + + if layout == "NCHW": + bz, tz, by, ty, bx, tx = bc, tc, bh, th, bw, tw + else: + bz, tz, by, ty, bx, tx = bh, th, bw, tw, bc, tc + + bz = s[output].fuse(n, bz) + s[output].bind(bz, te.thread_axis("blockIdx.z")) + s[output].bind(tz, te.thread_axis("threadIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + + di, dj = s[OL].op.reduce_axis + s[OL].unroll(di) + s[OL].unroll(dj) + + s[OL].compute_at(s[output], tx) + + if layout == "NCHW": + n, ci, hi, wi = s[OL].op.axis + else: + n, hi, wi, ci = s[OL].op.axis + + cfg["ann_spatial"].apply( + s, + OL, + [ci, hi, wi], + axis_lens=[cfg["tile_c"].size[2], cfg["tile_y"].size[2], cfg["tile_x"].size[2]], + max_unroll=max_unroll, + vec_size=vec_size, + cfg=cfg, + ) + + def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): """tile and bind 3d""" y_factor = y_factor or z_factor diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 5952e624b708..27601cd32b89 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -61,6 +61,8 @@ ) ], "gpu": [(topi.nn.depthwise_conv2d_nhwc, topi.cuda.schedule_depthwise_conv2d_nhwc)], + "mali": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)], + "bifrost": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)], }, "NCHWc": { 
"generic": [(topi.x86.depthwise_conv2d_NCHWc, topi.x86.schedule_depthwise_conv2d_NCHWc)],