From 73986a603fd3996013d9b026de1cd8e662c869d4 Mon Sep 17 00:00:00 2001
From: Thomas Viehmann
Date: Thu, 11 Jun 2020 18:38:46 +0200
Subject: [PATCH] Make batch matrix multiplication on GPU tunable (#5752)

This is primarily aimed at the AMD GPU backend and done as part of a
project for AMD, but should work for all users of the GPU schedule.
---
 python/tvm/relay/op/strategy/cuda.py        |  2 +-
 topi/python/topi/cuda/batch_matmul.py       | 80 ++++++++++++++-------
 topi/tests/python/test_topi_batch_matmul.py |  2 +-
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 5ffb7f1dea67..4b019cfcbccc 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -481,7 +481,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
     """batch_matmul cuda strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_batch_matmul(topi.nn.batch_matmul),
+        wrap_compute_batch_matmul(topi.cuda.batch_matmul),
         wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
         name="batch_matmul.cuda",
         plevel=10)
diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py
index bf801820d25a..7d92edfb97b7 100644
--- a/topi/python/topi/cuda/batch_matmul.py
+++ b/topi/python/topi/cuda/batch_matmul.py
@@ -14,13 +14,24 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name,too-many-locals,unused-variable
+# pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument
 """cuda batch_matmul operators"""
+import tvm
+from tvm import autotvm
 from tvm import te
 from tvm.contrib import cublas
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 from .. import nn
 from ..util import traverse_inline, get_const_tuple, get_max_power2_factor
 
 
-def schedule_batch_matmul(outs):
+@autotvm.register_topi_compute("batch_matmul.cuda")
+def batch_matmul(cfg, x, y):
+    """Compute batch matrix multiplication of `x` and `y`"""
+    return nn.batch_matmul(x, y)
+
+
+@autotvm.register_topi_schedule("batch_matmul.cuda")
+def schedule_batch_matmul(cfg, outs):
     """Schedule for batch_matmul
 
     Parameters
@@ -37,7 +48,7 @@ def schedule_batch_matmul(outs):
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
-    def _schedule(op):
+    def _schedule(cfg, op):
         C = op.output(0)
         A, B = s[C].op.input_tensors
         _, M, N = get_const_tuple(C.shape)
@@ -51,16 +62,34 @@ def _schedule(op):
             C = s.outputs[0].output(0)
 
         b, y, x = s[C].op.axis
-        y_bn = get_max_power2_factor(M, 64)
-        x_bn = get_max_power2_factor(N, 64)
-        by, y = s[C].split(y, y_bn)
-        bx, x = s[C].split(x, x_bn)
-        y_nthreads = min(y_bn, 8)
-        x_nthreads = min(x_bn, 8)
-        ty, yi = s[C].split(y, nparts=y_nthreads)
-        tx, xi = s[C].split(x, nparts=x_nthreads)
-        thread_x = te.thread_axis((0, x_nthreads), "threadIdx.x")
-        thread_y = te.thread_axis((0, y_nthreads), "threadIdx.y")
+        k, = s[CC].op.reduce_axis
+
+        cfg.define_split("tile_y", y, num_outputs=3)
+        cfg.define_split("tile_x", x, num_outputs=3)
+        cfg.define_split("tile_k", k, num_outputs=2)
+        cfg.define_knob("auto_unroll_max_step", [8, 16, 32, 64])
+        target = tvm.target.Target.current()
+        if target.target_name in ['nvptx', 'rocm']:
+            # llvm-based backends cannot do non-explicit unrolling
+            cfg.define_knob("unroll_explicit", [1])
+        else:
+            cfg.define_knob("unroll_explicit", [0, 1])
+
+        if cfg.is_fallback:
+            y_bn = get_max_power2_factor(M, 64)
+            x_bn = get_max_power2_factor(N, 64)
+            y_nthreads = min(y_bn, 8)
+            x_nthreads = min(x_bn, 8)
+            cfg['tile_x'] = SplitEntity([-1, x_nthreads, x_bn // x_nthreads])
+            cfg['tile_y'] = SplitEntity([-1, y_nthreads, y_bn // y_nthreads])
+            cfg['tile_k'] = SplitEntity([-1, 8])
+            cfg['auto_unroll_max_step'] = OtherOptionEntity(16)
+
+        by, ty, yi = cfg["tile_y"].apply(s, C, y)
+        bx, tx, xi = cfg["tile_x"].apply(s, C, x)
+
+        thread_x = te.thread_axis("threadIdx.x")
+        thread_y = te.thread_axis("threadIdx.y")
 
         s[C].reorder(b, by, bx, ty, tx, yi, xi)
         s[C].bind(b, te.thread_axis("blockIdx.z"))
@@ -68,38 +97,41 @@ def _schedule(op):
         s[C].bind(bx, te.thread_axis("blockIdx.x"))
         s[C].bind(ty, thread_y)
         s[C].bind(tx, thread_x)
-        s[C].pragma(yi, "auto_unroll_max_step", 16)
+        s[C].pragma(yi, "auto_unroll_max_step", cfg['auto_unroll_max_step'].val)
+        s[C].pragma(yi, 'unroll_explicit', cfg['unroll_explicit'].val)
 
         s[CC].compute_at(s[C], tx)
         _, yi, xi = s[CC].op.axis
-        k, = s[CC].op.reduce_axis
-        ko, ki = s[CC].split(k, 8)
+        ko, ki = cfg["tile_k"].apply(s, CC, k)
         s[CC].reorder(ko, ki, yi, xi)
-        s[CC].pragma(ki, "auto_unroll_max_step", 16)
+        s[CC].pragma(ki, "auto_unroll_max_step", cfg['auto_unroll_max_step'].val)
+        s[CC].pragma(ki, 'unroll_explicit', cfg['unroll_explicit'].val)
 
         s[AA].compute_at(s[CC], ko)
         s[AL].compute_at(s[CC], ki)
         s[BB].compute_at(s[CC], ko)
         s[BL].compute_at(s[CC], ki)
         _, y, k = s[AA].op.axis
-        ty, yi = s[AA].split(y, nparts=y_nthreads)
-        tx, ki = s[AA].split(k, nparts=x_nthreads)
+        ty, yi = s[AA].split(y, nparts=cfg["tile_y"].size[1])
+        tx, ki = s[AA].split(k, nparts=cfg["tile_x"].size[1])
         s[AA].reorder(ty, tx, yi, ki)
         s[AA].bind(ty, thread_y)
         s[AA].bind(tx, thread_x)
-        s[AA].pragma(yi, "auto_unroll_max_step", 16)
+        s[AA].pragma(yi, "auto_unroll_max_step", cfg['auto_unroll_max_step'].val)
+        s[AA].pragma(yi, 'unroll_explicit', cfg['unroll_explicit'].val)
 
         _, x, k = s[BB].op.axis
-        ty, xi = s[BB].split(x, nparts=y_nthreads)
-        tx, ki = s[BB].split(k, nparts=x_nthreads)
+        ty, xi = s[BB].split(x, nparts=cfg["tile_y"].size[1])
+        tx, ki = s[BB].split(k, nparts=cfg["tile_x"].size[1])
         s[BB].bind(ty, thread_y)
         s[BB].bind(tx, thread_x)
         s[BB].reorder(ty, tx, xi, ki)
-        s[BB].pragma(xi, "auto_unroll_max_step", 16)
+        s[BB].pragma(xi, "auto_unroll_max_step", cfg['auto_unroll_max_step'].val)
+        s[BB].pragma(xi, 'unroll_explicit', cfg['unroll_explicit'].val)
 
     def _callback(op):
         if "batch_matmul" in op.tag:
-            _schedule(op)
+            _schedule(cfg, op)
 
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py
index b8c854746847..716f40700339 100644
--- a/topi/tests/python/test_topi_batch_matmul.py
+++ b/topi/tests/python/test_topi_batch_matmul.py
@@ -28,7 +28,7 @@
 _batch_matmul_implement = {
     "generic": (topi.nn.batch_matmul, topi.generic.schedule_batch_matmul),
     "cpu": (topi.x86.batch_matmul, topi.x86.schedule_batch_matmul),
-    "gpu": (topi.nn.batch_matmul, topi.cuda.schedule_batch_matmul),
+    "gpu": (topi.cuda.batch_matmul, topi.cuda.schedule_batch_matmul),
 }
 
 def verify_batch_matmul(batch, M, N, K):
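
Usage note: with this patch applied, "batch_matmul.cuda" is a registered
autotvm task and can be tuned like any other TOPI template. Below is a
minimal sketch, assuming the autotvm API of TVM around this commit (~0.7);
the shapes, trial count, and log file name are illustrative only, not part
of the patch:

    import tvm
    from tvm import autotvm, te
    import topi  # importing topi registers the "batch_matmul.cuda" task

    # Illustrative shapes: x is (batch, M, K), y is (batch, N, K).
    x = te.placeholder((8, 128, 256), name="x")
    y = te.placeholder((8, 64, 256), name="y")
    task = autotvm.task.create("batch_matmul.cuda", args=(x, y), target="cuda")

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10))
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=100,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file("batch_matmul.log")])
    # Compile under autotvm.apply_history_best("batch_matmul.log") to pick up
    # the best configuration; without a tuning log, the cfg.is_fallback branch
    # in the schedule reproduces the previous hard-coded tiling.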