From c2d190577928b0d81747947fdd5a2c2145adae62 Mon Sep 17 00:00:00 2001 From: Andrey Malyshev Date: Fri, 13 May 2022 18:43:06 +0300 Subject: [PATCH] Add Adreno GPU target and topi supporting textures with dynamically allocated textures (#11161) * Add Adreno GPU target and topi supporting textures - There are 5 compute/schedules: conv2d for NCHW/NHWC, depthwise_conv2d for NCHW/NHWC, average pooling - Fix of dynamically allocated textures caching - Add texture-nhwc scope - Fix issue with codegen of vars having non acceptable symbols Co-authored-by: Chris Sullivan Co-authored-by: Egor Churaev * Address comments * Add vectorization into some adreno pool flow Co-authored-by: Li * Fix adreno tests for running on the opencl host platform * remove unnecessary kDriverVersion in DeviceAttrKind * Move utils adreno functinos to separate shared file * fix black hits Co-authored-by: Chris Sullivan Co-authored-by: Egor Churaev Co-authored-by: Li --- python/tvm/_ffi/runtime_ctypes.py | 11 + python/tvm/relay/op/strategy/__init__.py | 1 + python/tvm/relay/op/strategy/adreno.py | 162 +++++ python/tvm/target/target.py | 14 + python/tvm/topi/__init__.py | 1 + python/tvm/topi/adreno/__init__.py | 25 + python/tvm/topi/adreno/conv2d_alter_op.py | 211 +++++++ python/tvm/topi/adreno/conv2d_nchw.py | 344 +++++++++++ python/tvm/topi/adreno/conv2d_nhwc.py | 339 +++++++++++ .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 316 ++++++++++ .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 311 ++++++++++ python/tvm/topi/adreno/pooling.py | 89 +++ python/tvm/topi/adreno/utils.py | 549 +++++++++++++++++ src/runtime/opencl/opencl_common.h | 11 +- src/runtime/opencl/opencl_device_api.cc | 6 + src/runtime/texture.h | 6 + src/runtime/thread_storage_scope.h | 7 + src/target/source/codegen_source_base.cc | 5 + src/target/target_kind.cc | 1 + .../python/relay/test_conv2d_nchw_texture.py | 394 +++++++++++++ .../python/relay/test_conv2d_nhwc_texture.py | 556 ++++++++++++++++++ .../test_depthwise_conv2d_nchw_texture.py | 194 ++++++ .../test_depthwise_conv2d_nhwc_texture.py | 233 ++++++++ tests/python/relay/utils/adreno_utils.py | 118 ++++ 24 files changed, 3903 insertions(+), 1 deletion(-) create mode 100644 python/tvm/relay/op/strategy/adreno.py create mode 100644 python/tvm/topi/adreno/__init__.py create mode 100644 python/tvm/topi/adreno/conv2d_alter_op.py create mode 100644 python/tvm/topi/adreno/conv2d_nchw.py create mode 100644 python/tvm/topi/adreno/conv2d_nhwc.py create mode 100644 python/tvm/topi/adreno/depthwise_conv2d_nchw.py create mode 100644 python/tvm/topi/adreno/depthwise_conv2d_nhwc.py create mode 100644 python/tvm/topi/adreno/pooling.py create mode 100644 python/tvm/topi/adreno/utils.py create mode 100644 tests/python/relay/test_conv2d_nchw_texture.py create mode 100644 tests/python/relay/test_conv2d_nhwc_texture.py create mode 100644 tests/python/relay/test_depthwise_conv2d_nchw_texture.py create mode 100644 tests/python/relay/test_depthwise_conv2d_nhwc_texture.py create mode 100644 tests/python/relay/utils/adreno_utils.py diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 03a68e9f9720..5dc3fe093858 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -432,6 +432,17 @@ def driver_version(self): """ return self._GetDeviceAttr(self.device_type, self.device_id, 12) + def texture_spatial_limit(self): + """Returns limits for textures by spatial dimensions + + Returns + ------- + limit : int or None + Maximum size of the texture by spatial dimensions + + 
""" + return self._GetDeviceAttr(self.device_type, self.device_id, 12) + def create_raw_stream(self): """Create a new runtime stream at the context. diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py index cf915777ed0b..1be5425e702c 100644 --- a/python/tvm/relay/op/strategy/__init__.py +++ b/python/tvm/relay/op/strategy/__init__.py @@ -29,3 +29,4 @@ from . import rocm from . import intel_graphics from . import hexagon +from . import adreno diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py new file mode 100644 index 000000000000..a783440bb38c --- /dev/null +++ b/python/tvm/relay/op/strategy/adreno.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of adreno operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from tvm import topi +from .generic import * +from .. import op as _op + + +@conv2d_NCHWc_strategy.register("adreno") +@conv2d_strategy.register("adreno") +def conv2d_strategy_adreno(attrs, inputs, out_type, target): + """conv2d adreno strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + groups = attrs.groups + data_layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( + data_layout == "NCHW4c" and kernel_layout == "OIHW4o" + ): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc), + name="conv2d_nchwc.image2d", + plevel=10, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nchwc_acc32), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc_acc32), + name="conv2d_nchwc_tpack.image2d", + plevel=20, + ) + elif (data_layout == "NHWC" and kernel_layout == "HWIO") or ( + data_layout == "NHWC4c" and kernel_layout == "HWIO4o" + ): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc), + name="conv2d_nhwc.image2d", + plevel=10, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_acc32), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_acc32), + name="conv2d_nhwc_acc32.image2d", + plevel=20, + ) + else: + raise RuntimeError( + "Layout not supported: (" + + data_layout + + ", " + + kernel_layout + + ") - only support NCHW4c / OIHW4o and NHWC / HWOI layouts for conv2d" + ) + else: + # cannot use is_depthwise_conv2d because it does not know about 
NHWC4c/HWOI4o layouts + if data_layout == "NCHW": + ic = data.shape[1] + elif data_layout == "NCHW4c": + ic = data.shape[1] * data.shape[4] + elif data_layout == "NHWC": + ic = data.shape[3] + elif data_layout == "NHWC4c": + ic = data.shape[3] * data.shape[4] + else: + raise RuntimeError("Unsupported depthwise_conv2d data layout {}".format(data_layout)) + if kernel_layout == "OIHW": + oc = kernel.shape[0] + elif kernel_layout == "OIHW4o": + oc = kernel.shape[0] * kernel.shape[4] + elif kernel_layout == "HWOI": + oc = kernel.shape[2] + elif kernel_layout == "HWOI4o": + oc = kernel.shape[2] * kernel.shape[4] + else: + raise RuntimeError( + "Unsupported depthwise_conv2d kernel layout {}".format(kernel_layout) + ) + + if ic == oc == groups: + if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( + data_layout == "NCHW4c" and kernel_layout == "OIHW4o" + ): + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc), + name="depthwise_conv2d_nchwc.image2d", + plevel=10, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc_acc32), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc_acc32), + name="depthwise_conv2d_nchwc_acc32.image2d", + plevel=20, + ) + elif (data_layout == "NHWC" and kernel_layout == "HWOI") or ( + data_layout == "NHWC4c" and kernel_layout == "HWOI4o" + ): + if data.shape[-1] >= 4: + if out_type.dtype == "float16": + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.image2d", + plevel=10, + ) + strategy.add_implementation( + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc_acc32), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc_acc32), + name="depthwise_conv2d_nhwc_acc32.image2d", + plevel=20, + ) + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.cuda", + ) + else: + raise RuntimeError( + "Layout not supported: (" + + data_layout + + ", " + + kernel_layout + + ") - only support NCHW4c / OIHW4o and NHWC / HWOI layouts for conv2d" + ) + else: + raise RuntimeError("General group convolution is not currently supported") + return strategy + + +@schedule_pool.register("adreno") +def schedule_pool_adreno(attrs, outs, target): + """schedule pooling ops for adreno""" + with target: + if attrs.layout == "NCHW4c": + return topi.adreno.schedule_pool(outs, attrs.layout) + return topi.cuda.schedule_pool(outs, attrs.layout) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 03115612c5ce..4752095d37c8 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -814,6 +814,20 @@ def stm32(series="unknown", options=None): return Target(" ".join(["c"] + opts)) +def adreno(model="unknown", options=None): + """Returns a Qualcomm GPU target. + Parameters + ---------- + model: str + The model of this device + options : str or list of str + Additional options + """ + opts = ["-device=adreno", "-model=%s" % model] + opts = _merge_opts(opts, options) + return Target(" ".join(["opencl"] + opts)) + + def create(target): """Deprecated. Use the constructor of :py:mod:`tvm.target.Target` directly.""" warnings.warn("tvm.target.create() is being deprecated. 
Please use tvm.target.Target() instead") diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index cc6c8fcc3187..785ba395d25b 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -64,6 +64,7 @@ from . import hls from . import random from . import hexagon +from . import adreno # error reporting from .utils import InvalidShapeError diff --git a/python/tvm/topi/adreno/__init__.py b/python/tvm/topi/adreno/__init__.py new file mode 100644 index 000000000000..6c9b7463c1d4 --- /dev/null +++ b/python/tvm/topi/adreno/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=redefined-builtin, wildcard-import +"""Qualcomm Adreno GPU specific declaration and schedules.""" +from .conv2d_nchw import * +from .depthwise_conv2d_nchw import * +from .conv2d_nhwc import * +from .depthwise_conv2d_nhwc import * +from .pooling import * +from .conv2d_alter_op import * diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py new file mode 100644 index 000000000000..e8944093c0f5 --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_alter_op.py @@ -0,0 +1,211 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
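# A minimal usage sketch for the pieces added above (the adreno() target helper and the
# "adreno" strategy registrations).  The Android host triple and the helper name are
# illustrative assumptions, not part of this patch.
import tvm
from tvm import relay

def build_for_adreno(mod, params):
    # tvm.target.adreno() expands to "opencl -device=adreno -model=...", which is the
    # device key the @conv2d_strategy.register("adreno") implementations match on.
    target = tvm.target.Target("opencl -device=adreno", host="llvm -mtriple=arm64-linux-android")
    with tvm.transform.PassContext(opt_level=3):
        return relay.build(mod, target=target, params=params)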
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Conv2D alter op for Qualcomm Adreno GPU""" + +import logging + +import re +import tvm +from tvm import te +from tvm import relay +from tvm import autotvm +from ..utils import get_const_tuple +from ..nn import conv2d_alter_layout + +logger = logging.getLogger("topi") + +# Number of wildcards for matching of supported layouts to be transformed +_NCHWc_matcher = re.compile("^NCHW[0-9]+c$") +_OIHWo_matcher = re.compile("^OIHW[0-9]+o$") +_NHWCc_matcher = re.compile("^NHWC[0-9]+c$") +_HWIOo_matcher = re.compile("^HWIO[0-9]+o$") +_HWOIo_matcher = re.compile("^HWOI[0-9]+o$") + + +@conv2d_alter_layout.register("adreno") +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + """ + Prepare of the new conv2d with proper target blocked layout attributes + OpenCL Textures supports 1d/2d/3d/4d tetures but read happens always only for 4 elements + in a line. Thus way we are supporting for now only 4d conversions on the end + NCHW -> NCHW4c & OIHW ->OIHW4o + NHWC -> NHWC4c & HWIO -> HWIO4o & HWOI -> HWOI4o + """ + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + new_attrs = {k: attrs[k] for k in attrs.keys()} + + # Parse the attributes. + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor = tinfos + data_dtype = data_tensor.dtype + kernel_dtype = kernel_tensor.dtype + out_dtype = out_type.dtype + + if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): + cfg = dispatch_ctx.query(target, None) + workload = cfg.workload + else: + impl, outs = relay.backend.te_compiler.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target + ) + workload = autotvm.task.get_workload(outs) + if workload is None: + return None + + cfg = dispatch_ctx.query(target, workload) + + topi_tmpl = workload[0] + + if "conv2d_nchwc" in topi_tmpl: # covers both conv2d_nchwc and depthwise_conv2d_nchwc + if data_layout == "NCHW" and kernel_layout == "OIHW": + batch, in_channels, in_height, in_width = data_tensor.shape + out_channles, _, kernel_h, kernel_w = kernel_tensor.shape + in_channel_block = in_channels % 4 + if in_channel_block == 0: + in_channel_block = 4 + num_filter_block = out_channles % 4 + if num_filter_block == 0: + num_filter_block = 4 + + # no support yet for tensors that cannot be divisible by factor 4 + if in_channel_block != 4 or num_filter_block != 4: + return None + + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, in_filter_channel, kh, kw = get_const_tuple(kernel_tensor.shape) + + # update new attrs + new_attrs["channels"] = out_channel + new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + # (oc, ic, h, w) -> (OC, ic, h, w, oc) + new_attrs["kernel_layout"] = "OIHW%do" % num_filter_block + new_attrs["out_layout"] = "NCHW%dc" % num_filter_block + + # Store altered operator's config for applying of tuned AutoTVM statistics + new_data = te.placeholder( + (batch_size, in_channel // in_channel_block, height, width, in_channel_block), + dtype=data_dtype, + ) + new_kernel = te.placeholder( + (out_channel // num_filter_block, in_filter_channel, kh, kw, num_filter_block), + dtype=kernel_tensor.dtype, + ) + new_workload = autotvm.task.args_to_workload( + [ + new_data, + new_kernel, + strides, + padding, + dilation, + out_dtype, 
+ ], + topi_tmpl, # "conv2d_nchwc.image2d", + ) + dispatch_ctx.update(target, new_workload, cfg) + else: + assert _NCHWc_matcher.match(data_layout) + assert _OIHWo_matcher.match(kernel_layout) + return relay.nn.conv2d(*inputs, **new_attrs) + + if "conv2d_nhwc" in topi_tmpl: # covers both conv2d_nhwcc and depthwise_conv2d_nhwcc + if (data_layout == "NHWC" and kernel_layout == "HWIO") or ( + data_layout == "NHWC" and kernel_layout == "HWOI" + ): + if kernel_layout == "HWIO": + batch_size, in_height, in_width, in_channels = data_tensor.shape + kernel_h, kernel_w, in_filter_channel, out_channles = kernel_tensor.shape + else: + batch_size, in_height, in_width, in_channels = data_tensor.shape + kernel_h, kernel_w, out_channles, in_filter_channel = kernel_tensor.shape + in_channel_block = in_channels % 4 + if in_channel_block == 0: + in_channel_block = 4 + num_filter_block = out_channles % 4 + if num_filter_block == 0: + num_filter_block = 4 + + # no support yet for tensors cannot be divisible by factor 4 + if in_channel_block != 4 or num_filter_block != 4: + return None + + # update new attrs + new_attrs["channels"] = out_channles + new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + # (h, w, ic, oc) -> (h, w, ic, OC, oc) + if kernel_layout == "HWIO": + new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block + else: + new_attrs["kernel_layout"] = "HWOI%do" % num_filter_block + new_attrs["out_layout"] = "NHWC%dc" % num_filter_block + + # Store altered operator's config for applying of tuned AutoTVM statistics + new_data = te.placeholder( + ( + batch_size, + in_height, + in_width, + in_channels // in_channel_block, + in_channel_block, + ), + dtype=data_dtype, + ) + if kernel_layout == "HWIO": + new_kernel = te.placeholder( + ( + kernel_h, + kernel_w, + in_filter_channel, + out_channles // num_filter_block, + num_filter_block, + ), + dtype=kernel_tensor.dtype, + ) + else: + new_kernel = te.placeholder( + ( + kernel_h, + kernel_w, + out_channles // num_filter_block, + in_filter_channel, + num_filter_block, + ), + dtype=kernel_tensor.dtype, + ) + new_workload = autotvm.task.args_to_workload( + [ + new_data, + new_kernel, + strides, + padding, + dilation, + out_dtype, + ], + topi_tmpl, + ) + dispatch_ctx.update(target, new_workload, cfg) + else: + assert _NHWCc_matcher.match(data_layout) + assert _HWIOo_matcher.match(kernel_layout) or _HWOIo_matcher.match(kernel_layout) + return relay.nn.conv2d(*inputs, **new_attrs) + + return None diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py new file mode 100644 index 000000000000..96368b3e57c2 --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -0,0 +1,344 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
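# A small numpy sketch of the layout blocking that _alter_conv2d_layout above requests
# (NCHW -> NCHW4c and OIHW -> OIHW4o).  It only illustrates the data movement; the real
# conversion is emitted by Relay as layout_transform ops, and the function names here
# are hypothetical.
import numpy as np

def nchw_to_nchw4c(data):
    # (N, C, H, W) -> (N, C//4, H, W, 4); mirrors the "divisible by factor 4"
    # restriction checked in the alter-op pass above.
    n, c, h, w = data.shape
    assert c % 4 == 0
    return data.reshape(n, c // 4, 4, h, w).transpose(0, 1, 3, 4, 2)

def oihw_to_oihw4o(kernel):
    # (O, I, H, W) -> (O//4, I, H, W, 4)
    o, i, h, w = kernel.shape
    assert o % 4 == 0
    return kernel.reshape(o // 4, 4, i, h, w).transpose(0, 2, 3, 4, 1)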
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""conv2d nchw schedule on Qualcomm Adreno GPU""" +import tvm +from tvm import te +from tvm import autotvm + +from ..utils import get_const_tuple, traverse_inline +from .utils import ( + split_to_chunks, + pack_input, + pack_filter, + expand_spatial_dimensions, + add_pad, + bind_data_copy, +) + + +@autotvm.register_topi_compute("conv2d_nchwc.image2d") +def conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float16"} + return compute_conv2d_NCHWc_KCRSk( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_compute("conv2d_nchwc_acc32.image2d") +def conv2d_nchwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float32"} + return compute_conv2d_NCHWc_KCRSk( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_schedule("conv2d_nchwc.image2d") +def schedule_conv2d_nchwc(cfg, outs): + return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("conv2d_nchwc_acc32.image2d") +def schedule_conv2d_nchwc_acc32(cfg, outs): + return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") + + +def schedule_conv2d_nchwc_impl(cfg, outs, tag): + """Create the schedule for conv2d_nchw""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == tag: + schedule_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): + """ + Convolution operator in NCHWc layout. + Algo: + 1. Convert into blocked format if we have 4d original tensor. + In case of AutoTVM we override the convert by just tensors since such conversion + will be absent for real blocked convolution, no sense to include into tuning + 2. Expand spatial dimensions to have width and height be dividable by factor 4 + This leads to slightly bigger amount of compute but allow utilize GPU much better + 3. Add paddings. This happens even if we do not need pad originaly. This is useful + due to work arounding of the gaps of texture annotation between Primary Functions + and limited support of textures in schedules. Later on this pad will be executed + separately and will produce texture + 4. 5d Convolution compute with accumulating into out_dtype + 5. Cast to the origin output data type + 6. 
For case of 4d convolution: convert of output from 5d to 4d + """ + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + convert_from4d = False + if len(Input.shape) == 4: + batch, in_channels, in_height, in_width = Input.shape + out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape + + in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block) + Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") + kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True + Input = pack_input( + Input, + "NCHW", + batch, + in_channel_chunks, + in_channel_block, + in_channel_tail, + in_height, + in_width, + ) + Filter = pack_filter( + Filter, + "OIHW", + out_channel_chunks, + out_channel_block, + out_channel_tail, + in_filter_channels, + in_channel_chunks, + in_channel_block, + in_channel_tail, + kernel_h, + kernel_w, + ) + + else: + batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape + out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape + + out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( + in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w + ) + + temp = add_pad( + Input, + "NCHW", + out_height_orig, + out_width_orig, + kernel_h, + kernel_w, + dilation_h, + dilation_w, + padding, + stride_h, + stride_w, + ) + + rcc = te.reduce_axis((0, in_channel_chunks), name="rc") + rcb = te.reduce_axis((0, in_channel_block), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + conv = te.compute( + (batch, out_channel_chunks, out_height, out_width, out_channel_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + ( + temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] + * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb] + ).astype(args["accumulator"]), + axis=[rcc, rcb, ry, rx], + ), + tag="conv2d_nchwc", + ) + + if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: + dummy_cast = te.compute( + (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), + lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype), + tag="dummy_cast", + ) + return te.compute( + (batch, out_channles, out_height_orig, out_width_orig), + lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + return te.compute( + (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), + lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), + tag="cast_from_acc" + args["accumulator"][-2:], + ) + + +def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): + """ + schedule optimized for batch size = 1 + + Algo: + 1. 
Split output axis to three parts: global work size, vthread, local worksize. + The limitations for tuning includes heuristics from some tuned networks to limit + search space and not pay much time for useles configurations. + 2. In case of 4d convolution schedule copying of the input (and filter) into + 5d tensors + 4. pad should be scheduled separately to create independent opencl kernel. If pad is + inlined into convolution, this gives 1.5x performance drop + 5. We are using cache_read to produce texture and guarantee the best performance + on the next stage. + 6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize + for textures + For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion + of data type + 7. In case of 4d conv we need to schedule postops as well + """ + latest = s.outputs[0].output(0) + if len(latest.op.axis) == 4: + latest_blocked = dummy = output.op.input_tensors[0] + conv = dummy.op.input_tensors[0] + else: + conv = output.op.input_tensors[0] + latest_blocked = latest + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + rcc, rcb, ry, rx = s[conv].op.reduce_axis + + if conv.shape[1] % 2 == 0: + min_threads_div = 2 + else: + min_threads_div = 1 + cfg.define_split( + "tile_fc", + fc, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 + and entity.size[2] >= min_threads_div + and entity.size[2] < 256, + ) + cfg.define_split( + "tile_y", + y, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16, + ) + cfg.define_split( + "tile_x", + x, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16, + ) + + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + cfg.define_knob("unroll_explicit", [0, 1]) + + ##### space definition end ##### + + pad_data, kernel = s[conv].op.input_tensors + if ( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ): # len(latest.op.axis) == 4: + # manage scheduling of datacopy + pad_data, kernel = s[conv].op.input_tensors + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + bind_data_copy(s[kernel]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [conv]) + bind_data_copy(s[AT]) + WT = s.cache_read(kernel, "global.texture-weight", [conv]) + bind_data_copy(s[WT]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[latest_blocked].op.axis + + kernel_scope, n = s[latest_blocked].split(n, nparts=1) + + bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc) + by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y) + bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x) + + bf = s[latest_blocked].fuse(n, bf) + s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z")) + s[latest_blocked].bind(by, te.thread_axis("blockIdx.y")) + s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x")) + s[latest_blocked].bind(vf, te.thread_axis("vthread")) + s[latest_blocked].bind(vy, te.thread_axis("vthread")) + s[latest_blocked].bind(vx, te.thread_axis("vthread")) + s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z")) + s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y")) + s[latest_blocked].bind(tx, 
te.thread_axis("threadIdx.x")) + s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb) + s[latest_blocked].vectorize(fb) + + s[conv].compute_at(s[latest_blocked], tx) + + # tile reduction axes + n, fc, y, x, fb = s[conv].op.axis + + rcc, rcb, ry, rx = s[conv].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, conv, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[conv].vectorize(fb) + s[conv].unroll(rcb) + + # unroll + s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + + if latest_blocked != latest: + s[latest].compute_root() + bind_data_copy(s[latest], 1) + if latest != output: + s[output].compute_inline() + + N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape) + _, IC, KH, KW, _ = get_const_tuple(kernel.shape) + ICKHKW = IC * KH * KW + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py new file mode 100644 index 000000000000..d40f813fdb0f --- /dev/null +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -0,0 +1,339 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""conv2d nhwc schedule on Qualcomm Adreno GPU""" +import tvm +from tvm import te +from tvm import autotvm + +from ..utils import get_const_tuple, traverse_inline +from .utils import ( + split_to_chunks, + pack_input, + pack_filter, + expand_spatial_dimensions, + add_pad, + bind_data_copy, + get_texture_storage, +) + + +@autotvm.register_topi_compute("conv2d_nhwc.image2d") +def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float16"} + return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) + + +@autotvm.register_topi_compute("conv2d_nhwc_acc32.image2d") +def conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float32"} + return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) + + +@autotvm.register_topi_schedule("conv2d_nhwc.image2d") +def schedule_conv2d_nhwc(cfg, outs): + return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("conv2d_nhwc_acc32.image2d") +def schedule_conv2d_nhwc_acc32(cfg, outs): + return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") + + +def schedule_conv2d_nhwc_impl(cfg, outs, tag): + """Create the schedule for conv2d_nhwc""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == tag: + schedule_conv2d_NHWC(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype, args): + """ + Convolution operator in NHWC layout. + Algo: + 1. Convert into blocked format if we have 4d original tensor. + In case of AutoTVM we override the convert by just tensors since such conversion + will be absent for real blocked convolution, no sense to include into tuning + 2. Expand spatial dimensions to have width and height be dividable by factor 4 + This leads to slightly bigger amount of compute but allow utilize GPU much better + 3. Add paddings. This happens even if we do not need pad originaly. This is useful + due to work arounding of the gaps of texture annotation between Primary Functions + and limited support of textures in schedules. Later on this pad will be executed + separately and will produce texture + 4. 5d Convolution compute with accumulating into out_dtype + 5. Cast to the origin output data type + 6. 
For case of 4d convolution: convert of output from 5d to 4d + """ + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + convert_from4d = False + if len(Input.shape) == 4: + batch, in_height, in_width, in_channels = Input.shape + kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape + + in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block) + Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") + kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True + Input = pack_input( + Input, + "NHWC", + batch, + in_channel_chunks, + in_channel_block, + in_channel_tail, + in_height, + in_width, + ) + Filter = pack_filter( + Filter, + "HWIO", + out_channel_chunks, + out_channel_block, + out_channel_tail, + in_filter_channels, + in_channel_chunks, + in_channel_block, + in_channel_tail, + kernel_h, + kernel_w, + ) + + else: + batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape + kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block = Filter.shape + + out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( + in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w + ) + + temp = add_pad( + Input, + "NHWC", + out_height_orig, + out_width_orig, + kernel_h, + kernel_w, + dilation_h, + dilation_w, + padding, + stride_h, + stride_w, + ) + + rcc = te.reduce_axis((0, in_channel_chunks), name="rcc") + rcb = te.reduce_axis((0, in_channel_block), name="rcb") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + conv = te.compute( + (batch, out_height, out_width, out_channel_chunks, out_channel_block), + lambda nn, yy, xx, fc, fb: te.sum( + ( + temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcc, rcb] + * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb] + ).astype(args["accumulator"]), + axis=[ry, rx, rcc, rcb], + ), + tag="conv2d_nhwc", + ) + + if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: + dummy_cast = te.compute( + (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), + lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype), + tag="dummy_cast", + ) + return te.compute( + (batch, out_height_orig, out_width_orig, out_channles), + lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + return te.compute( + (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), + lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), + tag="cast_from_acc" + args["accumulator"][-2:], + ) + + +def schedule_conv2d_NHWC(cfg, s, output): + """ + schedule optimized for batch size = 1 + + Algo: + 1. 
Split output axis to three parts: global work size, vthread, local worksize. + The limitations for tuning includes heuristics from some tuned networks to limit + search space and not pay much time for useles configurations. + 2. In case of 4d convolution schedule copying of the input (and filter) into + 5d tensors + 4. pad should be scheduled separately to create independent opencl kernel. If pad is + inlined into convolution, this gives 1.5x performance drop + 5. We are using cache_read to produce texture and guarantee the best performance + on the next stage. + 6. For 5d convolution we schedule the latest op with binding 5d axis and vectorize + for textures + For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion + of data type + 7. In case of 4d conv we need to schedule postops as well + """ + latest = s.outputs[0].output(0) + if len(latest.op.axis) == 4: + latest_blocked = dummy = output.op.input_tensors[0] + conv = dummy.op.input_tensors[0] + else: + conv = output.op.input_tensors[0] + latest_blocked = latest + + ##### space definition begin ##### + n, y, x, fc, fb = s[conv].op.axis + ry, rx, rcc, rcb = s[conv].op.reduce_axis + + if conv.shape[3] % 2 == 0: + min_threads_div = 2 + else: + min_threads_div = 1 + + cfg.define_split( + "tile_fc", + fc, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 + and entity.size[2] >= min_threads_div + and entity.size[2] < 256, + ) + cfg.define_split( + "tile_y", + y, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16, + ) + cfg.define_split( + "tile_x", + x, + num_outputs=3, + filter=lambda entity: entity.size[1] <= 8 and entity.size[2] <= 16, + ) + + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + cfg.define_knob("unroll_explicit", [0, 1]) + + pad_data, kernel = s[conv].op.input_tensors + if ( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ): # len(latest.op.axis) == 4: + # manage scheduling of datacopy + pad_data, kernel = s[conv].op.input_tensors + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + bind_data_copy(s[kernel]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() + + # create cache stage + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) + + # tile and bind spatial axes + n, y, x, fc, fb = s[latest_blocked].op.axis + + kernel_scope, n = s[latest_blocked].split(n, nparts=1) + + bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc) + by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y) + bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x) + + by = s[latest_blocked].fuse(n, by) + s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z")) + s[latest_blocked].bind(by, te.thread_axis("blockIdx.y")) + s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x")) + s[latest_blocked].bind(vf, te.thread_axis("vthread")) + s[latest_blocked].bind(vy, te.thread_axis("vthread")) + s[latest_blocked].bind(vx, te.thread_axis("vthread")) + s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z")) + s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y")) + s[latest_blocked].bind(tx, 
te.thread_axis("threadIdx.x")) + s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb) + s[latest_blocked].vectorize(fb) + + s[conv].compute_at(s[latest_blocked], tx) + + # tile reduction axes + n, y, x, fc, fb = s[conv].op.axis + + ry, rx, rcc, rcb = s[conv].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, conv, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[conv].vectorize(fb) + s[conv].unroll(rcb) + + # unroll + s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + + if latest_blocked != latest: + s[latest].compute_root() + bind_data_copy(s[latest], 1) + if latest != output: + s[output].compute_inline() + + N, OH, OW, OCC, OCB = get_const_tuple(latest_blocked.shape) + KH, KW, IC, _, _ = get_const_tuple(kernel.shape) + ICKHKW = IC * KH * KW + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py new file mode 100644 index 000000000000..298bd11e00a7 --- /dev/null +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -0,0 +1,316 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""depthwise_conv2d_nchw(c) schedule on Qualcomm Adreno GPU""" +import tvm +from tvm import te +from tvm import autotvm + +from ..utils import get_const_tuple, traverse_inline +from .utils import ( + split_to_chunks, + pack_input, + pack_filter, + expand_spatial_dimensions, + add_pad, + bind_data_copy, +) + + +@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d") +def depthwise_conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute depthwise_conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float16"} + return compute_depthwise_conv2d_NCHWc_KCRSk( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_compute("depthwise_conv2d_nchwc_acc32.image2d") +def depthwise_conv2d_nchwc_acc32( + cfg, data, kernel, strides, padding, dilation, out_dtype="float16" +): + """Compute depthwise_conv2d with NCHWc layout""" + args = {"shared": False, "accumulator": "float32"} + return compute_depthwise_conv2d_NCHWc_KCRSk( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_schedule("depthwise_conv2d_nchwc.image2d") +def schedule_depthwise_conv2d_nchwc(cfg, outs): + return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("depthwise_conv2d_nchwc_acc32.image2d") +def schedule_depthwise_conv2d_nchwc_acc32(cfg, outs): + return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") + + +def schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag): + """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == tag: + schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): + """ + Depthwise convolution operator in NCHWc layout. + Algo: + 1. Convert into blocked format if we have 4d original tensor. + In case of AutoTVM we override the convert by just tensors since such conversion + will be absent for real blocked convolution, no sense to include into tuning + 2. Expand spatial dimensions to have width and height be dividable by factor 4 + This leads to slightly bigger amount of compute but allow utilize GPU much better + 3. Add paddings. This happens even if we do not need pad originaly. This is useful + due to work arounding of the gaps of texture annotation between Primary Functions + and limited support of textures in schedules. Later on this pad will be executed + separately and will produce texture + 4. 5d Convolution compute with accumulating into out_dtype + 5. Cast to the origin output data type + 6. 
For case of 4d convolution: convert of output from 5d to 4d + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + convert_from4d = False + if len(Input.shape) == 4: + batch, in_channels, in_height, in_width = Input.shape + out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape + + in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block) + Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") + kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True + Input = pack_input( + Input, + "NCHW", + batch, + in_channel_chunks, + in_channel_block, + in_channel_tail, + in_height, + in_width, + ) + Filter = pack_filter( + Filter, + "OIHW", + out_channel_chunks, + out_channel_block, + out_channel_tail, + in_filter_channels, + in_channel_chunks, + in_channel_block, + in_channel_tail, + kernel_h, + kernel_w, + ) + + else: + batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape + out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape + + out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( + in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w + ) + + temp = add_pad( + Input, + "NCHW", + out_height_orig, + out_width_orig, + kernel_h, + kernel_w, + dilation_h, + dilation_w, + padding, + stride_h, + stride_w, + ) + + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + conv = te.compute( + (batch, out_channel_chunks, out_height, out_width, out_channel_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + ( + temp[ + nn, + ffc // in_filter_channels, + yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, + ffb, + ] + * Filter[ffc // in_filter_channels, ffc % in_filter_channels, ry, rx, ffb] + ).astype(args["accumulator"]), + axis=[ry, rx], + ), + tag="depthwise_conv2d_nchwc_kcrsk", + ) + + if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: + dummy_cast = te.compute( + (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), + lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype), + tag="dummy_cast", + ) + return te.compute( + (batch, out_channles, out_height_orig, out_width_orig), + lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + return te.compute( + (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), + lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), + tag="cast_from_acc" + args["accumulator"][-2:], + ) + + +def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): + """ + schedule optimized for batch size = 1 + + Algo: + 1. Split output axis to three parts: global work size, vthread, local worksize. 
+ The limitations for tuning includes heuristics from some tuned networks to limit + search space and not pay much time for useles configurations. + 2. For depthwise convolution it's better to inline pad into the conv2d compute, the + divergence in opencl kernel will not so significant as for regular conv2d. + 3. For 5d convolution we schedule the latest op with binding 5d axis and vectorize + for textures + For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion + of data type + 4. In case of 4d conv we need to schedule postops as well + """ + latest = s.outputs[0].output(0) + if len(latest.op.axis) == 4: + latest_blocked = dummy = output.op.input_tensors[0] + conv = dummy.op.input_tensors[0] + else: + conv = output.op.input_tensors[0] + latest_blocked = latest + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=3) + cfg.define_split("tile_y", y, num_outputs=3) + cfg.define_split("tile_x", x, num_outputs=3) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + cfg.define_knob("unroll_explicit", [0, 1]) + ##### space definition end ##### + + pad_data, kernel = s[conv].op.input_tensors + if ( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ): # len(latest.op.axis) == 4: + # manage scheduling of datacopy + pad_data, kernel = s[conv].op.input_tensors + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + bind_data_copy(s[kernel]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [conv]) + WT = s.cache_read(kernel, "global.texture-weight", [conv]) + bind_data_copy(s[AT]) + bind_data_copy(s[WT]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[latest_blocked].op.axis + kernel_scope, n = s[latest_blocked].split(n, nparts=1) + + bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc) + by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y) + bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x) + + bf = s[latest_blocked].fuse(n, bf) + s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z")) + s[latest_blocked].bind(by, te.thread_axis("blockIdx.y")) + s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x")) + s[latest_blocked].bind(vf, te.thread_axis("vthread")) + s[latest_blocked].bind(vy, te.thread_axis("vthread")) + s[latest_blocked].bind(vx, te.thread_axis("vthread")) + s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z")) + s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y")) + s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x")) + s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb) + s[latest_blocked].vectorize(fb) + + s[conv].compute_at(s[latest_blocked], tx) + + # tile reduction axes + n, fc, y, x, fb = s[conv].op.axis + + ry, rx = s[conv].op.reduce_axis + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) + s[conv].vectorize(fb) + + # unroll + s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + if latest_blocked != latest: + 
s[latest].compute_root() + bind_data_copy(s[latest], 1) + if latest != output: + s[output].compute_inline() + + N, OCC, OH, OW, OCB = get_const_tuple(latest_blocked.shape) + _, _, KH, KW, ICB = get_const_tuple(kernel.shape) + KHKW = KH * KW + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW * ICB) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py new file mode 100644 index 000000000000..b8a978d3c204 --- /dev/null +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -0,0 +1,311 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""depthwise_conv2d_nhwc(c) schedule on Qualcomm Adreno GPU""" +import tvm +from tvm import te +from tvm import autotvm + +from ..utils import get_const_tuple, traverse_inline +from .utils import ( + split_to_chunks, + pack_input, + pack_filter, + expand_spatial_dimensions, + add_pad, + bind_data_copy, + get_texture_storage, +) + + +@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d") +def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute depthwise_conv2d with NHWC layout""" + args = {"shared": False, "accumulator": "float16"} + return compute_depthwise_conv2d_NHWC_HWOI( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_compute("depthwise_conv2d_nhwc_acc32.image2d") +def depthwise_conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): + """Compute depthwise_conv2d with NHWC layout""" + args = {"shared": False, "accumulator": "float32"} + return compute_depthwise_conv2d_NHWC_HWOI( + data, kernel, strides, padding, dilation, out_dtype, args=args + ) + + +@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.image2d") +def schedule_depthwise_conv2d_nhwc(cfg, outs): + return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") + + +@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_acc32.image2d") +def schedule_depthwise_conv2d_nhwc_acc32(cfg, outs): + return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") + + +def schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag): + """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == tag: + schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, out_dtype, args): + """ + Depthwise convolution operator in NCHWc layout. + Algo: + 1. 
Convert into blocked format if we have 4d original tensor. + In case of AutoTVM we override the convert by just tensors since such conversion + will be absent for real blocked convolution, no sense to include into tuning + 2. Expand spatial dimensions to have width and height be dividable by factor 4 + This leads to slightly bigger amount of compute but allow utilize GPU much better + 3. Add paddings. This happens even if we do not need pad originaly. This is useful + due to work arounding of the gaps of texture annotation between Primary Functions + and limited support of textures in schedules. Later on this pad will be executed + separately and will produce texture + 4. 5d Convolution compute with accumulating into out_dtype + 5. Cast to the origin output data type + 6. For case of 4d convolution: convert of output from 5d to 4d + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + convert_from4d = False + if len(Input.shape) == 4: + batch, in_height, in_width, in_channels = Input.shape + kernel_h, kernel_w, out_channles, in_filter_channels = Filter.shape + + in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block) + Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") + kshape = (kernel_h, kernel_w, out_channel_block, in_filter_channels, out_channel_chunks) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True + Input = pack_input( + Input, + "NHWC", + batch, + in_channel_chunks, + in_channel_block, + in_channel_tail, + in_height, + in_width, + ) + Filter = pack_filter( + Filter, + "HWOI", + out_channel_chunks, + out_channel_block, + out_channel_tail, + in_filter_channels, + in_channel_chunks, + in_channel_block, + in_channel_tail, + kernel_h, + kernel_w, + ) + + else: + batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape + kernel_h, kernel_w, out_channel_chunks, in_filter_channels, out_channel_block = Filter.shape + + out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( + in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w + ) + + temp = add_pad( + Input, + "NHWC", + out_height_orig, + out_width_orig, + kernel_h, + kernel_w, + dilation_h, + dilation_w, + padding, + stride_h, + stride_w, + ) + + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + conv = te.compute( + (batch, out_height, out_width, out_channel_chunks, out_channel_block), + lambda nn, yy, xx, ffc, ffb: te.sum( + ( + temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffc, ffb] + * Filter[ry, rx, ffc, 0, ffb] + ).astype(args["accumulator"]), + axis=[ry, rx], + ), + tag="depthwise_conv2d_nhwc", + ) + + if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: + dummy_cast = te.compute( + (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), + lambda n, y, x, fc, fb: conv[n, y, x, fc, 
fb].astype(out_dtype), + tag="dummy_cast", + ) + return te.compute( + (batch, out_height_orig, out_width_orig, out_channles), + lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], + tag="cast_from_acc" + args["accumulator"][-2:], + ) + else: + return te.compute( + (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), + lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), + tag="cast_from_acc" + args["accumulator"][-2:], + ) + + +def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): + """ + schedule optimized for batch size = 1 + + Algo: + 1. Split output axis to three parts: global work size, vthread, local worksize. + The limitations for tuning includes heuristics from some tuned networks to limit + search space and not pay much time for useles configurations. + 2. In case of 4d convolution schedule copying of the input (and filter) into + 5d tensors + 3. For depthwise convolution it's better to inline pad into the conv2d compute, the + divergence in opencl kernel will not so significant as for regular conv2d. + 4. For 5d convolution we schedule the latest op with binding 5d axis and vectorize + for textures + For 4d tensor we are doing the same for the latest blocked stage, i.e. conversion + of data type + 5. In case of 4d conv we need to schedule postops as well + """ + latest = s.outputs[0].output(0) + if len(latest.op.axis) == 4: + latest_blocked = dummy = output.op.input_tensors[0] + conv = dummy.op.input_tensors[0] + else: + conv = output.op.input_tensors[0] + latest_blocked = latest + + ##### space definition begin ##### + n, y, x, fc, fb = s[conv].op.axis + ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=3) + cfg.define_split("tile_y", y, num_outputs=3) + cfg.define_split("tile_x", x, num_outputs=3) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + cfg.define_knob("unroll_explicit", [0, 1]) + ##### space definition end ##### + + pad_data, kernel = s[conv].op.input_tensors + if ( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ): # len(latest.op.axis) == 4: + # manage scheduling of datacopy + pad_data, kernel = s[conv].op.input_tensors + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + bind_data_copy(s[kernel]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() + + # create cache stage + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[AT]) + bind_data_copy(s[WT]) + + # tile and bind spatial axes + n, y, x, fc, fb = s[latest_blocked].op.axis + kernel_scope, n = s[latest_blocked].split(n, nparts=1) + + bf, vf, tf = cfg["tile_fc"].apply(s, latest_blocked, fc) + by, vy, ty = cfg["tile_y"].apply(s, latest_blocked, y) + bx, vx, tx = cfg["tile_x"].apply(s, latest_blocked, x) + + by = s[latest_blocked].fuse(n, by) + s[latest_blocked].bind(bf, te.thread_axis("blockIdx.z")) + s[latest_blocked].bind(by, te.thread_axis("blockIdx.y")) + s[latest_blocked].bind(bx, te.thread_axis("blockIdx.x")) + s[latest_blocked].bind(vf, te.thread_axis("vthread")) + s[latest_blocked].bind(vy, te.thread_axis("vthread")) + s[latest_blocked].bind(vx, te.thread_axis("vthread")) + 
s[latest_blocked].bind(tf, te.thread_axis("threadIdx.z")) + s[latest_blocked].bind(ty, te.thread_axis("threadIdx.y")) + s[latest_blocked].bind(tx, te.thread_axis("threadIdx.x")) + s[latest_blocked].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fb) + s[latest_blocked].vectorize(fb) + + s[conv].compute_at(s[latest_blocked], tx) + + # tile reduction axes + n, y, x, fc, fb = s[conv].op.axis + + ry, rx = s[conv].op.reduce_axis + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) + s[conv].vectorize(fb) + + # unroll + s[latest_blocked].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[latest_blocked].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + if latest_blocked != latest: + s[latest].compute_root() + bind_data_copy(s[latest], 1) + if latest != output: + s[output].compute_inline() + + N, OH, OW, OCC, OCB = get_const_tuple(latest_blocked.shape) + KH, KW, _, _, _ = get_const_tuple(kernel.shape) + KHKW = KH * KW + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) diff --git a/python/tvm/topi/adreno/pooling.py b/python/tvm/topi/adreno/pooling.py new file mode 100644 index 000000000000..49f103c04a2f --- /dev/null +++ b/python/tvm/topi/adreno/pooling.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""pooling schedules for Qualcomm Adreno GPU""" +import tvm +from tvm import te +from .. import tag + + +def schedule_pool(outs, layout): + """Schedule for various pooling operators. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of pool + in the format of an array of tensors. + + layout: str + Data layout. + + Returns + ------- + s: Schedule + The computation schedule for pool. 
+ """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _schedule(PaddedInput, Pool): + if isinstance(PaddedInput.op, tvm.te.ComputeOp): + s[PaddedInput].compute_inline() + num_thread = tvm.target.Target.current(allow_none=False).max_num_threads + num_thread = int(num_thread * 2) + if Pool.op in s.outputs: + Out = Pool + OL = s.cache_write(Pool, "local") + else: + Out = outs[0].op.output(0) + s[Pool].set_scope("local") + fused = s[Out].fuse(*s[Out].op.axis[:-1]) + bx, tx = s[Out].split(fused, factor=num_thread) + s[Out].bind(bx, te.thread_axis("blockIdx.x")) + s[Out].bind(tx, te.thread_axis("threadIdx.x")) + s[Out].vectorize(s[Out].op.axis[-1]) + if Pool.op in s.outputs: + s[OL].compute_at(s[Out], tx) + s[OL].vectorize(s[OL].op.axis[-1]) + else: + s[Pool].compute_at(s[Out], tx) + s[Pool].vectorize(s[Pool].op.axis[-1]) + + scheduled_ops = [] + + def traverse(OP): + """Internal traverse function""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + # schedule pool + elif OP.tag.startswith("pool"): + PaddedInput = OP.input_tensors[0] + Pool = OP.output(0) + _schedule(PaddedInput, Pool) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + scheduled_ops.append(OP) + + traverse(outs[0].op) + return s diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py new file mode 100644 index 000000000000..727741c11fd3 --- /dev/null +++ b/python/tvm/topi/adreno/utils.py @@ -0,0 +1,549 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return +"""util functions to be reused in different compute/schedule on Qualcomm Adreno GPU""" + +import tvm +import numpy +from tvm import te +from tvm.topi.utils import simplify +from tvm.topi import nn +from ..utils import get_const_tuple + + +def get_div(value, start): + """Returns the maximum divider for `value` starting from `start` value""" + div = 1 + for d in range(start, 0, -1): + if (value % d) == 0: + div = d + break + return div + + +def split_to_chunks(extent, block): + """ + Splits the trip count value to chunks and block, returns the remainder as well + the chunks and blocks covers or overlaps the origin value + + If extent can be divisible by block: + extent = chunks * block + else + extent = (chunks - 1) * block + tail + + Parameters + ---------- + extent: int + tripcount for original compute + + block: int + size of the block + + Returns + ---------- + out: tuple of the (chunks, block, tail) + chunks = ceildiv(extent, block) + tail = number of origin elements in the latest chunk + """ + tail = extent % block + chunks = extent // block + if tail == 0: + tail = block + else: + chunks += 1 + return chunks, block, tail + + +def pack_input(Input, layout, batch, chunks, block, original_tail, in_height, in_width): + """ + Adds compute stages for packing of the data in runtime. Extends channel dimensions + to be dividable by factor 4 + + This function should be substituted by Schedule.transform_layout() in the future: see + https://github.com/apache/tvm-rfcs/blob/main/rfcs/0039-buffer-physical-layout.md + + Parameters + ---------- + Input: tvm.te.Tensor + Input tensor to be repacked in runtime + + layout: string + Layout of origin 4d tensor + NCHW or NHWC are acceptable + + batch: int + Batch size + + chunks: int + Number of channel chunks been in the final tensor + + block: int + size of the channel block + + original_tail: int + Tail in the latest chunk diffing original number of channels vs blocked one + If original_tail != block: + original_channels = chunks * block - original_tail + else + original_channels = chunks * block + + in_height: int + Height of the feature map + + in_width: int + Width of the feature map + """ + + pad_value = tvm.tir.const(0, Input.dtype) + + def _reorder_data_nchw(*indices): + condition = [] + condition.append(indices[1] == chunks - 1) + condition.append(indices[4] >= original_tail) + condition = tvm.tir.all(*condition) + return tvm.tir.if_then_else( + condition, + pad_value, + Input[indices[0], indices[1] * block + indices[4], indices[2], indices[3]], + ) + + def _reorder_data_nhwc(*indices): + condition = [] + condition.append(indices[3] == chunks - 1) + condition.append(indices[4] >= original_tail) + condition = tvm.tir.all(*condition) + return tvm.tir.if_then_else( + condition, + pad_value, + Input[indices[0], indices[1], indices[2], indices[3] * block + indices[4]], + ) + + # compute: + if layout == "NCHW": + reordered_data = te.compute( + [batch, chunks, in_height, in_width, block], + _reorder_data_nchw, + name="input_pack", + tag="input_pack", + ) + elif layout == "NHWC": + reordered_data = te.compute( + [batch, in_height, in_width, chunks, block], + _reorder_data_nhwc, + name="input_pack", + tag="input_pack", + ) + else: + assert False, "Adreno util function pack_input does not accept unknown layout" + return reordered_data + + +def pack_filter( + Filter, + layout, + out_chunks, + out_block, + out_original_tail, + in_filter_channels, + in_chunks, + in_block, + 
in_original_tail, + kernel_h, + kernel_w, +): + """ + Adds compute stages for packing of the filter in runtime. Extends channels dimensions + to be dividable by factor 4 + + This function should be substituted by Schedule.transform_layout() in the future: see + https://github.com/apache/tvm-rfcs/blob/main/rfcs/0039-buffer-physical-layout.md + + Parameters + ---------- + Filter: tvm.te.Tensor + Filter tensor to be repacked in runtime + + layout: string + Layout of origin 4d tensor + NCHW or NHWC are acceptable + + out_chunks: int + Number of chunks for filters + + out_block: int + Size of the block for output channels + + out_original_tail: int + Original size of the latest chunk of output filters + + in_filter_channels: int + Number of filter channels. might be different vs input channels in the + data due to groups/depthwise nature + + in_chunks: int + Number of input data channel chunks + + in_block: int + Size of the block for input data channels + + in_original_tail + Original size of the latest chunk for input data channels + + kernel_h: int + Height of the conv2d kernel + + kernel_w: int + Width of the conv2d kernel + """ + pad_value = tvm.tir.const(0, Filter.dtype) + + def _reorder_weights_depthwise_oihw(*indices): + conditionA = [] + conditionA.append(indices[0] == out_chunks - 1) + conditionA.append(indices[4] >= out_original_tail) + conditionAT = tvm.tir.all(*conditionA) + + return tvm.tir.if_then_else( + conditionAT, + pad_value, + Filter[indices[0] * out_block + indices[4], indices[1], indices[2], indices[3]], + ) + + def _reorder_weights_depthwise_hwoi(*indices): + conditionA = [] + conditionA.append(indices[2] == out_chunks - 1) + conditionA.append(indices[4] >= out_original_tail) + conditionAT = tvm.tir.all(*conditionA) + + return tvm.tir.if_then_else( + conditionAT, + pad_value, + Filter[indices[0], indices[1], indices[2] * out_block + indices[4], indices[3]], + ) + + def _reorder_weights_oihw(*indices): + conditionA = [] + conditionA.append(indices[0] == out_chunks - 1) + conditionA.append(indices[4] >= out_original_tail) + conditionAT = tvm.tir.all(*conditionA) + + conditionO = [] + conditionO.append(conditionAT) + conditionO.append(indices[1] >= in_chunks * in_block + in_original_tail) + conditionOT = tvm.tir.any(*conditionO) + return tvm.tir.if_then_else( + conditionOT, + pad_value, + Filter[indices[0] * out_block + indices[4], indices[1], indices[2], indices[3]], + ) + + def _reorder_weights_hwio(*indices): + conditionA = [] + conditionA.append(indices[3] == out_chunks - 1) + conditionA.append(indices[4] >= out_original_tail) + conditionAT = tvm.tir.all(*conditionA) + + conditionO = [] + conditionO.append(conditionAT) + conditionO.append(indices[2] >= in_chunks * in_block + in_original_tail) + conditionOT = tvm.tir.any(*conditionO) + return tvm.tir.if_then_else( + conditionOT, + pad_value, + Filter[indices[0], indices[1], indices[2], indices[3] * out_block + indices[4]], + ) + + if in_filter_channels == 1: + if layout == "OIHW": + reordered_filter = te.compute( + [out_chunks, in_filter_channels, kernel_h, kernel_w, out_block], + _reorder_weights_depthwise_oihw, + name="filter_pack", + tag="filter_pack", + ) + elif layout == "HWOI": + reordered_filter = te.compute( + [kernel_h, kernel_w, out_chunks, in_filter_channels, out_block], + _reorder_weights_depthwise_hwoi, + name="filter_pack", + tag="filter_pack", + ) + else: + assert False, "Adreno util function def pack_filter does not accept unknown layout" + else: + if layout == "OIHW": + reordered_filter = te.compute( + 
[out_chunks, in_filter_channels, kernel_h, kernel_w, out_block], + _reorder_weights_oihw, + name="filter_pack", + tag="filter_pack", + ) + elif layout == "HWIO": + reordered_filter = te.compute( + [kernel_h, kernel_w, in_filter_channels, out_chunks, out_block], + _reorder_weights_hwio, + name="filter_pack", + tag="filter_pack", + ) + else: + assert False, "Adreno util function def pack_filter does not accept unknown layout" + return reordered_filter + + +def expand_spatial_dimensions( + in_height, in_width, kernel_h, kernel_w, dilation_h, dilation_w, padding, stride_h, stride_w +): + """ + Expands spatial dimensions to be dividable by factor 4. This will allow us to do extrimely + better parallel computation on GPU. The drawback of this solution - it will be number of + useless computations. By fact the speed-up of parallelism significantly overcomes the slowdown + of extra compute and eventuially this is useful approach, at least for GPU + + Parameters + ---------- + in_height: int + Height of the feature map + + in_width: int + Width of the feature map + + kernel_h: int + Height of the conv2d kernel + + kernel_w: int + Width of the conv2d kernel + + dilation_h: int + Vertical dilation of the conv2d kernel + + dilation_w: int + Horizontal dilation of the conv2d kernel + + padding: tuple or list + Conv2d paddings + + stride_h: int + Vertical stride of the conv2d kernel + + stride_w: int + Horizontal stride of the conv2d kernel + """ + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + out_height_orig = out_height = simplify( + (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + ) + out_width_orig = out_width = simplify( + (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 + ) + + # can output shape be divded by 2 or even 4? + # if it cannot be divided, need to extend for further help with split + # theortically there should be addition padding for inputs, but it will be optimized by + # cache_read InferBound. We must proceed pad here exactly to produce tensor which is + # required for calculation of original out size, not more! In other case intermediate + # tensor might be allcoated with less sizes while compute will try to fill the expanded + # one - data discrepancy as a result + # And in case of textures it is not a problem if we provide texture of less size because + # 1. It is not important which values would be for extra calc - these calculations are + # required only for better utilizatin of GPU fit to working groups + # 2. When we request pixel out opf bound, texture will handle this correctly. 
As mentioned + # above, the value itself is not important + if out_height % 2 != 0: + out_height += 1 + if out_width % 2 != 0: + out_width += 1 + + if out_height % 4 != 0: + out_height += 2 + if out_width % 4 != 0: + out_width += 2 + return out_height_orig, out_height, out_width_orig, out_width + + +def add_pad( + data, + layout, + out_height, + out_width, + kernel_h, + kernel_w, + dilation_h, + dilation_w, + padding, + stride_h, + stride_w, +): + """Computes required padding values by the parameters of conv2d and adds + compute for extending of original tensor + + Parameters + ---------- + data: tvm.te.Tensor + 5d tensor, the layout of spatial dimensions are defined as separate argument + + layout: string + Layout of origin 4d tensor + + out_height: int + Height of the output feature map + + out_width: int + Width of the output feature map + + kernel_h: int + Height of the conv2d kernel + + kernel_w: int + Width of the conv2d kernel + + dilation_h: int + Height dilation value from conv2d attributes + + dilation_w: int + Width dilation value from conv2d attributes + + padding: list / tuple of n ints + Padding values from conv2d attributes + + stride_h: int + Height stride value from conv2d attributes + + stride_w: int + Width stride value from conv2d attributes + + Returns + ------- + Output : tvm.te.Tensor + n-D, the same layout as Input. + """ + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + # compute graph + if layout == "NCHW": + y_axis = 2 + x_axis = 3 + if len(data.shape) == 4: + _, _, in_height, in_width = data.shape + else: + _, _, in_height, in_width, _ = data.shape + elif layout == "NHWC": + y_axis = 1 + x_axis = 2 + if len(data.shape) == 4: + _, in_height, in_width, _ = data.shape + else: + _, in_height, in_width, _, _ = data.shape + else: + assert False, "not supported layout in adreno util add_pad" + pad_before = [0, 0, 0, 0, 0] + pad_after = [0, 0, 0, 0, 0] + pad_before[y_axis] = pad_top + pad_before[x_axis] = pad_left + pad_after[y_axis] = pad_down + pad_after[x_axis] = pad_right + + # calculation of real used input size: + input_latest_w = (out_width - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 + input_latest_h = (out_height - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 + if input_latest_w < in_width + pad_before[x_axis] + pad_after[x_axis]: + pad_after[x_axis] -= in_width + pad_before[x_axis] + pad_after[x_axis] - input_latest_w + if input_latest_h < in_height + pad_before[y_axis] + pad_after[y_axis]: + pad_after[y_axis] -= in_height + pad_before[y_axis] + pad_after[y_axis] - input_latest_h + return nn.pad(data, pad_before, pad_after, name="pad_temp") + + +def bind_data_copy(stage, axis_to_vectorize=None): + """ + Schedules the eltwise stages like copying of data or postops + + Parameters + ---------- + stage: tvm.te.Tensor + + axis_to_vectorize: + Causes to split certain axis, moves inner part to the end of schedule + and enable vectorization by this axis + If parameter is not pointed, the schedule will be vectorized if the most inner + dim is eq to 4 (size of the vector in texture) + """ + shape = get_const_tuple(stage.op.output(0).shape) + if axis_to_vectorize and len(shape) == 4 and shape[axis_to_vectorize] % 4 == 0: + ax0, ax1, ax2, ax3 = stage.op.axis + if axis_to_vectorize == 1: + oax1, iax1 = stage.split(ax1, factor=4) + stage.reorder(ax0, oax1, ax2, ax3, iax1) + stage.vectorize(iax1) 
+ fused = stage.fuse(ax0, oax1, ax2, ax3) + elif axis_to_vectorize == 3: + oax3, iax3 = stage.split(ax3, factor=4) + stage.reorder(ax0, ax1, ax2, oax3, iax3) + stage.vectorize(iax3) + fused = stage.fuse(ax0, ax1, ax2, oax3) + + ftc = numpy.prod(shape) / 4 + div = get_div(ftc, 128) + block, thread = stage.split(fused, factor=div) + + stage.bind(block, te.thread_axis("blockIdx.z")) + stage.bind(thread, te.thread_axis("threadIdx.z")) + else: + axes = stage.op.axis + fused = stage.fuse(*axes[:-1]) + if shape[-1] <= 32: + ftc = numpy.prod(shape[:-1]) + div = get_div(ftc, 64) + block, thread = stage.split(fused, factor=div) + stage.bind(block, te.thread_axis("blockIdx.x")) + stage.bind(thread, te.thread_axis("threadIdx.x")) + if shape[-1] == 4: + stage.vectorize(axes[-1]) + else: + stage.bind(fused, te.thread_axis("blockIdx.x")) + stage.bind(*axes[-1:], te.thread_axis("threadIdx.x")) + + +def get_texture_storage(shape): + """ + Returns the texture layout acceptable for the shape + + Parameters + ---------- + shape: array + Shape of the tensor to be packed to texture + """ + # certain limitation of the Qualcomm devices. Subject to be determined for certain device + # individually, but until we have access to remote device during compilation, we have to + # define it uniformly for all target devices + # limit = 16384 + limit = tvm.target.Target.current().attrs["texture_spatial_limit"] + + if shape[0] * shape[1] * shape[2] < limit and shape[3] < limit: + return "global.texture" + elif shape[0] * shape[1] < limit and shape[2] * shape[3] < limit: + return "global.texture-nhwc" + else: + return "global.texture-weight" diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index c2905b432764..6877240c08a9 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -342,7 +342,12 @@ class OpenCLThreadEntry { static OpenCLThreadEntry* ThreadLocal(); }; -/*! \brief OpenCL runtime buffer structure with tracked memory layout */ +/*! \brief OpenCL runtime buffer structure with tracked memory layout + TODO(tvm-team): Uncouple use of storage scope and data layout by using the transform_layout + schedule primitive to express the desired texture layout. This will require supporting Nd + indices in BufferLoad and BufferStore in CodegenOpenCL, and ensuring Nd allocations for + texture are correctly routed to the AllocateTexture packed function in the OpenCL DeviceAPI. +*/ struct BufferDescriptor { enum class MemoryLayout { /*! \brief One dimensional buffer in row-major layout*/ @@ -355,6 +360,10 @@ struct BufferDescriptor { * e.g. image2d[height=O, width=IHW] */ kImage2DWeight, + /*! \brief Two dimensional texture w/ height = axis[1] + * e.g. 
image2d[height=NH, width=WC] + */ + kImage2DNHWC, }; BufferDescriptor() = default; explicit BufferDescriptor(Optional scope) : layout(MemoryLayoutFromScope(scope)) {} diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 80b95a6ebfe9..478ec181e899 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -72,6 +72,8 @@ cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope( return cl::BufferDescriptor::MemoryLayout::kImage2DActivation; } else if (mem_scope.value() == "global.texture-weight") { return cl::BufferDescriptor::MemoryLayout::kImage2DWeight; + } else if (mem_scope.value() == "global.texture-nhwc") { + return cl::BufferDescriptor::MemoryLayout::kImage2DNHWC; } LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value(); return cl::BufferDescriptor::MemoryLayout::kBuffer1D; @@ -85,6 +87,8 @@ String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryL return "global.texture"; case cl::BufferDescriptor::MemoryLayout::kImage2DWeight: return "global.texture-weight"; + case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC: + return "global.texture-nhwc"; } LOG(FATAL) << "No scope corresponding to the provided memory layout: " << static_cast(layout); @@ -285,6 +289,7 @@ void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHand break; case cl::BufferDescriptor::MemoryLayout::kImage2DActivation: case cl::BufferDescriptor::MemoryLayout::kImage2DWeight: + case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC: auto image_info = GetImageInfo(from_desc, from); // TODO(csullivan): Support calculating row_pitch correctly in the case of reuse. // Note that when utilizing texture pools for memory reuse, the allocated image @@ -306,6 +311,7 @@ void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHand break; case cl::BufferDescriptor::MemoryLayout::kImage2DActivation: case cl::BufferDescriptor::MemoryLayout::kImage2DWeight: + case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC: auto image_info = GetImageInfo(to_desc, to); OPENCL_CALL(clEnqueueWriteImage( this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin, diff --git a/src/runtime/texture.h b/src/runtime/texture.h index 83725c00b8c2..5f43c8cee8f3 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -57,6 +57,12 @@ inline size_t DefaultTextureLayoutSeparator(size_t shape_rank, separator = shape_rank - 2; } else if (convention == "global.texture-weight") { separator = 1; + } else if (convention == "global.texture-nhwc") { + if (shape_rank == 3) { + separator = 1; + } else { + separator = 2; + } } else { LOG(FATAL) << "Encountered unknown texture lowering convention: " << convention; } diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index 4122f9d0798e..bc9e2faa809f 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -60,6 +60,8 @@ enum class StorageRank { kWMMAMatrixB = 5, /*! \brief wmma scope memory of accumulator */ kWMMAAccumulator = 6, + /*! \brief global scope texture memory */ + kTexture = 7, }; /*! 
@@ -109,6 +111,8 @@ struct StorageScope { return "wmma.matrix_b" + tag; case StorageRank::kWMMAAccumulator: return "wmma.accumulator" + tag; + case StorageRank::kTexture: + return "texture" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; @@ -144,6 +148,9 @@ struct StorageScope { } else if (s.compare(0, 16, "wmma.accumulator") == 0) { r.rank = StorageRank::kWMMAAccumulator; r.tag = s.substr(16, std::string::npos); + } else if (s.compare(0, 7, "texture") == 0) { + r.rank = StorageRank::kTexture; + r.tag = s.substr(7, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 5acb42071b62..2353d2e6baf2 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -22,6 +22,8 @@ */ #include "codegen_source_base.h" +#include + namespace tvm { namespace codegen { @@ -73,6 +75,9 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { ICHECK(!var_idmap_.count(v)) << "Need input to be in SSA form dup " << v->name_hint; std::string key = v->name_hint; std::string vid = GetUniqueName(key); + std::replace(vid.begin(), vid.end(), ':', '_'); + std::replace(vid.begin(), vid.end(), '-', '_'); + std::replace(vid.begin(), vid.end(), '.', '_'); var_idmap_[v] = vid; return vid; } diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 43bcfef105ff..1148013706ab 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -324,6 +324,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(1)) + .add_attr_option("texture_spatial_limit", Integer(16384)) .set_default_keys({"opencl", "gpu"}); // The metal has some limitations on the number of input parameters. This is why attribute diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py new file mode 100644 index 000000000000..d36da51c8f71 --- /dev/null +++ b/tests/python/relay/test_conv2d_nchw_texture.py @@ -0,0 +1,394 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
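The texture_spatial_limit attribute added to the opencl target kind above is the value consulted by get_texture_storage() in python/tvm/topi/adreno/utils.py when choosing between the texture scopes. A hedged usage sketch follows; the shapes are illustrative and not taken from the patch.

import tvm
from tvm.topi.adreno.utils import get_texture_storage

target = tvm.target.Target("opencl --device=adreno")
print(target.attrs["texture_spatial_limit"])  # 16384 unless overridden on the target string

with target:  # get_texture_storage reads the limit from Target.current()
    # blocked NCHW4c activation (N, C//4, H, W, 4) that fits both image2d dimensions
    print(get_texture_storage((1, 8, 56, 56, 4)))     # -> "global.texture"
    # first image dimension exceeds the limit, so a different scope is returned
    print(get_texture_storage((1, 8, 17000, 56, 4)))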
+ +import tvm +import numpy as np +from tvm import relay +from tvm.relay import testing +from utils.adreno_utils import gpu_preprocess, build_run_compare + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 32, 42, 42) + filter_shape = (96, 32, 3, 3) + bias_shape = (1, 96, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=96, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 32, 40, 40) + filter_shape = (96, 32, 2, 2) + bias_shape = (1, 96, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=96, + kernel_size=(2, 2), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_35_35_strides(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 48, 35, 35) + filter_shape = (64, 48, 5, 5) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[2, 2, 2, 2], + strides=[1, 1], + out_dtype=dtype, + channels=64, + kernel_size=(5, 5), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, 
target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_resnet50_v2_nchw_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 3, 224, 224) + filter_shape = (64, 3, 7, 7) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[3, 3, 3, 3], + strides=[2, 2], + out_dtype=dtype, + channels=64, + kernel_size=(7, 7), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_nchw_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 3, 299, 299) + filter_shape = (64, 3, 3, 3) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=64, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_1x1_16c16spatial(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 16, 256, 256) + filter_shape = (32, 16, 4, 4) + bias_shape = (1, 32, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=32, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_4x4_16c16pad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = 
(1, 32, 256, 256) + filter_shape = (32, 32, 4, 4) + bias_shape = (1, 32, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=32, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_4x4x4_16c16pad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 32, 256, 256) + filter_shape = (4, 32, 4, 4) + bias_shape = (1, 4, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_yolov3_v2_nchw_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 1024, 13, 13) + filter_shape = (255, 1024, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[0, 0, 0, 0], + strides=[1, 1], + out_dtype=dtype, + channels=255, + kernel_size=(1, 1), + ) + + mod = relay.Function([A, B], conv) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + initializer("weight", filter_data) + params = { + "weight": tvm.nd.array(filter_data), + } + + build_run_compare(mod, params, {"data": input_shape}, dtype, target) diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/test_conv2d_nhwc_texture.py new file mode 100644 index 000000000000..a02b7cabbef6 --- /dev/null +++ b/tests/python/relay/test_conv2d_nhwc_texture.py @@ -0,0 +1,556 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import numpy as np +from tvm import relay +from tvm.relay import testing +from utils.adreno_utils import gpu_preprocess, build_run_compare + + +@tvm.testing.requires_opencl +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 257, 257, 32) + filter_shape = (1, 1, 32, 16) + bias_shape = (filter_shape[-1],) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype=dtype, + channels=filter_shape[-1], + kernel_size=(1, 1), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 257, 257, 32) + filter_shape = (1, 1, 32, 16) + bias_shape = (filter_shape[-1],) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[3, 3, 3, 3], + strides=[2, 2], + out_dtype=dtype, + channels=filter_shape[-1], + kernel_size=(1, 1), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_4_35_35_32x3_3_144_16(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (4, 35, 35, 32) + filter_shape = (3, 3, 32, 16) + bias_shape = (filter_shape[-1],) + kernel_size = (filter_shape[0], filter_shape[1]) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", 
+ out_dtype=dtype, + channels=filter_shape[-1], + kernel_size=kernel_size, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 513, 513, 3) + filter_shape = (3, 3, 3, 32) + bias_shape = (filter_shape[-1],) + kernel_size = (filter_shape[0], filter_shape[1]) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype=dtype, + channels=filter_shape[-1], + kernel_size=kernel_size, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.ones(filter_shape).astype(dtype) + bias_data = np.ones(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 42, 42, 32) + filter_shape = (3, 3, 32, 96) + bias_shape = (1, 1, 1, 96) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=96, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 40, 40, 32) + filter_shape = (2, 2, 32, 96) + bias_shape = (1, 1, 1, 96) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=96, + kernel_size=(2, 2), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = 
relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_35_35_strides(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 35, 35, 48) + filter_shape = (5, 5, 48, 64) + bias_shape = (1, 1, 1, 64) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[2, 2, 2, 2], + strides=[1, 1], + out_dtype=dtype, + channels=64, + kernel_size=(5, 5), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_conv2d_resnet50_v2_nhwc_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 224, 224, 3) + filter_shape = (7, 7, 3, 64) + bias_shape = (1, 1, 1, 64) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[3, 3, 3, 3], + strides=[2, 2], + out_dtype=dtype, + channels=64, + kernel_size=(7, 7), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_inceptionv3_nhwc_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 299, 299, 3) + filter_shape = (3, 3, 3, 64) + bias_shape = (1, 1, 1, 64) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=64, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = 
np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_1x1_16c16spatial(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 256, 256, 16) + filter_shape = (4, 4, 16, 32) + bias_shape = (1, 1, 1, 32) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[0, 0, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=32, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_4x4_16c16pad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 256, 256, 32) + filter_shape = (4, 4, 32, 32) + bias_shape = (1, 1, 1, 32) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=32, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_4x4x4_16c16pad(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 256, 256, 32) + filter_shape = (4, 4, 32, 4) + bias_shape = (1, 1, 1, 4) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": 
tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_conv2d_yolov3_v2_nhwc_3c(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 13, 13, 1024) + filter_shape = (1, 1, 1024, 255) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWIO", + padding=[0, 0, 0, 0], + strides=[1, 1], + out_dtype=dtype, + channels=255, + kernel_size=(1, 1), + ) + + mod = relay.Function([A, B], conv) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(0) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + initializer("weight", filter_data) + params = { + "weight": tvm.nd.array(filter_data), + } + + build_run_compare(mod, params, {"data": input_shape}, dtype, target) diff --git a/tests/python/relay/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/test_depthwise_conv2d_nchw_texture.py new file mode 100644 index 000000000000..71cf62c5d85c --- /dev/null +++ b/tests/python/relay/test_depthwise_conv2d_nchw_texture.py @@ -0,0 +1,194 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +import tvm +import numpy as np +from tvm import relay +from tvm.relay import testing +from utils.adreno_utils import gpu_preprocess, build_run_compare + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_bias_nchwc(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 64, 112, 112) + filter_shape = (64, 1, 3, 3) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[1, 1, 1, 1], + strides=[2, 2], + out_dtype=dtype, + channels=64, + groups=64, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_nchwc(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 64, 112, 112) + filter_shape = (64, 1, 3, 3) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[1, 1, 1, 1], + strides=[2, 2], + out_dtype=dtype, + channels=64, + groups=64, + kernel_size=(3, 3), + ) + + mod = relay.Function([A, B], conv) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + params1 = { + "weight": tvm.nd.array(filter_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target, gpu_preprocess) + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_bias_nchw(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 64, 112, 112) + filter_shape = (64, 1, 3, 3) + bias_shape = (1, 64, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[1, 1, 1, 1], + strides=[2, 2], + out_dtype=dtype, + channels=64, + groups=64, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, 
target) + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_repack_bias_nchw(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 63, 112, 112) + filter_shape = (63, 1, 3, 3) + bias_shape = (1, 63, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + # C = relay.nn.relu(A) + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW", + kernel_layout="OIHW", + padding=[1, 1, 1, 1], + strides=[2, 2], + out_dtype=dtype, + channels=63, + groups=63, + kernel_size=(3, 3), + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + # mod, params = relay.testing.init.create_workload(func) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) diff --git a/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py new file mode 100644 index 000000000000..16d26c77ca8e --- /dev/null +++ b/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py @@ -0,0 +1,233 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+import os
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from utils.adreno_utils import gpu_preprocess, build_run_compare
+
+
+@tvm.testing.requires_opencl
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 129, 129, 144)
+    filter_shape = (3, 3, 144, 1)
+    kernel_size = (filter_shape[0], filter_shape[1])
+    bias_shape = (filter_shape[2],)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    B = relay.var("weight", shape=filter_shape, dtype=dtype)
+    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+    conv = relay.nn.conv2d(
+        A,
+        B,
+        data_layout="NHWC",
+        kernel_layout="HWOI",
+        out_dtype=dtype,
+        groups=filter_shape[2],
+        channels=filter_shape[2],
+        kernel_size=kernel_size,
+    )
+    D = relay.op.add(conv, bias)
+    D = relay.op.nn.relu(D)
+
+    mod = relay.Function([A, B, bias], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data = np.zeros(filter_shape).astype(dtype)
+    bias_data = np.zeros(bias_shape).astype(dtype)
+    initializer("weight", filter_data)
+    initializer("bias", bias_data)
+    params1 = {
+        "weight": tvm.nd.array(filter_data),
+        "bias": tvm.nd.array(bias_data),
+    }
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+
+
+@tvm.testing.requires_opencl
+def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (4, 35, 35, 576)
+    filter_shape = (3, 3, 576, 1)
+    kernel_size = (filter_shape[0], filter_shape[1])
+    bias_shape = (filter_shape[2],)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    B = relay.var("weight", shape=filter_shape, dtype=dtype)
+    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+    conv = relay.nn.conv2d(
+        A,
+        B,
+        data_layout="NHWC",
+        kernel_layout="HWOI",
+        out_dtype=dtype,
+        groups=filter_shape[2],
+        channels=filter_shape[2],
+        kernel_size=kernel_size,
+    )
+    D = relay.op.add(conv, bias)
+    D = relay.op.nn.relu(D)
+
+    mod = relay.Function([A, B, bias], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data = np.zeros(filter_shape).astype(dtype)
+    bias_data = np.zeros(bias_shape).astype(dtype)
+    initializer("weight", filter_data)
+    initializer("bias", bias_data)
+    params1 = {
+        "weight": tvm.nd.array(filter_data),
+        "bias": tvm.nd.array(bias_data),
+    }
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
+
+
+@tvm.testing.requires_opencl
+def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 129, 129, 144)
+    filter_shape = (3, 3, 144, 1)
+    kernel_size = (filter_shape[0], filter_shape[1])
+    bias_shape = (filter_shape[2],)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    B = relay.var("weight", shape=filter_shape, dtype=dtype)
+    bias = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+    conv = relay.nn.conv2d(
+        A,
+        B,
+        data_layout="NHWC",
+        kernel_layout="HWOI",
+        padding=[3, 3, 3, 3],
+        strides=[2, 2],
+        out_dtype=dtype,
+        groups=filter_shape[2],
+        channels=filter_shape[2],
+        kernel_size=kernel_size,
+    )
+    D = relay.op.add(conv, bias)
+    D = relay.op.nn.relu(D)
+
+    mod = relay.Function([A, B, bias], D)
+    np.random.seed(1)
+    initializer = relay.testing.init.Xavier()
+    filter_data = 
np.zeros(filter_shape).astype(dtype) + bias_data = np.zeros(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_1_513_513_7x3_3_7_1(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 513, 513, 7) + filter_shape = (3, 3, 7, 1) + bias_shape = (filter_shape[2],) + kernel_size = (filter_shape[0], filter_shape[1]) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWOI", + out_dtype=dtype, + channels=filter_shape[2], + groups=filter_shape[2], + kernel_size=kernel_size, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.ones(filter_shape).astype(dtype) + bias_data = np.ones(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) + + +@tvm.testing.requires_opencl +def test_depthwise_conv2d_1_513_513_3x3_3_3_1(): + target = "opencl --device=adreno" + dtype = "float16" + + input_shape = (1, 513, 513, 3) + filter_shape = (3, 3, 3, 1) + bias_shape = (filter_shape[2],) + kernel_size = (filter_shape[0], filter_shape[1]) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NHWC", + kernel_layout="HWOI", + out_dtype=dtype, + channels=filter_shape[2], + groups=filter_shape[2], + kernel_size=kernel_size, + ) + D = relay.op.add(conv, bias) + D = relay.op.nn.relu(D) + + mod = relay.Function([A, B, bias], D) + np.random.seed(1) + initializer = relay.testing.init.Xavier() + filter_data = np.ones(filter_shape).astype(dtype) + bias_data = np.ones(bias_shape).astype(dtype) + initializer("weight", filter_data) + initializer("bias", bias_data) + params1 = { + "weight": tvm.nd.array(filter_data), + "bias": tvm.nd.array(bias_data), + } + + build_run_compare(mod, params1, {"data": input_shape}, dtype, target) diff --git a/tests/python/relay/utils/adreno_utils.py b/tests/python/relay/utils/adreno_utils.py new file mode 100644 index 000000000000..11abce3bfaa0 --- /dev/null +++ b/tests/python/relay/utils/adreno_utils.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Utils for adreno compute/schedules"""
+
+import os
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.relay.transform import recast
+from tvm.contrib import graph_runtime
+
+
+def get_cpu_reference(mod, params1, input_shape, inputs):
+    """Build the module recast to float32 for LLVM and return its outputs as the reference."""
+    mod_fp32 = recast(mod, "float32", "float32", ops=["nn.conv2d", "add", "nn.relu"])
+    with relay.build_config(opt_level=3):
+        graph, lib, params = relay.build(mod_fp32, "llvm", params=params1)
+    ctx = tvm.cpu()
+    m = graph_runtime.create(graph, lib, ctx)
+    if isinstance(input_shape, dict):
+        for i, key in enumerate(input_shape):
+            m.set_input(key, inputs[i])
+    else:
+        m.set_input("data", inputs[-1])
+    m.set_input(**params)
+    m.run()
+    return [
+        m.get_output(0).asnumpy(),
+    ]
+
+
+def build_run_compare(
+    tvm_mod, params1, input_shape, dtype="float32", target="llvm", gpu_preprocess=None
+):
+    """Build for OpenCL, run on the host or a remote device, and compare with the CPU reference."""
+
+    if "TVM_TRACKER_HOST" in os.environ and "TVM_TRACKER_PORT" in os.environ:
+        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
+        rpc_tracker_port = os.environ["TVM_TRACKER_PORT"]
+        run_on_host = 0
+        target_host = "llvm -mtriple=arm64-linux-android"
+        rpc_tracker_port = int(rpc_tracker_port)
+    else:
+        run_on_host = 1
+        target_host = "llvm"
+
+    if gpu_preprocess:
+        tvm_mod_nchwc = gpu_preprocess(tvm_mod)
+    else:
+        tvm_mod_nchwc = tvm_mod
+
+    with relay.build_config(opt_level=3):
+        graph, lib, params = relay.build(
+            tvm_mod_nchwc, target_host=target_host, target=target, params=params1
+        )
+    if run_on_host:
+        ctx = tvm.opencl()
+        m = graph_runtime.create(graph, lib, ctx)
+    else:
+        from tvm import rpc
+        from tvm.contrib import utils, ndk
+
+        rpc_key = "android"
+        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
+        remote = tracker.request(rpc_key, priority=0, session_timeout=600)
+        temp = utils.tempdir()
+        dso_binary = "dev_lib_cl.so"
+        dso_binary_path = temp.relpath(dso_binary)
+        ctx = remote.cl(0)
+        lib.export_library(dso_binary_path, ndk.create_shared)
+        remote.upload(dso_binary_path)
+        rlib = remote.load_module(dso_binary)
+        m = graph_runtime.create(graph, rlib, ctx)
+    m.set_input(**params)
+    inputs = []
+    if isinstance(input_shape, dict):
+        for key in input_shape:
+            inputs.append(np.random.normal(size=input_shape[key]).astype(dtype))
+            m.set_input(key, inputs[-1])
+    else:
+        inputs.append(np.random.normal(size=input_shape).astype(dtype))
+        m.set_input("data", inputs[-1])
+    m.run()
+
+    ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs)
+    for i, ref_output in enumerate(ref_outputs):
+        tvm_output = m.get_output(i)
+        output = tvm_output.asnumpy()
+        # for index, x in np.ndenumerate(ref_output):
+        #     if abs(output[index] - x) > 0.01:
+        #         print(index, output[index], x)
+
+        np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1)
+
+
+def gpu_preprocess(tvm_mod):
+    """Convert conv2d layouts to the blocked NCHW4c/OIHW4o form used by the texture schedules."""
+    layout_config = relay.transform.LayoutConfig()
+    desired_layouts = {"nn.conv2d": ["NCHW4c", "OIHW4o"]}
+    with layout_config:
+        seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
+        with tvm.transform.PassContext(opt_level=3):
+            mod = tvm.IRModule.from_expr(tvm_mod)
+            tvm_mod_nchwc = seq(mod)
+    return tvm_mod_nchwc
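
For reference, a minimal sketch of how the helpers above compose (not part of the patch; the function name and shapes below are illustrative): build a small NHWC conv2d module, initialize its weights the same way the tests do, and hand it to build_run_compare, optionally passing gpu_preprocess to exercise the NCHW4c/OIHW4o texture layouts.

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.relay import testing
    from utils.adreno_utils import build_run_compare, gpu_preprocess


    def check_conv2d_nhwc_texture():
        # Illustrative shapes; any NHWC/HWIO workload from the tests above
        # is exercised the same way.
        target = "opencl --device=adreno"
        dtype = "float16"
        input_shape = (1, 56, 56, 32)
        filter_shape = (3, 3, 32, 32)
        A = relay.var("data", shape=input_shape, dtype=dtype)
        B = relay.var("weight", shape=filter_shape, dtype=dtype)
        conv = relay.nn.conv2d(
            A,
            B,
            data_layout="NHWC",
            kernel_layout="HWIO",
            padding=[1, 1, 1, 1],
            strides=[1, 1],
            out_dtype=dtype,
            channels=32,
            kernel_size=(3, 3),
        )
        mod = relay.Function([A, B], conv)

        np.random.seed(0)
        filter_data = np.zeros(filter_shape).astype(dtype)
        relay.testing.init.Xavier()("weight", filter_data)
        params = {"weight": tvm.nd.array(filter_data)}

        # Runs on the local OpenCL device, or on a remote Adreno device when
        # TVM_TRACKER_HOST/TVM_TRACKER_PORT point at an RPC tracker that has a
        # device registered under the "android" key used by build_run_compare.
        build_run_compare(mod, params, {"data": input_shape}, dtype, target, gpu_preprocess)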