[Relay] Legalize and AlterOpLayout for Int8 Intel. #3961

Merged
merged 1 commit into from Sep 19, 2019
85 changes: 62 additions & 23 deletions tests/python/relay/test_op_level2.py
@@ -541,18 +541,35 @@ def test_upsampling():


def test_conv2d_int8_intrinsics():
def _compile(input_dtype, weight_dtype, output_dtype, target):
n, ic, h, w, oc, ch, cw = 1, 16, 224, 224, 32, 3, 3
x = relay.var("x", relay.TensorType((n, ic, h, w), input_dtype))
w = relay.var("w", relay.TensorType((oc, ic, ch, cw), weight_dtype))
def _compile(ic, oc, target, data_layout, kernel_layout, dtypes):
input_dtype, weight_dtype, output_dtype = dtypes

n, h, w, ch, cw = 1, 64, 64, 3, 3
if data_layout == 'NCHW':
x = relay.var("x", relay.TensorType((n, ic, h, w), input_dtype))
elif data_layout == 'NHWC':
x = relay.var("x", relay.TensorType((n, h, w, ic), input_dtype))
else:
raise ValueError('Not supported')

if kernel_layout == 'OIHW':
kernel_shape = (oc, ic, ch, cw)
elif kernel_layout == 'HWIO':
kernel_shape = (ch, cw, ic, oc)
else:
raise ValueError('Not supported')

w = relay.var("w", relay.TensorType(kernel_shape, weight_dtype))
y = relay.nn.conv2d(x, w,
kernel_size=(ch, cw),
channels=oc,
padding=(1, 1),
dilation=(1, 1),
data_layout=data_layout,
kernel_layout=kernel_layout,
out_dtype=output_dtype)
func = relay.Function([x, w], y)
wdata = np.random.rand(oc, ic, ch, cw) * 10
wdata = np.random.rand(*kernel_shape) * 10
parameters = {"w": tvm.nd.array(wdata.astype(weight_dtype))}
with relay.build_config(opt_level=3):
graph, lib, params = relay.build(func, target, params=parameters)
@@ -564,37 +581,59 @@ def _compile(input_dtype, weight_dtype, output_dtype, target):
name = "llvm.x86.avx512.pmaddubs.w.512"
llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(name)
if llvm_id != 0:
# Intel int8 instructions need uint8 data and an int8 kernel
asm = _compile(input_dtype="uint8",
weight_dtype="int8",
output_dtype="int32",
target=target)
# Check that the intrinsic is present in the assembly.
fast_int8_dtypes = ('uint8', 'int8', 'int32')
# Sweep the input channels to check int8 robustness
for ic in range(1, 24):
asm = _compile(ic=ic, oc=32, target=target, data_layout="NCHW", kernel_layout='OIHW',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm

for ic in range(1, 24):
asm = _compile(ic=ic, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm


# Sweep the output channels to check int8 robustness
for oc in range(2, 24):
asm = _compile(ic=16, oc=oc, target=target, data_layout="NCHW", kernel_layout='OIHW',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm

for oc in range(2, 24):
asm = _compile(ic=16, oc=oc, target=target, data_layout="NHWC", kernel_layout='HWIO',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm

# Check that both non-divisible oc and ic work
asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm

asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
dtypes=fast_int8_dtypes)
assert "pmaddubs" in asm

# Ensure that code is generated when datatypes are not HW supported.
asm = _compile(input_dtype="int8",
weight_dtype="int8",
output_dtype="int32",
target=target)
dtypes = ('int8', 'int8', 'int32')
asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
dtypes=dtypes)
# Check that the intrinsic is not present in the assembly.
assert "pmaddubs" not in asm

# Ensure that code is generated when datatypes are not HW supported.
asm = _compile(input_dtype="uint8",
weight_dtype="uint8",
output_dtype="int32",
target=target)
dtypes = ('uint8', 'uint8', 'int32')
asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
dtypes=dtypes)
# Check that the intrinsic is not present in the assembly.
assert "pmaddubs" not in asm

# Check that a vectorized instruction is generated for older Intel
# generations, because we default to NCHWc layout.
target = "llvm -mcpu=core-avx2"
asm = _compile(input_dtype="int8",
weight_dtype="int8",
output_dtype="int32",
target=target)
fast_int8_dtypes = ('uint8', 'int8', 'int32')
asm = _compile(ic=16, oc=32, target=target, data_layout="NCHW", kernel_layout='OIHW',
dtypes=fast_int8_dtypes)
# Check that vector int mult and add instructions are generated.
assert "vpmulld" in asm and "vpadd" in asm

2 changes: 1 addition & 1 deletion topi/python/topi/nn/conv2d.py
@@ -151,7 +151,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout='NCHW'):
if data_layout == 'NCHW':
CO, CIG, KH, KW = [x.value for x in kernel.shape]
else:
KH, KW, CO, CIG = [x.value for x in kernel.shape]
KH, KW, CIG, CO = [x.value for x in kernel.shape]

HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
GRPS = CI // CIG
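
For reference, the one-line change above follows from the HWIO kernel layout, whose axes are (kernel height, kernel width, input channels per group, output channels); a minimal sketch with illustrative shapes (not taken from the PR):

import numpy as np

# HWIO kernel: 3x3 window, 16 input channels per group, 32 output channels
kernel = np.zeros((3, 3, 16, 32), dtype="int8")
KH, KW, CIG, CO = kernel.shape   # corrected unpacking order for HWIO
assert (KH, KW, CIG, CO) == (3, 3, 16, 32)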
1 change: 1 addition & 0 deletions topi/python/topi/x86/__init__.py
@@ -17,3 +17,4 @@
from .roi_align import roi_align_nchw
from .conv2d_transpose import _schedule_conv2d_transpose_nchw
from .sparse import *
from .conv2d_alter_op import *
157 changes: 3 additions & 154 deletions topi/python/topi/x86/conv2d.py
@@ -26,40 +26,16 @@
from tvm.autotvm.task import get_config
from .. import generic, tag
from .. import nn
from ..util import get_const_tuple, get_shape
from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_NCHWc_int8, \
conv2d_alter_layout, conv2d_infer_layout, _get_workload as _get_conv2d_workload
from ..nn.conv2d import conv2d, conv2d_NCHWc, \
conv2d_infer_layout, _get_workload as _get_conv2d_workload
from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
from ..nn.pad import pad
from ..util import get_const_tuple

from . import conv2d_avx_1x1, conv2d_avx_common

logger = logging.getLogger('topi')

def _is_int8_hw_support(data_dtype, kernel_dtype, target):
"""
Checks to ensure that we can use Intel DLBoost instructions
1) The datatypes are correct.
2) LLVM version has support for the instructions.
3) Target is skylake and above.
"""
# 1) Check datatypes
is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8'

# 2) Check LLVM support
llvm_intrin_fast_int8 = "llvm.x86.avx512.pmaddubs.w.512"
llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(llvm_intrin_fast_int8)
is_llvm_support = llvm_id != 0

# 3) Check target
is_target_support = False
for opt in target.options:
if opt == '-mcpu=skylake-avx512':
is_target_support = True

return is_dtype_support and is_llvm_support and is_target_support

def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False,
layout='NCHW'):
"""
@@ -353,133 +329,6 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs):
return s, [new_data, new_kernel, C]


@conv2d_alter_layout.register("cpu")
def _alter_conv2d_layout(attrs, inputs, tinfo, F):

copy_inputs = [s for s in inputs]
new_attrs = {k : attrs[k] for k in attrs.keys()}

if F.__name__ == 'tvm.relay.op':
# Derive channels for frontends (e.g. ONNX) that miss the "channels" field.
new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')]

data, kernel = tinfo[0], tinfo[1]
batch_size, in_channel, height, width = get_const_tuple(data.shape)

groups = attrs.get_int("groups")
out_channel = attrs.get_int("channels") \
if F.__name__ == 'nnvm.symbol' else new_attrs["channels"]
padding = attrs.get_int_tuple("padding")
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
out_dtype = attrs["out_dtype"]

layout_name = 'layout' if F.__name__ == 'nnvm.symbol' else 'data_layout'

layout = attrs[layout_name]
kh, kw = attrs.get_int_tuple("kernel_size")

dtype = data.dtype
out_dtype = dtype if out_dtype in ("same", "") else out_dtype

kshape = get_shape(kernel.shape, attrs["kernel_layout"], "OIHW")
is_depthwise = groups == kshape[0] and kshape[1] == 1

# only optimize for NCHW
if layout != 'NCHW' or attrs["kernel_layout"] != "OIHW":
return None

if groups != 1 and not is_depthwise:
return None

dispatch_ctx = autotvm.task.DispatchContext.current
target = tvm.target.current_target()
# query schedule and fallback if necessary
workload = autotvm.task.args_to_workload(
[data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
if is_depthwise else \
autotvm.task.args_to_workload(
[data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
cfg = dispatch_ctx.query(target, workload)
if cfg.is_fallback:
_get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)

ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

new_attrs[layout_name] = 'NCHW%dc' % ic_bn
new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
dtype=data.dtype)

if is_depthwise:
new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn
# Store altered operator's config
new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel.dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name],
new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
dispatch_ctx.update(target, new_workload, cfg)
if F.__name__ == 'nnvm.symbol':
logging.warning("Use native layout for depthwise convolution on NNVM.")
return None
return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs)

if _is_int8_hw_support(data.dtype, kernel.dtype, target):
# Convert kernel data layout from 4D to 7D
n_elems = 4
out_channel, _, kh, kw = get_const_tuple(kernel.shape)
data_expr, kernel_expr = inputs
kernel_IHWO = F.transpose(kernel_expr, axes=(1, 2, 3, 0))
kernel_IHWOo = F.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
kernel_OHWoI = F.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0))
kernel_OHWoIi = F.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn,
in_channel//ic_bn, ic_bn))
kernel_OHWoIie = F.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn,
in_channel//ic_bn, ic_bn//n_elems, n_elems))
kernel_OIHWioe = F.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6))
copy_inputs = [data_expr, kernel_OIHWioe]

# Store altered operator's config. New kernel layout OIHWio4
new_kernel = tvm.placeholder((out_channel // oc_bn,
in_channel // ic_bn,
kh,
kw,
ic_bn // n_elems,
oc_bn,
n_elems), dtype=kernel.dtype)

new_workload = autotvm.task.args_to_workload([new_data,
new_kernel,
strides,
padding,
dilation,
new_attrs[layout_name],
new_attrs['out_layout'],
out_dtype],
conv2d_NCHWc_int8)
dispatch_ctx.update(target, new_workload, cfg)
if F.__name__ == 'nnvm.symbol':
logging.warning("Use native layout for int8 convolution on NNVM.")
return None
return F.nn.contrib_conv2d_nchwc_int8(*copy_inputs, **new_attrs)

out_channel, _, kh, kw = get_const_tuple(kernel.shape)
# (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
# Store altered operator's config
new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn,
kh, kw, ic_bn, oc_bn), dtype=kernel.dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name],
new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
dispatch_ctx.update(target, new_workload, cfg)

if F.__name__ == 'nnvm.symbol':
return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
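
The chain of transpose/reshape calls in the int8 branch above packs a flat OIHW kernel into the blocked OIHWio4-style layout expected by the NCHWc int8 schedule. A numpy sketch of the equivalent shape manipulation, with oc_bn = ic_bn = 16 and n_elems = 4 chosen purely for illustration (the real blocking factors come from the AutoTVM config):

import numpy as np

oc, ic, kh, kw = 64, 32, 3, 3          # illustrative kernel dimensions
oc_bn, ic_bn, n_elems = 16, 16, 4      # assumed blocking factors

k = np.zeros((oc, ic, kh, kw), dtype="int8")                     # OIHW
k = k.transpose(1, 2, 3, 0)                                      # IHWO
k = k.reshape(ic, kh, kw, oc // oc_bn, oc_bn)                    # IHWOo
k = k.transpose(3, 1, 2, 4, 0)                                   # OHWoI
k = k.reshape(oc // oc_bn, kh, kw, oc_bn, ic // ic_bn, ic_bn)    # OHWoIi
k = k.reshape(oc // oc_bn, kh, kw, oc_bn,
              ic // ic_bn, ic_bn // n_elems, n_elems)            # OHWoIie
k = k.transpose(0, 4, 1, 2, 5, 3, 6)                             # OIHWio4
assert k.shape == (oc // oc_bn, ic // ic_bn, kh, kw,
                   ic_bn // n_elems, oc_bn, n_elems)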


@conv2d_infer_layout.register("cpu")
def _conv2d_infer_layout(workload, cfg):
_, data, kernel, strides, padding, dilation, layout, dtype = workload