
Commit

[AutoScheduler] Enable winograd for conv2d and layout rewrite for conv3d (apache#7168)

* [AutoScheduler] Enable winograd for conv2d & Enable layout rewrite for conv3d

* fix test

* fix test

* update tutorials
merrymercy authored Dec 26, 2020
1 parent 592ecc0 commit c1bca8e
Showing 14 changed files with 379 additions and 82 deletions.
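
As context for the diffs below, here is a minimal sketch (modeled on the updated tune_network_x86.py / tune_network_cuda.py tutorials, not part of this diff) of the build path that opts into the auto-scheduler, which is what allows the new winograd and layout-rewrite implementations to be selected. `mod` and `params` are assumed to come from a Relay frontend importer, and the log file is a placeholder:

import tvm
from tvm import relay, auto_scheduler

# Placeholders: `mod`/`params` from a Relay frontend importer, and a log file
# of auto-scheduler tuning records produced beforehand.
log_file = "tuning.json"
target = "llvm -mcpu=skylake-avx512"

with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        lib = relay.build(mod, target=target, params=params)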
7 changes: 6 additions & 1 deletion docs/conf.py
@@ -250,7 +250,12 @@
"tune_relay_arm.py",
"tune_relay_mobile_gpu.py",
],
"auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
"auto_scheduler": [
"tune_matmul_x86.py",
"tune_conv2d_layer_cuda.py",
"tune_network_x86.py",
"tune_network_cuda.py",
],
"dev": ["low_level_custom_pass.py", "use_pass_infra.py", "bring_your_own_datatypes.py"],
}

14 changes: 8 additions & 6 deletions include/tvm/relay/attrs/nn.h
@@ -210,9 +210,10 @@ struct Conv2DWinogradAttrs : public tvm::AttrsNode<Conv2DWinogradAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
std::string data_layout;
std::string kernel_layout;
std::string out_layout;
tvm::String data_layout;
tvm::String kernel_layout;
tvm::String out_layout;
tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite
DataType out_dtype;

TVM_DECLARE_ATTRS(Conv2DWinogradAttrs, "relay.attrs.Conv2DWinogradAttrs") {
@@ -300,9 +301,10 @@ struct Conv3DAttrs : public tvm::AttrsNode<Conv3DAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
std::string data_layout;
std::string kernel_layout;
std::string out_layout;
tvm::String data_layout;
tvm::String kernel_layout;
tvm::String out_layout;
tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite
DataType out_dtype;

TVM_DECLARE_ATTRS(Conv3DAttrs, "relay.attrs.Conv3DAttrs") {
18 changes: 12 additions & 6 deletions python/tvm/relay/op/strategy/generic.py
@@ -28,7 +28,10 @@


def naive_schedule(_, outs, target):
"""Return the naive default schedule"""
"""Return the naive default schedule.
This function acts as a placeholder for op implementations that use the auto-scheduler.
Implementations using this function should only be used together with the auto-scheduler.
"""
if "gpu" in target.keys:
# For GPU, we at least need thread binding to make a valid schedule.
# So the naive schedule cannot be compiled.
@@ -502,7 +505,7 @@ def conv3d_transpose_strategy(attrs, inputs, out_type, target):


# conv3d
def wrap_compute_conv3d(topi_compute, need_layout=False):
def wrap_compute_conv3d(topi_compute, need_layout=False, need_auto_scheduler_layout=False):
"""wrap conv3d topi compute"""

def _compute_conv3d(attrs, inputs, out_type):
@@ -519,11 +522,14 @@ def _compute_conv3d(attrs, inputs, out_type):
raise ValueError("Dilation should be positive value")
if groups != 1:
raise ValueError("Not support arbitrary group number for conv3d")

args = [inputs[0], inputs[1], strides, padding, dilation]
if need_layout:
out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)
else:
out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, out_dtype)
return [out]
args.append(layout)
args.append(out_dtype)
if need_auto_scheduler_layout:
args.append(get_auto_scheduler_rewritten_layout(attrs))
return [topi_compute(*args)]

return _compute_conv3d

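
The naive_schedule placeholder registered above is deliberately trivial. The following sketch (an assumption about its shape, not the TVM source) shows what such a default schedule amounts to, and why it cannot stand on its own for GPU targets:

from tvm import te

def naive_schedule_sketch(outs):
    # Just the default schedule over the output ops: no tiling, no
    # vectorization, no GPU thread binding, so it is only meaningful when
    # the auto-scheduler later replaces it with a tuned schedule.
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    return te.create_schedule([x.op for x in outs])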
102 changes: 89 additions & 13 deletions python/tvm/relay/op/strategy/x86.py
@@ -91,6 +91,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
"""conv2d x86 strategy"""
strategy = _op.OpStrategy()
data, kernel = inputs
stride_h, stride_w = get_const_tuple(attrs.strides)
dilation_h, dilation_w = get_const_tuple(attrs.dilation)
groups = attrs.groups
layout = attrs.data_layout
@@ -125,6 +126,35 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
name="conv2d_nhwc.x86",
)

judge_winograd_auto_scheduler = False
if len(kernel.shape) == 4:
kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
judge_winograd_auto_scheduler = (
"float" in data.dtype
and "float" in kernel.dtype
and kernel_h == 3
and kernel_w == 3
and stride_h == 1
and stride_w == 1
and dilation_h == 1
and dilation_w == 1
and 64 < co < 512
# The last condition of co is based on our profiling of resnet workloads
# on skylake avx512 machines. We found winograd is faster than direct
# only when co is within this range
)

# register auto-scheduler implementations
if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
strategy.add_implementation(
wrap_compute_conv2d(
topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True
),
naive_schedule, # this implementation should never be picked by autotvm
name="conv2d_nhwc.winograd",
plevel=15,
)
elif layout == "HWCN":
assert kernel_layout == "HWIO"
if not is_auto_scheduler_enabled():
@@ -269,20 +299,39 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
"""conv3d generic strategy"""
strategy = _op.OpStrategy()
layout = attrs.data_layout
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.x86",
)
if is_auto_scheduler_enabled():
# Use auto-scheduler. We should provide clear compute definition without autotvm templates
# or packed layouts.
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.nn.conv3d_ncdhw, need_auto_scheduler_layout=True),
naive_schedule,
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.nn.conv3d_ndhwc, need_auto_scheduler_layout=True),
naive_schedule,
name="conv3d_ndhwc.x86",
)
else:
raise ValueError("Not support this layout {} yet".format(layout))
else:
raise ValueError("Not support this layout {} yet".format(layout))
# Use autotvm templates
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.x86",
)
else:
raise ValueError("Not support this layout {} yet".format(layout))
return strategy


@@ -476,3 +525,30 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type, target):
plevel=10,
)
return strategy


@conv2d_winograd_without_weight_transfrom_strategy.register("cpu")
def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_type, target):
"""conv2d_winograd_without_weight_transfrom cpu strategy"""
dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups")
layout = attrs.data_layout
strides = attrs.get_int_tuple("strides")
assert dilation == (1, 1), "Do not support dilate now"
assert strides == (1, 1), "Do not support strides now"
assert groups == 1, "Do not support arbitrary group number"
strategy = _op.OpStrategy()
if layout == "NHWC":
strategy.add_implementation(
wrap_compute_conv2d(
topi.nn.conv2d_winograd_nhwc_without_weight_transform,
need_auto_scheduler_layout=True,
),
naive_schedule,
name="ansor.winograd",
)
else:
raise RuntimeError(
"Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
)
return strategy
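
For reference, here is a toy workload (illustrative only, not part of this diff) that falls inside the profiling-based heuristic above, with a 3x3 kernel, unit stride and dilation, float32 tensors, and 64 < output channels (128) < 512, and so would be eligible for the new conv2d_nhwc.winograd implementation when the auto-scheduler is enabled:

import tvm
from tvm import relay

# NHWC conv2d that satisfies the winograd heuristic in conv2d_strategy_cpu.
data = relay.var("data", shape=(1, 56, 56, 64), dtype="float32")
weight = relay.var("weight", shape=(3, 3, 64, 128), dtype="float32")
conv = relay.nn.conv2d(
    data, weight, kernel_size=(3, 3), channels=128, padding=(1, 1),
    data_layout="NHWC", kernel_layout="HWIO",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))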
9 changes: 8 additions & 1 deletion python/tvm/topi/cuda/conv2d_winograd.py
@@ -363,7 +363,14 @@ def _callback(op):

@conv2d_winograd_nhwc.register(["cuda", "gpu"])
def conv2d_winograd_nhwc_cuda(
data, weight, strides, padding, dilation, out_dtype, pre_computed=False
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
56 changes: 50 additions & 6 deletions python/tvm/topi/nn/conv2d.py
@@ -982,6 +982,7 @@ def _conv2d_winograd_nhwc_impl(
out_dtype,
tile_size,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd implementation in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1002,8 +1003,10 @@ def _conv2d_winograd_nhwc_impl(
Specifies the output data type.
tile_size : int
The size of the tile to use for the Winograd filter
pre_computed: bool
pre_computed: bool = False
Whether the kernel is precomputed
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
Expand All @@ -1020,7 +1023,16 @@ def _conv2d_winograd_nhwc_impl(
if not pre_computed:
KH, KW, CI, CO = get_const_tuple(weight.shape)
else:
H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
if auto_scheduler_rewritten_layout:
H_CAT, W_CAT, CO, CI = get_const_tuple(
auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"]
)
)
auto_scheduler.remove_index_check(weight)
else:
H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)

KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1

pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
@@ -1052,8 +1064,10 @@
),
name="kernel_pack",
)
attrs = {}
else:
kernel_pack = weight
attrs = {"layout_free_placeholders": [kernel_pack]}

# pack data tile
input_tile = te.compute(
@@ -1085,9 +1099,12 @@
data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci]
),
name="bgemm",
attrs={"layout_free_placeholders": [kernel_pack]},
attrs=attrs,
)

if auto_scheduler_rewritten_layout:
bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout)

# inverse transform
r_a = te.reduce_axis((0, alpha), "r_a")
r_b = te.reduce_axis((0, alpha), "r_b")
@@ -1112,7 +1129,16 @@


@tvm.target.generic_func
def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pre_computed=False):
def conv2d_winograd_nhwc(
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1132,6 +1158,8 @@ def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pr
Specifies the output data type.
pre_computed: bool
Whether the kernel is precomputed
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1149,11 +1177,18 @@ def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pr
out_dtype,
tile_size,
pre_computed,
auto_scheduler_rewritten_layout,
)


def conv2d_winograd_nhwc_without_weight_transform(
data, weight, strides, padding, dilation, out_dtype
data,
weight,
strides,
padding,
dilation,
out_dtype,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd without layout transform in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1172,6 +1207,8 @@ def conv2d_winograd_nhwc_without_weight_transform(
dilation size, or [dilation_height, dilation_width]
out_dtype : str, optional
Specifies the output data type.
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1180,5 +1217,12 @@
"""

return conv2d_winograd_nhwc(
data, weight, strides, padding, dilation, out_dtype, pre_computed=True
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=True,
auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
)
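
The auto_scheduler_rewritten_layout paths above only take effect once the network has actually been tuned. A brief sketch, in the spirit of the updated tune_network tutorials (the trial count and file name are placeholders), of producing the records that the build step then consumes:

from tvm import auto_scheduler

# Same placeholders as the build sketch near the top: `mod`, `params`, `target`.
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tuner.tune(
    auto_scheduler.TuningOptions(
        num_measure_trials=200 * len(tasks),
        measure_callbacks=[auto_scheduler.RecordToFile("tuning.json")],
    )
)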