From da039794cfc7c7d94855b0dc6f93248f13222c7f Mon Sep 17 00:00:00 2001
From: Cody Hao Yu <comaniac0422@gmail.com>
Date: Sat, 14 Sep 2019 17:37:17 -0700
Subject: [PATCH] [AutoTVM] Enhance tuning space of split (#3949)

* Refine policies for define_split

- Rename policy "all" to "factors"
- Add policy "verbose" and "power2"

* Refine search space

* add doc
---
 python/tvm/autotvm/task/space.py             | 107 ++++++++++++-------
 topi/python/topi/arm_cpu/bitserial_conv2d.py |  10 +-
 topi/python/topi/arm_cpu/bitserial_dense.py  |   6 +-
 topi/python/topi/nn/bitserial_conv2d.py      |  12 +--
 topi/python/topi/nn/bitserial_dense.py       |   6 +-
 topi/python/topi/x86/bitserial_conv2d.py     |   8 +-
 topi/python/topi/x86/conv2d.py               |   2 +-
 7 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index 8b2dc65bd42b..1247d615d596 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -165,6 +165,20 @@ def get_factors(n):
     ret.sort()
     return ret
 
+def get_pow2s(n):
+    """return all power-of-two numbers that are less or equal than the integer
+
+    Parameters
+    ----------
+    n: int
+        integer for reference
+
+    Returns
+    -------
+    factors: list
+        List of all power-of-two numbers
+    """
+    return [2**x for x in range(math.floor(math.log2(n)) + 1)]
 
 class SplitSpace(TransformSpace):
     """Split an axis for several times"""
@@ -175,43 +189,49 @@ def __init__(self, axes, policy, **kwargs):
         self.policy = policy
         self.entities = []
 
-        if policy == 'all':
-            num_outputs = kwargs["num_outputs"]
-            max_factor = kwargs.get("max_factor", 1 << 31)
-            fil = kwargs.get("filter", lambda x: True)
-
-            length = axis.length
-            factors = get_factors(length)
-            factors = [x for x in factors if x <= max_factor]
-            # copy factors for every level
-            self.product = length
-            self.num_outputs = num_outputs
-            self.factors = [factors] * (num_outputs-1)
-            self._generate_space(0, [None] * (num_outputs - 1))
-            self.entities = list(filter(fil, self.entities))
-            self.num_output = num_outputs
-        elif policy == 'candidate':
-            self.product = axis.length
-            self.num_outputs = kwargs["num_outputs"]
+        max_factor = kwargs.get("max_factor", 1 << 31)
+        fil = kwargs.get("filter", lambda x: True)
+        self.product = axis.length
+        self.num_output = kwargs.get("num_outputs", 0)
+        assert self.num_output > 0
+
+        if policy == 'candidate':
             for size in kwargs["candidate"]:
-                assert len(size) == self.num_outputs
-                # assert np.prod(size) == self.product
+                assert len(size) == self.num_output
                 self.entities.append(SplitEntity(size))
-            self.num_output = self.num_outputs
         else:
-            raise RuntimeError("Invalid policy: " + policy)
+            if policy == 'verbose':
+                # Include factors and power-of-twos. May generate tails.
+                divisibles = get_factors(self.product)
+                pow2s = get_pow2s(self.product)
+                factors = [x for x in list(set(divisibles) | set(pow2s)) if x <= max_factor]
+            elif policy == 'factors':
+                # Include divisible factors. Guarantee no tails.
+                factors = [x for x in get_factors(self.product) if x <= max_factor]
+            elif policy == 'power2':
+                # Include less, equal, and round-up power-of-two numbers. May generate tails.
+                factors = [x for x in get_pow2s(self.product) if x <= max_factor]
+            else:
+                raise RuntimeError("Invalid policy: %s" % policy)
 
-    def _generate_space(self, now, tmp_stack):
+            # Enforce the product of all split factors equals to the axis length
+            no_tail = kwargs.get("no_tail", policy == 'factors')
+
+            # Generate split entity by enumerating candidate factors.
+            self.factors = factors
+            self._generate_space(0, [None] * (self.num_output - 1), enforce_no_tail=no_tail)
+
+        self.entities = list(filter(fil, self.entities))
+
+    def _generate_space(self, now, tmp_stack, enforce_no_tail=False):
         """Generate space by DFS"""
-        if now == self.num_outputs - 1:
-            size = np.prod(tmp_stack, dtype=np.int64)
-            if self.product % size == 0:
-                first = int(self.product // int(size))
-                self.entities.append(SplitEntity([first] + tmp_stack[::-1]))
+        if now == self.num_output - 1:
+            if not enforce_no_tail or self.product % np.prod(tmp_stack, dtype=np.int64) == 0:
+                self.entities.append(SplitEntity([-1] + tmp_stack[::-1]))
         else:
-            for factor in self.factors[now]:
+            for factor in self.factors:
                 tmp_stack[now] = factor
-                self._generate_space(now + 1, tmp_stack)
+                self._generate_space(now + 1, tmp_stack, enforce_no_tail)
 
     @staticmethod
     def get_num_output(axes, policy, **kwargs):
@@ -219,7 +239,7 @@ def get_num_output(axes, policy, **kwargs):
 
     def __repr__(self):
         return ("Split(policy=%s, product=%d, num_outputs=%d) len=%d" %
-                (self.policy, self.product, self.num_outputs, len(self)))
+                (self.policy, self.product, self.num_output, len(self)))
 
 
 class SplitEntity(object):
@@ -609,7 +629,7 @@ def axis(var):
 
     reduce_axis = axis
 
-    def define_split(self, name, axis, policy='all', **kwargs):
+    def define_split(self, name, axis, policy='factors', **kwargs):
         """Define a new tunable knob which splits an axis into a list of axes
 
         Parameters
@@ -620,11 +640,22 @@ def define_split(self, name, axis, policy='all', **kwargs):
             axis to split
         policy: str
             name of policy.
-            If is 'all', the tuner will try all divisible factors.
-            If is 'candidate', try listed candidate.
+            If is 'factors', the tuner will try all divisible factors.
+            If is 'power2', the tuner will try power-of-two factors less or equal to the length.
+            If is 'verbose', the tuner will try all candidates in above two policies.
+            If is 'candidate', try given candidates.
         kwargs: dict
             extra arguments for policy
-            see examples below for how to use filter
+            max_factor: int
+                the maximum split factor.
+            filter: function(int) -> bool
+                see examples below for how to use filter.
+            num_outputs: int
+                the total number of axis after split.
+            no_tail: bool
+                should we only include divisible numbers as split factors.
+            candidate: list
+                (policy=candidate) manual candidate list.
 
         Examples
         --------
@@ -632,7 +663,7 @@ def define_split(self, name, axis, policy='all', **kwargs):
         >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])
 
         >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
-        >>> cfg.define_split('tile_y', y, policy='all', filter=lambda x: x.size[-1] <= 4)
+        >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4)
         """
         axes = [axis]
         return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
@@ -944,7 +975,7 @@ def fallback_split(self, name, constraints):
         """
         space = self.space_map[name]
         assert isinstance(space, SplitSpace)
-        assert len(constraints) == space.num_outputs
+        assert len(constraints) == space.num_output
 
         # '-1' means no constraint
         constraints = [x if x != -1 else 1e10 for x in constraints]
@@ -952,7 +983,7 @@ def fallback_split(self, name, constraints):
         entity = self._entity_map[name]
         now = space.product
 
-        for i in reversed(range(space.num_outputs)):
+        for i in reversed(range(space.num_output)):
             factors = get_factors(now)
 
             find = len(factors) - 1
diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py
index af9c5bebb998..072c187ee294 100644
--- a/topi/python/topi/arm_cpu/bitserial_conv2d.py
+++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py
@@ -82,11 +82,11 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh
     ci, kh, kw = cfg.reduce_axis(CI_packed), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
     ib, kb = cfg.reduce_axis(activation_bits), cfg.reduce_axis(weight_bits)
 
-    co, vc = cfg.define_split('tile_co', co, policy='all', num_outputs=2,
+    co, vc = cfg.define_split('tile_co', co, num_outputs=2,
                               filter=lambda x: x.size[-1] == 8)
-    oh, vh = cfg.define_split('tile_oh', oh, policy='all', num_outputs=2,
+    oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2,
                               filter=lambda x: x.size[-1] >= 2)
-    ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2,
+    ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2,
                               filter=lambda x: x.size[-1] >= 2)
     ci_o, ci_i = cfg.define_split("tile_ci", ci, num_outputs=2,
                                   filter=lambda x: x.size[-1] == 8 or x.size[-1] == 16)
@@ -278,13 +278,13 @@ def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec,
         s[data_pad].compute_inline()
 
     _, h, _, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
     oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
     s[data_vec].parallel(oh)
 
     #### Schedule kernel packing
     co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
     oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
     s[kernel_vec].parallel(oco)
 
diff --git a/topi/python/topi/arm_cpu/bitserial_dense.py b/topi/python/topi/arm_cpu/bitserial_dense.py
index b07930f1577d..0148cfba3f38 100644
--- a/topi/python/topi/arm_cpu/bitserial_dense.py
+++ b/topi/python/topi/arm_cpu/bitserial_dense.py
@@ -66,10 +66,10 @@ def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtyp
     x, y = cfg.axis(batch), cfg.axis(out_dim)
     db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(in_dim)
 
-    ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2,
+    ko, ki = cfg.define_split('tile_k', k, num_outputs=2,
                               filter=lambda xx: xx.size[-1] == 8 or xx.size[-1] == 16)
-    xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2)
-    yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2,
+    xo, xi = cfg.define_split('tile_x', x, num_outputs=2)
+    yo, yi = cfg.define_split('tile_y', y, num_outputs=2,
                               filter=lambda xx: xx.size[-1] == 8)
 
     cfg.define_reorder('reorder_0', [yo, xo, ko, xi, wb, db, yi, ki],
diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py
index 21abdf0de1ec..c04ff01eec8b 100644
--- a/topi/python/topi/nn/bitserial_conv2d.py
+++ b/topi/python/topi/nn/bitserial_conv2d.py
@@ -254,11 +254,11 @@ def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits,
     ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
     ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)
 
-    co, vc = cfg.define_split('tile_co', co, policy='all', num_outputs=2,
+    co, vc = cfg.define_split('tile_co', co, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
-    oh, vh = cfg.define_split('tile_oh', oh, policy='all', num_outputs=2,
+    oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
-    ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2,
+    ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
     cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll')
 
@@ -358,11 +358,11 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits,
     ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
     ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits)
 
-    co, vc = cfg.define_split('tile_co', co, policy='all', num_outputs=2,
+    co, vc = cfg.define_split('tile_co', co, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
-    oh, vh = cfg.define_split('tile_oh', oh, policy='all', num_outputs=2,
+    oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
-    ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2,
+    ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2,
                               filter=lambda x: max(x.size[1:]) <= 16)
     cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll')
     cfg.define_reorder("reorder_0",
diff --git a/topi/python/topi/nn/bitserial_dense.py b/topi/python/topi/nn/bitserial_dense.py
index 5d47b2974a7c..b28b3a41555a 100644
--- a/topi/python/topi/nn/bitserial_dense.py
+++ b/topi/python/topi/nn/bitserial_dense.py
@@ -95,9 +95,9 @@ def bitserial_dense_default(cfg, data, weight, data_bits, weight_bits, pack_dtyp
     ######## Search space
     x, y = cfg.axis(X), cfg.axis(Y)
     db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
-    ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2)
-    yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2)
-    xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2)
+    ko, ki = cfg.define_split('tile_k', k, num_outputs=2)
+    yo, yi = cfg.define_split('tile_y', y, num_outputs=2)
+    xo, xi = cfg.define_split('tile_x', x, num_outputs=2)
 
     cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi],
                        policy='candidate', candidate=[
diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py
index 905383fdb802..536386ab2fb0 100644
--- a/topi/python/topi/x86/bitserial_conv2d.py
+++ b/topi/python/topi/x86/bitserial_conv2d.py
@@ -99,7 +99,7 @@ def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
         s[data_pad].compute_inline()
 
     _, _, h, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
     oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
     if cfg["tile_ah"].size[1] == 1:
         oaxis = oh
@@ -116,7 +116,7 @@ def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
 
     ##### Schedule Kenerl bitpacking
     co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
     oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
     if cfg["tile_bco"].size[1] == 1:
         oaxis = oco
@@ -185,13 +185,13 @@ def _schedule_bitserial_conv2d_nhwc(cfg, s, data_q, data_pad, data_vec,
         s[data_pad].compute_inline()
 
     _, h, _, _, _, _, _ = s[data_vec].op.axis
-    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
     oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
     s[data_vec].parallel(oh)
 
     ##### Schedule kernel packing
     co, _, _, _, _, _ = s[kernel_vec].op.axis
-    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
+    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
     oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
     s[kernel_vec].parallel(oco)
 
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index d5bc5adcec85..6f134ea45781 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -95,7 +95,7 @@ def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout):
         if _is_int8_hw_support(data.dtype, kernel.dtype, target):
             oc_chunk, k_ic, kh, kw, k_ic_f, oc_bn, k_ic_s = kshape
             ic = ic_chunk*ic_bn
-            assert ic == k_ic*k_ic_f*kic_s
+            assert ic == k_ic*k_ic_f*k_ic_s
         else:
             oc_chunk, k_ic_chunk, kh, kw, k_ic_bn, oc_bn = kshape
             assert ic_chunk == k_ic_chunk