From 5ef694f4a9c6db3a8a90ead91fec285a2b929d57 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-38-96.us-west-2.compute.internal>
Date: Tue, 10 Mar 2020 23:13:04 +0000
Subject: [PATCH] Make 4D output work for task extraction.

---
 python/tvm/contrib/util.py           | 31 ++++++++++++++++++++++++++++
 topi/python/topi/cuda/conv2d_int8.py |  8 ++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py
index 2ebe175e8160..25d361585f96 100644
--- a/python/tvm/contrib/util.py
+++ b/python/tvm/contrib/util.py
@@ -166,3 +166,34 @@ def which(exec_name):
         if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
             return full_path
     return None
+
+
+def get_lower_ir(s):
+    """Lower a schedule for debugging, finding its input placeholders itself.
+
+    Parameters
+    ----------
+    s : Schedule
+        Schedule of a (fused) subgraph; ``s.outputs`` is walked to placeholders.
+
+    Returns
+    -------
+    ir
+        The result of ``lower(s, inputs, simple_mode=True)``.
+    """
+    from tvm.te import tensor
+    from tvm.driver.build_module import lower
+
+    inputs, seen = [], []  # ``seen`` is a list: membership needs only ``==``
+    def find_all(op):
+        if op in seen:  # shared producer already reached via another path
+            return
+        seen.append(op)
+        if isinstance(op, tensor.PlaceholderOp):
+            inputs.append(op.output(0))
+        else:
+            for x in op.input_tensors:
+                find_all(x.op)
+    for out in s.outputs:
+        find_all(out)
+    return lower(s, inputs, simple_mode=True)
diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index a04f4091126a..66fc927af33d 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -217,7 +217,13 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output):
         output = s.outputs[0].output(0)
 
     # tile and bind spatial axes
-    n, f, y, x, c = s[output].op.axis
+    if len(s[output].op.axis) == 5:
+        n, f, y, x, c = s[output].op.axis
+    else:
+        # During auto-tuning task extraction the output is 4D.  Auto-tuning tasks are created
+        # from scratch, so the actual tuning still happens on the 5D output.
+        n, f, y, x = s[output].op.axis
+
     cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
     cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
     cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)