From 1da5012b09caac3d5a139a41041958f1e04088f3 Mon Sep 17 00:00:00 2001 From: Wang Date: Sat, 28 Jul 2018 22:10:47 -0700 Subject: [PATCH 01/43] Refactor NMS --- topi/python/topi/vision/nms.py | 79 +++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index a41ee5b50089..9e3614a73d46 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -2,7 +2,67 @@ """Non-maximum suppression operator""" import tvm -from tvm import api +from tvm import api, hybrid + +@hybrid.script +def rearrange_out(input, output): + """Rearrange nms output to move all valid entries to top. + + Parameters + ---------- + input : Tensor or Var or numpy NDArray + NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + output : Tensor or Var or numpy NDArray + Transformed NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + It should filled with invalid entry -1. + """ + batch_size = input.shape[0] + num_anchors = input.shape[1] + elem_length = input.shape[2] + for i in range(batch_size): + valid_idx = 0 + for j in range(num_anchors): + if input[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = input[i, j, k] + valid_idx += 1 + + +@hybrid.script +def get_valid_counts(data, inter_data, valid_count, score_threshold): + """Get valid count of bounding boxes given a score threshlod. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : Tensor or Var or numpy NDArray + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + inter_data : Tensor or Var or numpy NDArray + Intermediate output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + valid_count : Tensor or Var or numpy NDArray + 1-D tensor for valid number of boxes. + + score_threshold : float + Lower limit of score for valid bounding boxes. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + for i in range(batch_size): + valid_count[i] = 0 + inter_idx = 0 + for j in range(num_anchors): + score = data[i, j, 1] + if score >= score_threshold: + valid_count[i] += 1 + inter_data[i, inter_idx] = data[i, j] + inter_idx += 1 + def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): """Low level IR routing for transform location in multibox_detection operator. @@ -107,12 +167,13 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): @tvm.target.generic_func -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, + do_rearrange=False): """Non-maximum suppression operator for object detection. Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -120,15 +181,18 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + nms_threshold : optional, float Non-maximum suppression threshold. - force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + nms_topk : optional, int Keep maximum top k detections before nms, -1 for no limit. + do_rearrange : optional, boolean + Whether to move all valid bounding boxes to the top. 
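(For intuition, a minimal NumPy sketch of the rearrangement that do_rearrange requests — illustrative names only, not part of the patch: rows whose class_id in column 0 is non-negative are compacted to the top and the remainder is filled with -1.)

    import numpy as np

    def rearrange_ref(nms_out):
        # nms_out: (batch_size, num_anchors, 6); a row is valid when class_id >= 0
        out = np.full_like(nms_out, -1.0)
        for i in range(nms_out.shape[0]):
            valid = nms_out[i][nms_out[i, :, 0] >= 0]
            out[i, :valid.shape[0]] = valid
        return out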
+ Returns ------- out : tvm.Tensor @@ -189,4 +253,7 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) dtype="float32", in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], tag="nms") + if do_rearrange: + normalized_out = tvm.compute(out.shape, lambda *index: -1) + hybrid.parse(rearrange_out, [out, normalized_out]) return out From 1eb27a8f59971c53997941f2968a69acecabb2d0 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 4 Sep 2018 10:54:33 -0700 Subject: [PATCH 02/43] Avoid using function call for hybrid frontend --- topi/python/topi/vision/nms.py | 122 ++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 8 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 9e3614a73d46..f7b090b88397 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -17,18 +17,22 @@ def rearrange_out(input, output): output : Tensor or Var or numpy NDArray Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. - It should filled with invalid entry -1. """ batch_size = input.shape[0] num_anchors = input.shape[1] elem_length = input.shape[2] + for i in range(batch_size): + for j in range(num_anchors): + for k in range(elem_length): + output[i, j, k] = -1.0 + for i in range(batch_size): valid_idx = 0 for j in range(num_anchors): if input[i, j, 0] >= 0: for k in range(elem_length): output[i, valid_idx, k] = input[i, j, k] - valid_idx += 1 + valid_idx = valid_idx + 1 @hybrid.script @@ -61,7 +65,7 @@ def get_valid_counts(data, inter_data, valid_count, score_threshold): if score >= score_threshold: valid_count[i] += 1 inter_data[i, inter_idx] = data[i, j] - inter_idx += 1 + inter_idx = inter_idx + 1 def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): @@ -165,6 +169,101 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0 return ib.get() +@hybrid.script +def calculate_iou(inter_data, batch_idx, box_a_idx, box_b_idx, box_start_idx): + a_t = inter_data[batch_idx, box_a_idx, box_start_idx + 1] + a_b = inter_data[batch_idx, box_a_idx, box_start_idx + 3] + a_l = inter_data[batch_idx, box_a_idx, box_start_idx] + a_r = inter_data[batch_idx, box_a_idx, box_start_idx + 2] + b_t = inter_data[batch_idx, box_b_idx, box_start_idx + 1] + b_b = inter_data[batch_idx, box_b_idx, box_start_idx + 3] + b_l = inter_data[batch_idx, box_b_idx, box_start_idx] + b_r = inter_data[batch_idx, box_b_idx, box_start_idx + 2] + w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) + h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) + i = h * w + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - i + return 0.0 if u <= 0 else i / u + +@hybrid.script +def hybrid_nms(data, sorted_index, valid_count, output, iou_threshold, force_suppress, nms_topk): + """Hybrid routing for non-maximum suppression. + + Parameters + ---------- + data: Tensor or Var or numpy NDArray + Bounding boxes with class and score. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + sorted_index : Tensor or Var or numpy NDArray + Bounding box indexes sorted by score, with shape + [batch_size, num_anchors]. + + valid_count : Tensor or Var or numpy NDArray + 1-D tensor for valid number of boxes. + + output : Tensor or Var or numpy NDArray + NMS output tensor. + + iou_threshold : float + Overlapping(IoU) threshold to suppress object with smaller score. + + force_suppress : boolean + Whether to suppress all detections regardless of class_id. 
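(The overlap test above is the standard intersection-over-union; a standalone NumPy-style reference, assuming boxes laid out as [left, top, right, bottom], could read:)

    def iou_ref(a, b):
        # a, b: [left, top, right, bottom]
        w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
        h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = w * h
        union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
        return 0.0 if union <= 0.0 else inter / union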
+ + nms_topk : int + Keep maximum top k detections before nms, -1 for no limit. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + for i in parallel(batch_size): + if iou_threshold > 0 and valid_count[i] > 0: + # Reorder output + nkeep = nms_topk if 0 < nms_topk < valid_count[i] else valid_count[i] + for j in range(nkeep): + for k in range(box_data_length): + output[i, j, k] = data[i, sorted_index[i, j], k] + if 0 < nms_topk < valid_count[i]: + for j in range(valid_count[i] - nkeep): + for k in range(box_data_length): + output[i, j + nkeep, k] = data[i, j + nkeep, k] + # Apply nms + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + for k in range(valid_count[i]): + if k > j and output[i, k, 0] >= 0 and (force_suppress + or output[i, j, 0] + == output[i, k, 0]): + #iou = calculate_iou(output, i, j, k, 2) + inter_data = output + batch_idx = i + box_a_idx, box_b_idx = j, k + box_start_idx = 2 + a_t = inter_data[batch_idx, box_a_idx, box_start_idx + 1] + a_b = inter_data[batch_idx, box_a_idx, box_start_idx + 3] + a_l = inter_data[batch_idx, box_a_idx, box_start_idx] + a_r = inter_data[batch_idx, box_a_idx, box_start_idx + 2] + b_t = inter_data[batch_idx, box_b_idx, box_start_idx + 1] + b_b = inter_data[batch_idx, box_b_idx, box_start_idx + 3] + b_l = inter_data[batch_idx, box_b_idx, box_start_idx] + b_r = inter_data[batch_idx, box_b_idx, box_start_idx + 2] + w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) + h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) + i = h * w + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - i + iou = 0.0 if u <= 0 else i / u + if iou >= iou_threshold: + output[i, k, 0] = -1.0 + else: + for j in range(valid_count[i]): + for k in range(box_data_length): + output[i, j, k] = data[i, j, k] + # Set invalid entry to be -1 + for j in range(num_anchors - valid_count[i]): + for k in range(box_data_length): + output[i, j + valid_count[i], k] = -1.0 + @tvm.target.generic_func def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, @@ -244,16 +343,23 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") + d_plc = tvm.placeholder(data.shape, name="nms_out", dtype="float32") + out = tvm.placeholder(data.shape, name="nms_out", dtype="float32") + inter_func = hybrid.parse(hybrid_nms, [data, sort_tensor, valid_count, out, nms_threshold, force_suppress, nms_topk]) + print(inter_func) out = \ tvm.extern(data.shape, [data, sort_tensor, valid_count], - lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), + lambda ins, outs: hybrid.parse( + hybrid_nms, + [ins[0], ins[1], ins[2], outs[0], nms_threshold, + force_suppress, nms_topk]), dtype="float32", in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], tag="nms") if do_rearrange: - normalized_out = tvm.compute(out.shape, lambda *index: -1) - hybrid.parse(rearrange_out, [out, normalized_out]) + out = tvm.extern(out.shape, [out], + lambda ins, outs: hybrid.parse( + rearrange_out, [ins[0], outs[0]]), + dtype="float32",) return out From 1e2cdb544f7120ae14b5e54825f76faa78294111 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 27 Nov 2018 16:58:51 -0800 Subject: [PATCH 03/43] Modify nms --- topi/python/topi/vision/nms.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 
f7b090b88397..a2e8f50c1056 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -324,7 +324,6 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) @@ -343,23 +342,11 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") - d_plc = tvm.placeholder(data.shape, name="nms_out", dtype="float32") - out = tvm.placeholder(data.shape, name="nms_out", dtype="float32") - inter_func = hybrid.parse(hybrid_nms, [data, sort_tensor, valid_count, out, nms_threshold, force_suppress, nms_topk]) - print(inter_func) - out = \ - tvm.extern(data.shape, - [data, sort_tensor, valid_count], - lambda ins, outs: hybrid.parse( - hybrid_nms, - [ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk]), - dtype="float32", - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], - tag="nms") + out = tvm.placeholder(data.shape, dtype=data.dtype) + out = hybrid_nms(data, sort_tensor, valid_count, out, + tvm.convert(nms_threshold), tvm.convert(force_suppress), + tvm.convert(nms_topk)) if do_rearrange: - out = tvm.extern(out.shape, [out], - lambda ins, outs: hybrid.parse( - rearrange_out, [ins[0], outs[0]]), - dtype="float32",) + out = rearrange_out(out) + return out From d89286772336beebd16a559873e2335d9af1aa68 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 10 Dec 2018 17:02:12 -0800 Subject: [PATCH 04/43] Add box_nms --- topi/python/topi/vision/nms.py | 344 ++++++++++++++++----------------- 1 file changed, 166 insertions(+), 178 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index a2e8f50c1056..7569ae636dca 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -5,19 +5,25 @@ from tvm import api, hybrid @hybrid.script -def rearrange_out(input, output): +def rearrange_out(input): """Rearrange nms output to move all valid entries to top. Parameters ---------- - input : Tensor or Var or numpy NDArray + input : tvm.Tensor or numpy NDArray NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. - output : Tensor or Var or numpy NDArray + Returns + ------- + output : tvm.Tensor or numpy NDArray Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. """ + output = output_tensor((input.shape[0], + input.shape[1], + input.shape[2],), + input.dtype) batch_size = input.shape[0] num_anchors = input.shape[1] elem_length = input.shape[2] @@ -33,226 +39,136 @@ def rearrange_out(input, output): for k in range(elem_length): output[i, valid_idx, k] = input[i, j, k] valid_idx = valid_idx + 1 + return output @hybrid.script -def get_valid_counts(data, inter_data, valid_count, score_threshold): +def get_valid_counts(data, score_threshold): """Get valid count of bounding boxes given a score threshlod. Also moves valid boxes to the top of input data. Parameters ---------- - data : Tensor or Var or numpy NDArray + data : tvm.Tensor or numpy NDArray Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. - inter_data : Tensor or Var or numpy NDArray - Intermediate output. 
3-D tensor with shape - [batch_size, num_anchors, 6]. + score_threshold : tvm.const + Lower limit of score for valid bounding boxes. - valid_count : Tensor or Var or numpy NDArray - 1-D tensor for valid number of boxes. + Returns + ------- + out_tensor : tvm.Tensor or numpy NDArray + Rearranged data tensor. - score_threshold : float - Lower limit of score for valid bounding boxes. + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. """ batch_size = data.shape[0] num_anchors = data.shape[1] + box_data_length = data.shape[2] + valid_count = output_tensor((batch_size,), "int32") + out_tensor = output_tensor((batch_size, + num_anchors, + box_data_length), + data.dtype) for i in range(batch_size): valid_count[i] = 0 inter_idx = 0 for j in range(num_anchors): score = data[i, j, 1] if score >= score_threshold: + for k in range(box_data_length): + out_tensor[i, inter_idx, k] = data[i, j, k] valid_count[i] += 1 - inter_data[i, inter_idx] = data[i, j] inter_idx = inter_idx + 1 + return valid_count, out_tensor -def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): - """Low level IR routing for transform location in multibox_detection operator. - - Parameters - ---------- - data: Buffer - Buffer of output boxes with class and score. - - sort_result : Buffer - Buffer of output box indexes sorted by score. - - valid_count : Buffer - Buffer of number of valid output boxes. - - out : Buffer - Output buffer. - - nms_threshold : float - Non-maximum suppression threshold. - - force_suppress : boolean - Whether to suppress all detections regardless of class_id. - - nms_topk : int - Keep maximum top k detections before nms, -1 for no limit. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - def calculate_overlap(out_tensor, box_a_idx, box_b_idx): - """Calculate overlap of two boxes. 
- """ - w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx])) - h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) - i = w * h - u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ - (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ - (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ - (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i - return tvm.expr.Select(u <= 0.0, 0.0, i / u) - - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_sort_result = ib.buffer_ptr(sort_result) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - batch_size = out.shape[0] - num_anchors = out.shape[1] - - nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) - nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): - # Reorder output - nkeep = tvm.if_then_else( - tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), - nms_topk, p_valid_count[n]) - with ib.for_range(0, nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = p_data[(n * num_anchors * 6 - + p_sort_result[n * num_anchors + l] * 6 + m)] - with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])): - with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] = p_data[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] - # Apply nms - with ib.for_range(0, p_valid_count[n], name="l") as l: - offset_l = l * 6 - with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0): - with ib.for_range(0, p_valid_count[n], name="m") as m: - offset_m = m * 6 - with ib.if_scope(tvm.all(m > l, p_out[n * num_anchors * 6 - + offset_m] >= 0)): - with ib.if_scope(tvm.any(force_suppress_node > 0, - p_out[n * num_anchors * 6 + offset_l] == - p_out[n * num_anchors * 6 + offset_m])): - # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_l + 2, - n * num_anchors * 6 + offset_m + 2) - with ib.if_scope(iou >= nms_threshold): - p_out[n * num_anchors * 6 + offset_m] = -1.0 - with ib.else_scope(): - with ib.for_range(0, p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = p_data[n * num_anchors * 6 + l * 6 + m] - # Set invalid entry to be -1 - with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0 - return ib.get() - -@hybrid.script -def calculate_iou(inter_data, batch_idx, box_a_idx, box_b_idx, box_start_idx): - a_t = inter_data[batch_idx, box_a_idx, box_start_idx + 1] - a_b = inter_data[batch_idx, box_a_idx, box_start_idx + 3] - a_l = inter_data[batch_idx, box_a_idx, box_start_idx] - a_r = inter_data[batch_idx, box_a_idx, box_start_idx + 2] - b_t = inter_data[batch_idx, box_b_idx, box_start_idx + 1] - b_b = inter_data[batch_idx, box_b_idx, box_start_idx + 3] - b_l = 
inter_data[batch_idx, box_b_idx, box_start_idx] - b_r = inter_data[batch_idx, box_b_idx, box_start_idx + 2] - w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) - h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) - i = h * w - u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - i - return 0.0 if u <= 0 else i / u @hybrid.script -def hybrid_nms(data, sorted_index, valid_count, output, iou_threshold, force_suppress, nms_topk): +def hybrid_nms(data, sorted_index, valid_count, + iou_threshold, force_suppress, topk): """Hybrid routing for non-maximum suppression. Parameters ---------- - data: Tensor or Var or numpy NDArray + data: tvm.Tensor or numpy NDArray Bounding boxes with class and score. 3-D tensor with shape [batch_size, num_anchors, 6]. - sorted_index : Tensor or Var or numpy NDArray + sorted_index : tvm.Tensor or numpy NDArray Bounding box indexes sorted by score, with shape [batch_size, num_anchors]. - valid_count : Tensor or Var or numpy NDArray + valid_count : tvm.Tensor or numpy NDArray 1-D tensor for valid number of boxes. - output : Tensor or Var or numpy NDArray - NMS output tensor. - - iou_threshold : float + iou_threshold : tvm.const Overlapping(IoU) threshold to suppress object with smaller score. - force_suppress : boolean + force_suppress : tvm.const Whether to suppress all detections regardless of class_id. - nms_topk : int + topk : tvm.const Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. """ batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] + output = output_tensor((batch_size, + num_anchors, + box_data_length,), + data.dtype) for i in parallel(batch_size): - if iou_threshold > 0 and valid_count[i] > 0: - # Reorder output - nkeep = nms_topk if 0 < nms_topk < valid_count[i] else valid_count[i] - for j in range(nkeep): - for k in range(box_data_length): - output[i, j, k] = data[i, sorted_index[i, j], k] - if 0 < nms_topk < valid_count[i]: - for j in range(valid_count[i] - nkeep): + if iou_threshold > 0: + if valid_count[i] > 0: + # Reorder output + nkeep = valid_count[i] + if topk > 0: + if topk < valid_count[i]: + nkeep = topk + for j in range(nkeep): for k in range(box_data_length): - output[i, j + nkeep, k] = data[i, j + nkeep, k] + output[i, j, k] = data[i, sorted_index[i, j], k] + if topk > 0: + if topk < valid_count[i]: + for j in range(valid_count[i] - nkeep): + for k in range(box_data_length): + output[i, j + nkeep, k] = data[i, j + nkeep, k] # Apply nms for j in range(valid_count[i]): if output[i, j, 0] >= 0: for k in range(valid_count[i]): - if k > j and output[i, k, 0] >= 0 and (force_suppress - or output[i, j, 0] - == output[i, k, 0]): - #iou = calculate_iou(output, i, j, k, 2) - inter_data = output + check_iou = 0 + if k > j: + if output[i, k, 0] >= 0: + if force_suppress: + check_iou = 1 + elif output[i, j, 0] == output[i, k, 0]: + check_iou = 1 + if check_iou: batch_idx = i - box_a_idx, box_b_idx = j, k + box_a_idx = j + box_b_idx = k box_start_idx = 2 - a_t = inter_data[batch_idx, box_a_idx, box_start_idx + 1] - a_b = inter_data[batch_idx, box_a_idx, box_start_idx + 3] - a_l = inter_data[batch_idx, box_a_idx, box_start_idx] - a_r = inter_data[batch_idx, box_a_idx, box_start_idx + 2] - b_t = inter_data[batch_idx, box_b_idx, box_start_idx + 1] - b_b = inter_data[batch_idx, box_b_idx, box_start_idx + 3] - b_l = inter_data[batch_idx, box_b_idx, box_start_idx] - b_r = inter_data[batch_idx, box_b_idx, box_start_idx 
+ 2] + a_t = output[batch_idx, box_a_idx, box_start_idx + 1] + a_b = output[batch_idx, box_a_idx, box_start_idx + 3] + a_l = output[batch_idx, box_a_idx, box_start_idx] + a_r = output[batch_idx, box_a_idx, box_start_idx + 2] + b_t = output[batch_idx, box_b_idx, box_start_idx + 1] + b_b = output[batch_idx, box_b_idx, box_start_idx + 3] + b_l = output[batch_idx, box_b_idx, box_start_idx] + b_r = output[batch_idx, box_b_idx, box_start_idx + 2] w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) - i = h * w - u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - i - iou = 0.0 if u <= 0 else i / u + area = h * w + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + iou = 0.0 if u <= 0.0 else area / u if iou >= iou_threshold: output[i, k, 0] = -1.0 else: @@ -263,11 +179,12 @@ def hybrid_nms(data, sorted_index, valid_count, output, iou_threshold, force_sup for j in range(num_anchors - valid_count[i]): for k in range(box_data_length): output[i, j + valid_count[i], k] = -1.0 + return output @tvm.target.generic_func -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, - do_rearrange=False): +def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, + topk=-1, do_rearrange=False): """Non-maximum suppression operator for object detection. Parameters @@ -280,13 +197,13 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : optional, float + iou_threshold : optional, float Non-maximum suppression threshold. force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : optional, int + topk : optional, int Keep maximum top k detections before nms, -1 for no limit. do_rearrange : optional, boolean @@ -305,12 +222,12 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + topk = -1 + out = nms(data, valid_count, iou_threshold, force_suppress, topk) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -342,11 +259,82 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") - out = tvm.placeholder(data.shape, dtype=data.dtype) - out = hybrid_nms(data, sort_tensor, valid_count, out, - tvm.convert(nms_threshold), tvm.convert(force_suppress), - tvm.convert(nms_topk)) + out = hybrid_nms(data, sort_tensor, valid_count, + tvm.const(iou_threshold, dtype="float32"), + tvm.const(force_suppress, dtype="bool"), + tvm.const(topk, dtype="int32")) if do_rearrange: out = rearrange_out(out) return out + +@tvm.target.generic_func +def box_nms(data, iou_threshold=0.5, score_threshold=0, + force_suppress=True, topk=-1): + """Apply non-maximum suppression to input. + Comparing to nms, this function takes score_threshold + as argument and automatically filters valid anchor boxes. 
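(In effect box_nms chains the two steps defined earlier; an equivalent hand-written call sequence, illustrative only:)

    valid_count, filtered = get_valid_counts(data, score_threshold)
    out = nms(filtered, valid_count, iou_threshold, force_suppress, topk, do_rearrange=True)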
+ + Parameters + ---------- + data : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + The last dimension should be in format of + [class_id, score, box_left, box_top, box_right, box_bottom]. + + iou_threshold : optional, float + Non-maximum suppression threshold. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + force_suppress : optional, boolean + Whether to suppress all detections regardless of class_id. + + topk : optional, int + Keep maximum top k detections before nms, -1 for no limit. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + """ + score_threshold_const = tvm.const(score_threshold, + dtype="float32") + valid_count, out = get_valid_counts(data, score_threshold_const) + return nms(out, valid_count, iou_threshold, + force_suppress, topk, True) + + +if __name__ == '__main__': + import tvm + import topi + import numpy as np + + score_threshold = 0.13 + overlap_thresh = 0.5 + + # This works. + # Here we first call get_valid_counts with np data, + # then build nms function and feed data into it. + np_data = np.random.uniform(size=(1, 5000, 6)).astype("float32") + np_valid_count, np_inter_out = topi.vision.get_valid_counts(np_data, score_threshold) + data = tvm.placeholder((1, 5000, 6), name="data", dtype="float32") + valid_count = tvm.placeholder((1,), name="valid_count", dtype="int32") + result = topi.vision.nms(data, valid_count, iou_threshold=overlap_thresh, force_suppress=True, do_rearrange=True) + st = tvm.create_schedule(result.op) + f = tvm.build(st, [data, valid_count, result], "llvm") + ctx = tvm.cpu(0) + np_out = np.zeros(np_inter_out.shape) + aa = tvm.nd.array(np_inter_out.astype(data.dtype), ctx) + bb = tvm.nd.array(np_valid_count.astype(valid_count.dtype), ctx) + cc = tvm.nd.array(np_out.astype(result.dtype), ctx) + f(aa, bb, cc) + + + # This will fail + # We combine get_valid_counts and nms into box_nms + data = tvm.placeholder((1, 5000, 6), name="data", dtype="float32") + result = topi.vision.box_nms(data, iou_threshold=overlap_thresh, force_suppress=True, score_threshold=score_threshold) + st = tvm.create_schedule(result.op) + f = tvm.build(st, [data, result], "llvm") From c19526b79f06206fc4a9351c90f1c03e884acee4 Mon Sep 17 00:00:00 2001 From: Wang Date: Thu, 13 Dec 2018 14:24:36 -0800 Subject: [PATCH 05/43] Add test for get_valid_counts --- topi/python/topi/cuda/nms.py | 5 +- topi/python/topi/generic/vision.py | 17 +++ topi/python/topi/vision/nms.py | 174 +++++++++----------------- topi/tests/python/test_topi_vision.py | 51 +++++++- 4 files changed, 132 insertions(+), 115 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index e0d71559f1a0..89c0da381aae 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Non-maximum suppression operator""" import math import tvm @@ -182,7 +182,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): @nms.register(["cuda", "gpu"]) -def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, + do_rearrange=False): """Non-maximum suppression operator for object detection. 
Parameters diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 76e8545bfc52..bfd6c55d533a 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -36,6 +36,23 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) +@tvm.target.generic_func +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of nms + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + @tvm.target.generic_func def schedule_nms(outs): """Schedule for non-maximum suppression diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 7569ae636dca..0b113733f8da 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -1,16 +1,17 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches """Non-maximum suppression operator""" import tvm from tvm import api, hybrid @hybrid.script -def rearrange_out(input): - """Rearrange nms output to move all valid entries to top. +def hybrid_rearrange_out(data): + """Hybrid routine to rearrange nms output to + move all valid entries to top. Parameters ---------- - input : tvm.Tensor or numpy NDArray + data : tvm.Tensor or numpy NDArray NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. @@ -20,32 +21,32 @@ def rearrange_out(input): Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. """ - output = output_tensor((input.shape[0], - input.shape[1], - input.shape[2],), - input.dtype) - batch_size = input.shape[0] - num_anchors = input.shape[1] - elem_length = input.shape[2] - for i in range(batch_size): - for j in range(num_anchors): - for k in range(elem_length): - output[i, j, k] = -1.0 + output = output_tensor((data.shape[0], + data.shape[1], + data.shape[2],), + data.dtype) + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] - for i in range(batch_size): + for i in parallel(batch_size): valid_idx = 0 for j in range(num_anchors): - if input[i, j, 0] >= 0: + if data[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = data[i, j, k] + valid_idx += 1 + if j >= valid_idx: for k in range(elem_length): - output[i, valid_idx, k] = input[i, j, k] - valid_idx = valid_idx + 1 + output[i, j, k] = -1.0 return output @hybrid.script -def get_valid_counts(data, score_threshold): - """Get valid count of bounding boxes given a score threshlod. - Also moves valid boxes to the top of input data. +def hybrid_get_valid_counts(data, score_threshold): + """Hybrid routine to get valid count of bounding boxes + given a score threshold. Also moves valid boxes to the + top of input data. 
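(A NumPy reference of the same behaviour — illustrative names, padding the tail with zeros as the unit test below does:)

    import numpy as np

    def get_valid_counts_ref(data, score_threshold):
        # data: (batch_size, num_anchors, 6); the score sits in column 1
        out = np.zeros_like(data)
        valid_count = np.zeros(data.shape[0], dtype="int32")
        for i in range(data.shape[0]):
            keep = data[i][data[i, :, 1] >= score_threshold]
            out[i, :keep.shape[0]] = keep
            valid_count[i] = keep.shape[0]
        return valid_count, out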
Parameters ---------- @@ -71,7 +72,7 @@ def get_valid_counts(data, score_threshold): num_anchors, box_data_length), data.dtype) - for i in range(batch_size): + for i in parallel(batch_size): valid_count[i] = 0 inter_idx = 0 for j in range(num_anchors): @@ -80,10 +81,33 @@ def get_valid_counts(data, score_threshold): for k in range(box_data_length): out_tensor[i, inter_idx, k] = data[i, j, k] valid_count[i] += 1 - inter_idx = inter_idx + 1 - + inter_idx += 1 return valid_count, out_tensor +@tvm.target.generic_func +def get_valid_counts(data, score_threshold=0): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : tvm.Tensor + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + out_tensor : tvm.Tensor + Rearranged data tensor. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + """ + score_threshold_const = tvm.const(score_threshold, "float") + return hybrid_get_valid_counts(data, score_threshold_const) + @hybrid.script def hybrid_nms(data, sorted_index, valid_count, @@ -129,29 +153,26 @@ def hybrid_nms(data, sorted_index, valid_count, if valid_count[i] > 0: # Reorder output nkeep = valid_count[i] - if topk > 0: - if topk < valid_count[i]: - nkeep = topk + if 0 < topk < nkeep: + nkeep = topk for j in range(nkeep): for k in range(box_data_length): output[i, j, k] = data[i, sorted_index[i, j], k] - if topk > 0: - if topk < valid_count[i]: - for j in range(valid_count[i] - nkeep): - for k in range(box_data_length): - output[i, j + nkeep, k] = data[i, j + nkeep, k] + if 0 < topk < valid_count[i]: + for j in range(valid_count[i] - nkeep): + for k in range(box_data_length): + output[i, j + nkeep, k] = data[i, j + nkeep, k] # Apply nms for j in range(valid_count[i]): if output[i, j, 0] >= 0: for k in range(valid_count[i]): check_iou = 0 - if k > j: - if output[i, k, 0] >= 0: - if force_suppress: - check_iou = 1 - elif output[i, j, 0] == output[i, k, 0]: - check_iou = 1 - if check_iou: + if k > j and output[i, k, 0] >= 0: + if force_suppress: + check_iou = 1 + elif output[i, j, 0] == output[i, k, 0]: + check_iou = 1 + if check_iou > 0: batch_idx = i box_a_idx = j box_b_idx = k @@ -264,77 +285,6 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, tvm.const(force_suppress, dtype="bool"), tvm.const(topk, dtype="int32")) if do_rearrange: - out = rearrange_out(out) + out = hybrid_rearrange_out(out) return out - -@tvm.target.generic_func -def box_nms(data, iou_threshold=0.5, score_threshold=0, - force_suppress=True, topk=-1): - """Apply non-maximum suppression to input. - Comparing to nms, this function takes score_threshold - as argument and automatically filters valid anchor boxes. - - Parameters - ---------- - data : tvm.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. - The last dimension should be in format of - [class_id, score, box_left, box_top, box_right, box_bottom]. - - iou_threshold : optional, float - Non-maximum suppression threshold. - - score_threshold : optional, float - Lower limit of score for valid bounding boxes. - - force_suppress : optional, boolean - Whether to suppress all detections regardless of class_id. - - topk : optional, int - Keep maximum top k detections before nms, -1 for no limit. - - Returns - ------- - out : tvm.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. 
- """ - score_threshold_const = tvm.const(score_threshold, - dtype="float32") - valid_count, out = get_valid_counts(data, score_threshold_const) - return nms(out, valid_count, iou_threshold, - force_suppress, topk, True) - - -if __name__ == '__main__': - import tvm - import topi - import numpy as np - - score_threshold = 0.13 - overlap_thresh = 0.5 - - # This works. - # Here we first call get_valid_counts with np data, - # then build nms function and feed data into it. - np_data = np.random.uniform(size=(1, 5000, 6)).astype("float32") - np_valid_count, np_inter_out = topi.vision.get_valid_counts(np_data, score_threshold) - data = tvm.placeholder((1, 5000, 6), name="data", dtype="float32") - valid_count = tvm.placeholder((1,), name="valid_count", dtype="int32") - result = topi.vision.nms(data, valid_count, iou_threshold=overlap_thresh, force_suppress=True, do_rearrange=True) - st = tvm.create_schedule(result.op) - f = tvm.build(st, [data, valid_count, result], "llvm") - ctx = tvm.cpu(0) - np_out = np.zeros(np_inter_out.shape) - aa = tvm.nd.array(np_inter_out.astype(data.dtype), ctx) - bb = tvm.nd.array(np_valid_count.astype(valid_count.dtype), ctx) - cc = tvm.nd.array(np_out.astype(result.dtype), ctx) - f(aa, bb, cc) - - - # This will fail - # We combine get_valid_counts and nms into box_nms - data = tvm.placeholder((1, 5000, 6), name="data", dtype="float32") - result = topi.vision.box_nms(data, iou_threshold=overlap_thresh, force_suppress=True, score_threshold=score_threshold) - st = tvm.create_schedule(result.op) - f = tvm.build(st, [data, result], "llvm") diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 3c0c3aa854d7..d77520c60cf8 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -8,7 +8,55 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from topi.vision import ssd, nms +from topi.vision import ssd, nms, get_valid_counts + + +def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype("float32") + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score >= score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + data = tvm.placeholder(dshape, name="data", dtype=dtype) + outs = get_valid_counts(data, score_threshold) + s = topi.generic.schedule_multibox_prior(outs) + + tvm_input_data = tvm.nd.array(np_data, ctx) + tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) + tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) + f = tvm.build(s, [data, outs[0], outs[1]], device) + f(tvm_input_data, tvm_out1, tvm_out2) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) + + for device in ['llvm']: + check_device(device) + + +def test_get_valid_counts(): + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.15) + 
verify_get_valid_counts((16, 500, 6), 0.95) +>>>>>>> Add test for get_valid_counts def test_nms(): @@ -274,6 +322,7 @@ def test_proposal(): if __name__ == "__main__": + test_get_valid_counts() test_nms() test_multibox_prior() test_multibox_detection() From 9e0eee78a319fc705574473e4d86ce966093188c Mon Sep 17 00:00:00 2001 From: Wang Date: Sat, 15 Dec 2018 16:10:28 -0800 Subject: [PATCH 06/43] Add missing operators --- nnvm/include/nnvm/top/nn.h | 19 ++- nnvm/include/nnvm/top/tensor.h | 15 +++ nnvm/python/nnvm/frontend/mxnet.py | 62 +++++++++- nnvm/python/nnvm/top/transform.py | 4 + nnvm/python/nnvm/top/vision.py | 24 +++- nnvm/src/top/tensor/elemwise.cc | 109 ++++++++++++++++++ nnvm/src/top/tensor/transform.cc | 85 ++++++++++++++ nnvm/src/top/vision/nms.cc | 53 +++++++++ nnvm/tests/python/compiler/test_top_level4.py | 81 ++++++++++++- topi/python/topi/vision/nms.py | 11 +- topi/tests/python/test_topi_vision.py | 7 +- 11 files changed, 444 insertions(+), 26 deletions(-) diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 143a9548f18a..543820e724bd 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -443,17 +443,28 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { + float score_threshold; + DMLC_DECLARE_PARAMETER(GetValidCountsParam) { + DMLC_DECLARE_FIELD(score_threshold).set_default(0.0) + .describe("Lower limit of score for valid bounding boxes."); + } +}; + struct NMSParam : public dmlc::Parameter { - float nms_threshold; + float iou_threshold; bool force_suppress; - int nms_topk; + int topk; + bool do_rearrange; DMLC_DECLARE_PARAMETER(NMSParam) { - DMLC_DECLARE_FIELD(nms_threshold).set_default(0.5) + DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) .describe("Suppress all detections regardless of class_id."); - DMLC_DECLARE_FIELD(nms_topk).set_default(-1) + DMLC_DECLARE_FIELD(topk).set_default(-1) .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(do_rearrange).set_default(false) + .describe("Whether to move all valid bounding boxes to the top."); } }; diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h index bed1b05984da..dc3c23a6198b 100644 --- a/nnvm/include/nnvm/top/tensor.h +++ b/nnvm/include/nnvm/top/tensor.h @@ -74,6 +74,21 @@ struct StridedSliceParam : public dmlc::Parameter { } }; +struct SliceAxisParam : public dmlc::Parameter { + int axis; + int begin; + int end; + + DMLC_DECLARE_PARAMETER(SliceAxisParam) { + DMLC_DECLARE_FIELD(axis) + .describe("Axis along which to be sliced."); + DMLC_DECLARE_FIELD(begin) + .describe("Index for begin of slice"); + DMLC_DECLARE_FIELD(end).set_default(0) + .describe("Index for end of the slice"); + } +}; + enum TypeFlag { kFloat32 = 0, kFloat64 = 1, diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 179e1126fd4d..29a6f65c6eb2 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -238,15 +238,15 @@ def _clip(inputs, attrs): def _contrib_multibox_detection(inputs, attrs): clip = _parse_bool_str(attrs, 'clip', default='True') - threshold = attrs.get('threshold') or 0.01 - nms_threshold = attrs.get('nms_threshold') or 0.5 + threshold = attrs.get('threshold', 0.01) + iou_threshold = attrs.get('nms_threshold', 0.5) force_suppress = _parse_bool_str(attrs, 'force_suppress', default='False') variances = tuple([float(x.strip()) for x in 
attrs.get('variances').strip('()').split(',')]) \ if attrs.get('variances') is not None else (0.1, 0.1, 0.2, 0.2) - nms_topk = attrs.get('nms_topk') or -1 + topk = attrs.get('nms_topk', -1) new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} - new_attrs1 = {'nms_threshold': float(nms_threshold), 'force_suppress': force_suppress, - 'nms_topk': int(nms_topk)} + new_attrs1 = {'iou_threshold': float(iou_threshold), 'force_suppress': force_suppress, + 'topk': int(topk)} data, valid_count = _get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], inputs[2], **new_attrs0) return _get_nnvm_op('nms')(data, valid_count, **new_attrs1) @@ -314,6 +314,47 @@ def _argmin(inputs, attrs): new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False") return _get_nnvm_op(op_name)(*inputs, **new_attrs) +def _contrib_box_nms(inputs, attrs): + force_suppress = _parse_bool_str(attrs, 'force_suppress', default="False") + overlap_thresh = attrs.get('overlap_thresh', 0.5) + topk = attrs.get('topk', -1) + valid_thresh = attrs.get('valid_thresh', 0) + coord_start = attrs.get('coord_start', 2) + score_index = attrs.get('score_index', 1) + id_index = attrs.get('id_index', -1) + in_format = attrs.get('in_format', 'corner') + out_format = attrs.get('out_format', 'corner') + if int(coord_start) != 2: + _raise_not_supported('coord_start: %s' % coord_start, 'box_nms') + if int(score_index) != 1: + _raise_not_supported('score_index: %s' % score_index, 'box_nms') + if int(id_index) != -1 and int(id_index) != 0: + _raise_not_supported('id_index: %s' % id_index, 'box_nms') + if in_format != 'corner': + _raise_not_supported('in_format: %s' % in_format, 'box_nms') + if out_format != 'corner': + _raise_not_supported('out_format: %s' % out_format, 'box_nms') + + valid_counts, inter_out = \ + _get_nnvm_op('get_valid_counts')(inputs[0], score_threshold=valid_thresh) + nms_out = _get_nnvm_op('nms')(inter_out, valid_counts, + iou_threshold=overlap_thresh, + force_suppress=force_suppress, + topk=topk, do_rearrange=True) + return nms_out + +def _slice_like(inputs, attrs): + op_name = 'slice_like' + axis = attrs.get('axes', ()) + return _get_nnvm_op(op_name)(inputs[0], inputs[1], axis=axis) + +def _slice_axis(inputs, attrs): + op_name, new_attrs = 'slice_axis', {} + new_attrs['axis'] = attrs.get('axis') + new_attrs['begin'] = attrs.get('begin') + new_attrs['end'] = 0 if attrs.get('end') == "None" else attrs.get('end') + return _get_nnvm_op(op_name)(inputs[0], **new_attrs) + _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__', '__div_symbol__', '__mul_scalar__', '__mul_symbol__', '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__', @@ -322,7 +363,7 @@ def _argmin(inputs, attrs): 'broadcast_sub', 'broadcast_to', 'cast', 'elemwise_add', 'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp', 'flatten', 'log', 'log_softmax', 'max', 'min', 'negative', - 'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax', + 'ones_like', 'relu', 'sigmoid', 'softmax', 'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd', 'reshape_like', 'where'] @@ -334,6 +375,13 @@ def _argmin(inputs, attrs): '_plus_scalar' : _rename('__add_scalar__'), '_rdiv_scalar' : _rename('__rdiv_scalar__'), '_rminus_scalar': _rename('__rsub_scalar__'), + '_equal_scalar' : _rename('__equal_scalar__'), + '_not_equal_scalar': _rename('__not_equal_scalar__'), + '_greater_scalar': _rename('__greater_scalar__'), + '_greater_equal_scalar': _rename('__greater_equal_scalar__'), + '_less_scalar': _rename('__less_scalar__'), 
+ '_less_equal_scalar': _rename('__less_equal_scalar__'), + '_contrib_box_nms' : _contrib_box_nms, '_contrib_MultiBoxPrior' : _rename('multibox_prior'), '_contrib_MultiBoxDetection' : _contrib_multibox_detection, '_minimum' : _minimum, @@ -360,6 +408,8 @@ def _argmin(inputs, attrs): 'Reshape' : _reshape, 'slice' : _slice, 'SliceChannel' : _split, + 'slice_axis' : _slice_axis, + 'slice_like' : _slice_like, 'split' : _split, 'Softmax' : _rename('softmax'), 'SoftmaxActivation' : _softmax_activation, diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py index 8fde9632a8af..d6c85ea283d3 100644 --- a/nnvm/python/nnvm/top/transform.py +++ b/nnvm/python/nnvm/top/transform.py @@ -83,6 +83,10 @@ def schedule_concatenate(_, outs, target): reg.register_pattern("slice_like", OpPattern.INJECTIVE) reg.register_schedule("slice_like", _fschedule_injective) +# slice_axis +reg.register_pattern("slice_axis", OpPattern.INJECTIVE) +reg.register_schedule("slice_axis", _fschedule_injective) + # where reg.register_pattern("where", OpPattern.INJECTIVE) reg.register_schedule("where", _fschedule_injective) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 1b20baab47c3..5df1bb34aa3a 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -60,6 +60,21 @@ def compute_multibox_transform_loc(attrs, inputs, _): reg.register_pattern("multibox_detection", OpPattern.OPAQUE) +# Get valid number of anchor boxes +@reg.register_schedule("get_valid_counts") +def schedule_get_valid_counts(_, outs, target): + """Schedule definition of get_valid_counts""" + with tvm.target.create(target): + return topi.generic.schedule_get_valid_counts(outs) + +@reg.register_compute("get_valid_counts") +def compute_get_valid_counts(attrs, inputs, _): + """Compute definition of get_valid_counts""" + score_threshold = attrs.get_float("score_threshold") + return topi.vision.get_valid_counts(inputs[0], score_threshold) + +reg.register_pattern("get_valid_counts", OpPattern.OPAQUE) + # non-maximum suppression @reg.register_schedule("nms") def schedule_nms(_, outs, target): @@ -70,11 +85,12 @@ def schedule_nms(_, outs, target): @reg.register_compute("nms") def compute_nms(attrs, inputs, _): """Compute definition of nms""" - nms_threshold = attrs.get_float('nms_threshold') + iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') - nms_topk = attrs.get_int('nms_topk') + topk = attrs.get_int('topk') + do_rearrange = attrs.get_bool('do_rearrange') - return topi.vision.nms(inputs[0], inputs[1], nms_threshold, - force_suppress, nms_topk) + return topi.vision.nms(inputs[0], inputs[1], iou_threshold, + force_suppress, topk, do_rearrange) reg.register_pattern("nms", OpPattern.OPAQUE) diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc index 2d9813e22131..9c1687beab35 100644 --- a/nnvm/src/top/tensor/elemwise.cc +++ b/nnvm/src/top/tensor/elemwise.cc @@ -806,6 +806,115 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rpow_scalar__) }; }); +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__equal_scalar__) +.describe(R"code(Tensor equal scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x == y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + 
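(Each of these scalar-comparison ops lowers to an element-wise compare whose boolean result is cast back to the output dtype; in NumPy terms, roughly:)

    def greater_scalar_ref(x, scalar):
        # e.g. float32 input -> float32 output of 0.0 / 1.0
        return (x > scalar).astype(x.dtype)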
+NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__not_equal_scalar__) +.describe(R"code(Tensor not equal scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x != y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_scalar__) +.describe(R"code(Tensor greater scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x > y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_equal_scalar__) +.describe(R"code(Tensor greater equal scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x >= y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_scalar__) +.describe(R"code(Tensor less scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x < y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_equal_scalar__) +.describe(R"code(Tensor less equal scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + Tensor out = topi::cast( + binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x <= y; }), + out_info[0]->dtype + ); + return Array{ out }; +}) +.set_attr("FGradient", MakeZeroGradNodes); + DMLC_REGISTER_PARAMETER(ElementWiseReduceParam); NNVM_REGISTER_ELEMWISE_REDUCE_OP(elemwise_sum) diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 9d259ae77d9b..4f09062ac607 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -1283,6 +1283,91 @@ NNVM_REGISTER_OP(slice_like) }) .set_support_level(4); +// SliceAxis +DMLC_REGISTER_PARAMETER(SliceAxisParam); + +inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const SliceAxisParam& param = nnvm::get(attrs.parsed); + const TShape& src_shape = in_attrs->at(0); + int axis = param.axis; + int begin = param.begin; + int end = param.end; + + if (axis < 0) { + axis += src_shape.ndim(); + } + if (begin < 0) { + begin += src_shape[axis]; + } + if (end <= 0) { + end += src_shape[axis]; + } + CHECK_LT(begin, end) + << "Begin index must be smaller than end index: " + << begin << " vs " << end; + + TShape out_shape(src_shape); + out_shape[axis] = end - begin; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, out_shape); + return true; +} + +NNVM_REGISTER_OP(slice_axis) +.describe(R"code(Slices along a given axis. 
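(A NumPy analogue of the index handling in SliceAxisShape above — negative begin counts from the end of the axis, and end <= 0 is interpreted as end + dim:)

    def slice_axis_ref(data, axis, begin, end):
        dim = data.shape[axis]
        begin = begin + dim if begin < 0 else begin
        end = end + dim if end <= 0 else end
        index = [slice(None)] * data.ndim
        index[axis] = slice(begin, end)
        return data[tuple(index)]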
+Returns an array slice along a given axis starting from +the begin index to the end index. +)code" NNVM_ADD_FILELINE) +.add_argument("data", "Tensor", "Input data to be sliced.") +.set_num_outputs(1) +.set_num_inputs(1) +.add_arguments(SliceAxisParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", SliceAxisShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", ElemwiseArbitraryLayout<1, 1>) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const SliceAxisParam& param = nnvm::get(attrs.parsed); + const Array src_shape = inputs[0]->shape; + Array begin_idx, end_idx, strides; + int axis = param.axis; + int begin = param.begin; + int end = param.end; + + if (axis < 0) { + axis += src_shape.size(); + } + if (begin < 0) { + begin += topi::GetConstInt(src_shape[axis]); + } + if (end <= 0) { + end += topi::GetConstInt(src_shape[axis]); + } + for (size_t i = 0; i < src_shape.size(); ++i) { + begin_idx.push_back(make_const(tvm::Int(32), 0)); + strides.push_back(make_const(tvm::Int(32), 1)); + } + end_idx = Array(src_shape); + begin_idx.Set(axis, make_const(tvm::Int(32), begin)); + end_idx.Set(axis, make_const(tvm::Int(32), end)); + + return Array{ + topi::strided_slice(inputs[0], + GetIntArray(begin_idx), + GetIntArray(end_idx), + GetIntArray(strides)) + }; +}) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_support_level(4); + // where inline bool WhereShape(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc index 2680b894255b..a74a135175ba 100644 --- a/nnvm/src/top/vision/nms.cc +++ b/nnvm/src/top/vision/nms.cc @@ -19,6 +19,59 @@ using compiler::FTVMCompute; using tvm::Tensor; using tvm::Array; +DMLC_REGISTER_PARAMETER(GetValidCountsParam); + +bool GetValidCountsShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + TShape dshape = in_attrs->at(0); + TShape vshape = TShape({dshape[0]}); + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3-D."; + out_attrs->clear(); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, vshape); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 1, dshape); + return true; +} + +inline bool GetValidCountsInferType(const NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + DTYPE_ASSIGN(out_attrs->at(0), static_cast(kInt32)); + DTYPE_ASSIGN(out_attrs->at(1), in_attrs->at(0)) + return true; +} + +inline bool GetValidCountsInferLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + static const Layout kNCHW("NCHW"); + CHECK_EQ(ilayouts->size(), 1U); + CHECK_EQ(olayouts->size(), 2U); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, kNCHW); + return true; +} + +NNVM_REGISTER_OP(get_valid_counts) +.describe(R"doc("Get valid count of bounding boxes given +a score threshold. Also moves valid boxes to the top of +input data." 
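(From the Python frontend this op yields the count tensor plus the rearranged data, as the nnvm test below exercises; an illustrative call:)

    import nnvm.symbol as sym

    data = sym.Variable("data")
    valid_counts, inter_data = sym.get_valid_counts(data, score_threshold=0.5)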
+)doc" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", + ParamGetAttrDict) +.add_arguments(GetValidCountsParam::__FIELDS__()) +.add_argument("data", "Tensor", "Input data.") +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FInferShape", GetValidCountsShape) +.set_attr("FInferType", GetValidCountsInferType) +.set_attr("FCorrectLayout", GetValidCountsInferLayout) +.set_support_level(4); + DMLC_REGISTER_PARAMETER(NMSParam); bool NMSShape(const NodeAttrs& attrs, diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index fc4e62fb7156..aab81565c3ff 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -573,15 +573,56 @@ def test_multibox_transform_loc(): out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) +def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score >= score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1 + + target = "llvm" + ctx = tvm.cpu() + data = sym.Variable("data", dtype=dtype) + valid_counts, inter_data = sym.get_valid_counts(data, score_threshold=score_threshold) + out = sym.Group([valid_counts, inter_data]) + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np_data) + m.run() + out1 = m.get_output(0, tvm.nd.empty(np_out1.shape, "int32")) + out2 = m.get_output(1, tvm.nd.empty(dshape, dtype)) + tvm.testing.assert_allclose(out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(out2.asnumpy(), np_out2, rtol=1e-3) + + +def test_get_valid_counts(): + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + def test_nms(): dshape = (1, 5, 6) data = sym.Variable("data") valid_count = sym.Variable("valid_count", dtype="int32") - nms_threshold = 0.7 + iou_threshold = 0.7 force_suppress = True - nms_topk = 2 - out = sym.nms(data=data, valid_count=valid_count, nms_threshold=nms_threshold, - force_suppress=force_suppress, nms_topk=nms_topk) + topk = 2 + out = sym.nms(data=data, valid_count=valid_count, iou_threshold=iou_threshold, + force_suppress=force_suppress, topk=topk) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -656,6 +697,35 @@ def test_slice_like(): axis = (2, 3) verify_slice_like(np_data, np_shape_like, axis) +def verify_slice_axis(dshape, axis, begin, end): + data = sym.Variable("data") + net = sym.slice_axis(data, axis=axis, begin=begin, end=end) + if axis < 0: + axis += len(dshape) + if begin < 0: + begin += dshape[axis] + if end <= 0: + end += dshape[axis] + np_data = np.random.uniform(size=dshape) + slc = [slice(None)] * len(dshape) + slc[axis] = slice(begin, end) 
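+    # e.g. dshape=(20, 30, 40), axis=1, begin=5, end=0 normalizes to
+    # slc = [slice(None), slice(5, 30), slice(None)]; end <= 0 counts from
+    # the end of the axis, with 0 meaning "up to the end".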
+ np_out = np_data[slc] + + dtype = "float32" + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, {"data": dshape}, dtype=dtype) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np_data) + m.run() + out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) + +def test_slice_axis(): + verify_slice_axis((1, 2, 3, 4), 3, 0, 2) + verify_slice_axis((100, 50), -1, 1, -1) + verify_slice_axis((20,), -1, -9, -3) + verify_slice_axis((20, 30, 40), 1, 5, 0) + def verify_where(condition, x, y): dtype = "float32" if len(condition.shape) == 1: @@ -710,6 +780,7 @@ def test_argmax(): np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5) if __name__ == "__main__": + test_get_valid_counts() test_reshape() test_broadcast() test_reduce() @@ -726,8 +797,10 @@ def test_argmax(): test_flip() test_multibox_prior() test_multibox_transform_loc() + test_get_valid_counts() test_nms() test_slice_like() + test_slice_axis() test_where() test_argmax() print(nnvm.compiler.engine.dump()) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 0b113733f8da..f937c8779e2f 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -74,14 +74,15 @@ def hybrid_get_valid_counts(data, score_threshold): data.dtype) for i in parallel(batch_size): valid_count[i] = 0 - inter_idx = 0 for j in range(num_anchors): score = data[i, j, 1] if score >= score_threshold: for k in range(box_data_length): - out_tensor[i, inter_idx, k] = data[i, j, k] + out_tensor[i, valid_count[i], k] = data[i, j, k] valid_count[i] += 1 - inter_idx += 1 + if j >= valid_count[i]: + for k in range(box_data_length): + out_tensor[i, j, k] = -1.0 return valid_count, out_tensor @tvm.target.generic_func @@ -168,9 +169,7 @@ def hybrid_nms(data, sorted_index, valid_count, for k in range(valid_count[i]): check_iou = 0 if k > j and output[i, k, 0] >= 0: - if force_suppress: - check_iou = 1 - elif output[i, j, 0] == output[i, k, 0]: + if force_suppress or output[i, j, 0] == output[i, k, 0]: check_iou = 1 if check_iou > 0: batch_idx = i diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index d77520c60cf8..517d1f7ee80b 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -16,7 +16,7 @@ def verify_get_valid_counts(dshape, score_threshold): batch_size, num_anchor, elem_length = dshape np_data = np.random.uniform(size=dshape).astype(dtype) np_out1 = np.zeros(shape=(batch_size,)) - np_out2 = np.zeros(shape=dshape).astype("float32") + np_out2 = np.zeros(shape=dshape).astype(dtype) for i in range(batch_size): np_out1[i] = 0 inter_idx = 0 @@ -27,6 +27,9 @@ def verify_get_valid_counts(dshape, score_threshold): np_out2[i, inter_idx, k] = np_data[i, j, k] np_out1[i] += 1 inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1.0 def check_device(device): ctx = tvm.context(device, 0) @@ -54,7 +57,7 @@ def check_device(device): def test_get_valid_counts(): verify_get_valid_counts((1, 2500, 6), 0) verify_get_valid_counts((1, 2500, 6), -1) - verify_get_valid_counts((3, 1000, 6), 0.15) + verify_get_valid_counts((3, 1000, 6), 0.55) verify_get_valid_counts((16, 500, 6), 0.95) >>>>>>> Add test for get_valid_counts From 037ca23dc63bcbfdc66b53719ebffe7cd8a19f6c Mon Sep 17 00:00:00 2001 From: Wang Date: Sat, 15 Dec 2018 19:36:37 -0800 Subject: [PATCH 07/43] Add id_index to box_nms op --- 
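For illustration, a minimal sketch of how the new id_index argument flows
through the NNVM symbol interface this series defines (the data and
valid_count variables are placeholders, not part of the patch):

    import nnvm.symbol as sym

    data = sym.Variable("data")                      # [batch, num_anchors, 6]
    valid_count = sym.Variable("valid_count", dtype="int32")
    out = sym.nms(data=data, valid_count=valid_count,
                  iou_threshold=0.5, force_suppress=False, topk=-1,
                  id_index=0, do_rearrange=True)
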
nnvm/include/nnvm/top/nn.h | 3 +++ nnvm/python/nnvm/frontend/mxnet.py | 3 ++- nnvm/python/nnvm/top/vision.py | 4 +++- topi/python/topi/vision/nms.py | 20 +++++++++++++++----- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 543820e724bd..1513be122b41 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -455,6 +455,7 @@ struct NMSParam : public dmlc::Parameter { float iou_threshold; bool force_suppress; int topk; + int id_index; bool do_rearrange; DMLC_DECLARE_PARAMETER(NMSParam) { DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) @@ -463,6 +464,8 @@ struct NMSParam : public dmlc::Parameter { .describe("Suppress all detections regardless of class_id."); DMLC_DECLARE_FIELD(topk).set_default(-1) .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Keep maximum top k detections before nms, -1 for no limit."); DMLC_DECLARE_FIELD(do_rearrange).set_default(false) .describe("Whether to move all valid bounding boxes to the top."); } diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 29a6f65c6eb2..8467beb61e4f 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -340,7 +340,8 @@ def _contrib_box_nms(inputs, attrs): nms_out = _get_nnvm_op('nms')(inter_out, valid_counts, iou_threshold=overlap_thresh, force_suppress=force_suppress, - topk=topk, do_rearrange=True) + topk=topk, id_index=id_index, + do_rearrange=True) return nms_out def _slice_like(inputs, attrs): diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 5df1bb34aa3a..f5f41d33e363 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -88,9 +88,11 @@ def compute_nms(attrs, inputs, _): iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') topk = attrs.get_int('topk') + id_index = attrs.get_int('id_index') do_rearrange = attrs.get_bool('do_rearrange') return topi.vision.nms(inputs[0], inputs[1], iou_threshold, - force_suppress, topk, do_rearrange) + force_suppress, topk, id_index, + do_rearrange) reg.register_pattern("nms", OpPattern.OPAQUE) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index f937c8779e2f..43de07c3dc76 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -76,7 +76,7 @@ def hybrid_get_valid_counts(data, score_threshold): valid_count[i] = 0 for j in range(num_anchors): score = data[i, j, 1] - if score >= score_threshold: + if score > score_threshold: for k in range(box_data_length): out_tensor[i, valid_count[i], k] = data[i, j, k] valid_count[i] += 1 @@ -112,7 +112,8 @@ def get_valid_counts(data, score_threshold=0): @hybrid.script def hybrid_nms(data, sorted_index, valid_count, - iou_threshold, force_suppress, topk): + iou_threshold, force_suppress, + topk, id_index): """Hybrid routing for non-maximum suppression. Parameters @@ -137,6 +138,9 @@ def hybrid_nms(data, sorted_index, valid_count, topk : tvm.const Keep maximum top k detections before nms, -1 for no limit. + id_index : tvm.const + index of the class categories, -1 to disable. 
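+        With the [id, score, box coordinates] layout assumed here, the
+        class id sits at index 0, which is also the default.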
+ Returns ------- valid_count : tvm.Tensor or numpy NDArray @@ -169,7 +173,9 @@ def hybrid_nms(data, sorted_index, valid_count, for k in range(valid_count[i]): check_iou = 0 if k > j and output[i, k, 0] >= 0: - if force_suppress or output[i, j, 0] == output[i, k, 0]: + if force_suppress: + check_iou = 1 + elif id_index < 0 or output[i, j, 0] == output[i, k, 0]: check_iou = 1 if check_iou > 0: batch_idx = i @@ -204,7 +210,7 @@ def hybrid_nms(data, sorted_index, valid_count, @tvm.target.generic_func def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, - topk=-1, do_rearrange=False): + topk=-1, id_index=0, do_rearrange=False): """Non-maximum suppression operator for object detection. Parameters @@ -226,6 +232,9 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, topk : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + do_rearrange : optional, boolean Whether to move all valid bounding boxes to the top. @@ -282,7 +291,8 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, out = hybrid_nms(data, sort_tensor, valid_count, tvm.const(iou_threshold, dtype="float32"), tvm.const(force_suppress, dtype="bool"), - tvm.const(topk, dtype="int32")) + tvm.const(topk, dtype="int32"), + tvm.const(id_index, dtype="int32")) if do_rearrange: out = hybrid_rearrange_out(out) From 8f9e9e2c6b52cdf6c2efdbce846d6af2dab7c1b0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 16 Dec 2018 07:00:48 +0000 Subject: [PATCH 08/43] Add l2_normalize to from_mxnet --- nnvm/python/nnvm/frontend/mxnet.py | 10 ++++++++++ topi/include/topi/nn/l2_normalize.h | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 8467beb61e4f..dffc8d960c88 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -356,6 +356,15 @@ def _slice_axis(inputs, attrs): new_attrs['end'] = 0 if attrs.get('end') == "None" else attrs.get('end') return _get_nnvm_op(op_name)(inputs[0], **new_attrs) +def _l2_normalize(inputs, attrs): + op_name, new_attrs = 'l2_normalize', {} + mode = attrs.get('mode', 'instance') + if mode != 'channel': + _raise_not_supported('mode: %s' % mode, 'L2Normalization') + new_attrs['eps'] = attrs.get('eps', 1e-10) + new_attrs['axis'] = 1 + return _get_nnvm_op(op_name)(inputs[0], **new_attrs) + _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__', '__div_symbol__', '__mul_scalar__', '__mul_symbol__', '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__', @@ -404,6 +413,7 @@ def _slice_axis(inputs, attrs): 'Flatten' : _rename('flatten'), 'FullyConnected': _dense, 'LeakyReLU' : _leaky_relu, + 'L2Normalization' : _l2_normalize, 'Pooling' : _pooling, 'Pooling_v1' : _pooling, 'Reshape' : _reshape, diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index a9fd49cbee64..e022d76871a0 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -30,7 +30,12 @@ inline Tensor l2_normalize(const Tensor& data, const Array& axis, std::string name = "tensor", std::string tag = "l2_normalize") { - CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input"; + for (size_t i = 0; i < axis.size(); ++i) { + int ax = topi::detail::GetConstInt(axis[i]); + CHECK_LT(ax, data->shape.size()) << + "Axis " << ax << " exceeds input data dim " << + data->shape.size(); + } auto input_shape = 
data->shape; Tensor dot_value = topi::power(data, static_cast(2.0)); Tensor sum_value = topi::sum(dot_value, axis, true); From 5c5e6f791190c5e8a7109ae8393ba7a7bdc104bb Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 18 Dec 2018 17:05:42 -0800 Subject: [PATCH 09/43] Modify SSD tutorial --- topi/python/topi/vision/nms.py | 8 +- topi/python/topi/vision/ssd/multibox.py | 226 +++++++++++++----------- tutorials/nnvm/deploy_ssd.py | 2 +- 3 files changed, 123 insertions(+), 113 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 43de07c3dc76..1dddffc0a2f4 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -21,13 +21,13 @@ def hybrid_rearrange_out(data): Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. """ - output = output_tensor((data.shape[0], - data.shape[1], - data.shape[2],), - data.dtype) batch_size = data.shape[0] num_anchors = data.shape[1] elem_length = data.shape[2] + output = output_tensor((batch_size, + num_anchors, + elem_length), + data.dtype) for i in parallel(batch_size): valid_idx = 0 diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index f1de42430dd6..87a4a84c5ab5 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -1,75 +1,70 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable """SSD multibox operators""" from __future__ import absolute_import as _abs -import math import tvm -from tvm import api +from tvm import hybrid +from tvm.intrin import exp, sqrt import topi from ..nms import nms -def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): - """Low level IR routing for multibox_prior operator. +@hybrid.script +def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): + """Hybrid routing for multibox_prior operator. Parameters ---------- - data : Buffer - Input data buffer. + data : tvm.Tensor or numpy NDArray + 4-D tensor with shape [batch, channel, height, width]] - out : Buffer - Output buffer. + sizes : tvm.ndarray + 1-D tensor of sizes for anchor boxes. - sizes : tuple of float - Tuple of sizes for anchor boxes. - - ratios : tuple of float - Tuple of ratios for anchor boxes. + ratios : tvm.ndarray + 1-D tensor of ratios for anchor boxes. - steps : Tuple of float - Priorbox step across y and x, -1 for auto calculation. + steps : tvm.ndarray + 1-D tensor of priorbox step across y and x, -1 for auto calculation. - offsets : tuple of int - Priorbox center offsets, y and x respectively. + offsets : tvm.ndarray + 1-D tensor priorbox center offsets, y and x respectively. Returns ------- - stmt : Stmt - The result IR statement. 
+ output : tvm.Tensor or numpy NDArray + 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - ib = tvm.ir_builder.create() - p_out = ib.buffer_ptr(out) - in_height = data.shape[2] - in_width = data.shape[3] - num_sizes = len(sizes) - num_ratios = len(ratios) - size_ratio_concat = sizes + ratios + in_height, in_width = data.shape[2], data.shape[3] + num_sizes, num_ratios = sizes.shape[0], ratios.shape[0] + num_boxes = in_height * in_width * (num_sizes + num_ratios - 1) + output = output_tensor((1, num_boxes, 4), data.dtype) steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] - with ib.for_range(0, in_height, for_type="parallel", name="i") as i: + for i in parallel(in_height): center_h = (i + offset_h) * steps_h - with ib.for_range(0, in_width, name="j") as j: + for j in range(in_width): center_w = (j + offset_w) * steps_w for k in range(num_sizes + num_ratios - 1): - w = tvm.if_then_else(k < num_sizes, - size_ratio_concat[k] * in_height / in_width / 2.0, - size_ratio_concat[0] * in_height / in_width * - math.sqrt(size_ratio_concat[k + 1]) / 2.0) - h = tvm.if_then_else( - k < num_sizes, size_ratio_concat[k] / 2.0, - size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) - count = (i * in_width * (num_sizes + num_ratios - 1) + - j * (num_sizes + num_ratios - 1) + k) * 4 - p_out[count] = center_w - w - p_out[count + 1] = center_h - h - p_out[count + 2] = center_w + w - p_out[count + 3] = center_h + h - - return ib.get() + if k < num_sizes: + w = sizes[k] * in_height / in_width / 2.0 + h = sizes[k] / 2.0 + else: + w = sizes[0] * in_height / in_width \ + * sqrt(ratios[k - num_sizes + 1]) / 2.0 + h = sizes[0] * sqrt(ratios[k - num_sizes + 1]) / 2.0 + count = i * in_width * (num_sizes + num_ratios - 1) \ + + j * (num_sizes + num_ratios - 1) + k + output[0, count, 0] = center_w - w + output[0, count, 1] = center_h - h + output[0, count, 2] = center_w + w + output[0, count, 3] = center_h + h + + return output @tvm.target.generic_func @@ -101,48 +96,62 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, out : tvm.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - num_sizes = len(sizes) - num_ratios = len(ratios) - oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) - out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), - tag="multibox_prior") + out = hybrid_multibox_prior(data, sizes, ratios, steps, offsets) if clip: out = topi.clip(out, 0, 1) return out - -def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): - """Low level IR routing for transform location in multibox_detection operator. +@hybrid.script +def _hybridy_transform_loc(box, pred_loc, variance, clip): + """Transform prior anchor box to output box through location predictions. 
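+
+    The decoding below takes the anchor corners (al, at, ar, ab), forms the
+    anchor center (ax, ay) and size (aw, ah), and combines them with the
+    predicted offsets (px, py, pw, ph) and variances (vx, vy, vw, vh):
+        ox = px * vx * aw + ax,      oy = py * vy * ah + ay
+        ow = exp(pw * vw) * aw / 2,  oh = exp(ph * vh) * ah / 2
+    (ox, oy) is the output center and (ow, oh) the half width/height; the
+    resulting corners are optionally clipped to [0, 1].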
+ """ + al, at, ar, ab = box[0], box[1], box[2], box[3] + px, py, pw, ph = pred_loc[0], pred_loc[1], \ + pred_loc[2], pred_loc[3] + vx, vy, vw, vh = variance[0], variance[1], \ + variance[2], variance[3] + aw = ar - al + ah = ab - at + ax = (al + ar) / 2.0 + ay = (at + ab) / 2.0 + ox = px * vx * aw + ax + oy = py * vy * ah + ay + ow = exp(pw * vw) * aw / 2.0 + oh = exp(ph * vh) * ah / 2.0 + out_l = max(0, min(1, ox - ow)) if clip else ox - ow + out_t = max(0, min(1, oy - oh)) if clip else oy - oh + out_r = max(0, min(1, ox + ow)) if clip else ox + ow + out_b = max(0, min(1, oy + oh)) if clip else oy + oh + return out_l, out_t, out_r, out_b + +@hybrid.script +def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + clip, threshold, variances): + """Hybrid routing for transform location in multibox_detection operator. Parameters ---------- - cls_prob : Buffer - Buffer of class probabilities. - - loc_pred : Buffer - Buffer of location regression predictions. + cls_prob : tvm.Tensor or numpy NDArray + 3-D tensor of class probabilities. - anchor : Buffer - Buffer of prior anchor boxes. + loc_pred : tvm.Tensor or numpy NDArray + 3-D tensor of location regression predictions. - valid_count : Buffer - Buffer of number of valid output boxes. + anchor : tvm.Tensor or numpy NDArray + 3-D tensor of prior anchor boxes. - out : Buffer - Output buffer. - - clip : boolean + clip : tvm.const Whether to clip out-of-boundary boxes. - threshold : float + threshold : tvm.const Threshold to be a positive prediction. - variances : tuple of float + variances : tvm.ndarray Variances to be decoded from box regression output. Returns ------- +<<<<<<< HEAD stmt : Stmt The result IR statement. """ @@ -169,21 +178,26 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \ tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \ tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh) +======= + out_loc : tvm.Tensor or numpy NDArray + 3-D tensor of transformed location. +>>>>>>> Modify SSD tutorial + valid_count : tvm.Tensor or numpy NDArray + 1_d tensor of valid counts for boxes. 
+ """ batch_size = cls_prob.shape[0] num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] + out_loc = output_tensor((batch_size, num_anchors, 6), + loc_pred.dtype) + valid_count = output_tensor((batch_size,), "int32") - ib = tvm.ir_builder.create() - p_cls_prob = ib.buffer_ptr(cls_prob) - p_loc_pred = ib.buffer_ptr(loc_pred) - p_anchor = ib.buffer_ptr(anchor) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - p_valid_count[n] = 0 - with ib.for_range(0, num_anchors, name="i") as i: + for i in parallel(batch_size): + valid_count[i] = 0 + for j in range(num_anchors): # Find the predicted class id and probability +<<<<<<< HEAD score = ib.allocate('float32', (1,), name="score", scope="local") cls_id = ib.allocate('int32', (1,), name="id", scope="local") score[0] = -1.0 @@ -195,21 +209,30 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, score[0] = tvm.max(temp, score[0]) with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): cls_id[0] = 0 +======= + score = -1.0 + cls_id = 0 + for k in range(num_classes): + if k > 0: + temp = cls_prob[i, k, j] + cls_id = j if temp > score else cls_id + score = max(temp, score) + if cls_id > 0 and score < threshold: + cls_id = 0 +>>>>>>> Modify SSD tutorial # [id, prob, xmin, ymin, xmax, ymax] # Remove background, restore original id - with ib.if_scope(cls_id[0] > 0): - out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 - p_out[out_base_idx] = cls_id[0] - 1.0 - p_out[out_base_idx + 1] = score[0] - offset = i * 4 - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, - p_anchor, offset, clip, variances[0], - variances[1], variances[2], variances[3]) - p_valid_count[n] += 1 - - return ib.get() - + if cls_id > 0: + out_loc[i, valid_count[i], 0] = cls_id - 1.0 + out_loc[i, valid_count[i], 1] = score + out_coord = _hybridy_transform_loc(anchor[j], loc_pred[i, j], + variances, clip) + out_loc[i, valid_count[i], 2] = out_coord[0] + out_loc[i, valid_count[i], 3] = out_coord[1] + out_loc[i, valid_count[i], 4] = out_coord[2] + out_loc[i, valid_count[i], 5] = out_coord[3] + + return out_loc, valid_count @tvm.target.generic_func def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, @@ -240,24 +263,11 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 ------- ret : tuple of tvm.Tensor """ - batch_size = cls_prob.shape[0] - num_anchors = anchor.shape[1] - oshape = (batch_size, num_anchors, 6) - # Define data alignment for intermediate buffer - valid_count_dtype = "int32" - valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, - "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) - valid_count, out = \ - tvm.extern([(batch_size,), oshape], - [cls_prob, loc_pred, anchor], - lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), - dtype=[valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, out_buf], - tag="multibox_transform_loc") - return [out, valid_count] - + out, valid_count = hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + tvm.const(clip, "bool"), + tvm.const(threshold, "float32"), + variances) + return out, valid_count @tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, 
clip=True, threshold=0.01, nms_threshold=0.5, diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py index eadb8fd28e0c..1a71c96eaa0c 100644 --- a/tutorials/nnvm/deploy_ssd.py +++ b/tutorials/nnvm/deploy_ssd.py @@ -61,7 +61,7 @@ image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \ "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg" inference_symbol_folder = \ -"c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" + "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \ "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip" From 3054af5402acb8f3a126a1444215c1aa304e91ce Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 19 Dec 2018 02:08:44 +0000 Subject: [PATCH 10/43] Fix tutorial --- tutorials/nnvm/deploy_ssd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py index 1a71c96eaa0c..f7e3b19f9767 100644 --- a/tutorials/nnvm/deploy_ssd.py +++ b/tutorials/nnvm/deploy_ssd.py @@ -165,4 +165,4 @@ def display(img, out, thresh=0.5): plt.show() image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) -display(image, tvm_output.asnumpy()[0], thresh=0.45) +display(image, tvm_output.asnumpy()[0], thresh=0.45) \ No newline at end of file From ef21b022fb4fef9c58ddb18e4e6ad9cb8c74b774 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 7 Jan 2019 17:41:39 -0800 Subject: [PATCH 11/43] Relay support --- include/tvm/relay/attrs/transform.h | 15 +++ include/tvm/relay/attrs/vision.h | 32 +++-- nnvm/include/nnvm/top/nn.h | 18 +-- nnvm/src/top/vision/nms.cc | 1 - nnvm/tests/python/compiler/test_top_level4.py | 10 +- .../python/frontend/mxnet/test_forward.py | 13 +- python/tvm/relay/frontend/mxnet.py | 43 +++++++ python/tvm/relay/op/_transform.py | 1 + python/tvm/relay/op/transform.py | 28 +++- python/tvm/relay/op/vision/__init__.py | 2 +- .../op/vision/{_multibox.py => _vision.py} | 25 +++- python/tvm/relay/op/vision/nms.py | 41 +++++- src/relay/op/tensor/transform.cc | 121 ++++++++++++++++-- src/relay/op/vision/multibox_op.cc | 6 +- src/relay/op/vision/nms.cc | 61 ++++++++- tests/python/relay/test_op_level4.py | 23 ++++ topi/python/topi/testing/__init__.py | 4 + topi/python/topi/testing/slice_axis_python.py | 34 +++++ topi/tests/python/test_topi_vision.py | 1 - 19 files changed, 425 insertions(+), 54 deletions(-) rename python/tvm/relay/op/vision/{_multibox.py => _vision.py} (72%) create mode 100644 topi/python/topi/testing/slice_axis_python.py diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index fea2c960d032..44b910aaf0bf 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -171,6 +171,21 @@ struct StridedSliceAttrs : public tvm::AttrsNode { } }; +struct SliceAxisAttrs : public tvm::AttrsNode { + int axis; + int begin; + int end; + + TVM_DECLARE_ATTRS(SliceAxisAttrs, "relay.attrs.SliceAxisAttrs") { + TVM_ATTR_FIELD(axis) + .describe("Axis along which to be sliced."); + TVM_ATTR_FIELD(begin) + .describe("Index for begin of slice"); + TVM_ATTR_FIELD(end).set_default(0) + .describe("Index for end of the slice"); + } +}; + struct SliceLikeAttrs : public tvm::AttrsNode { Array axes; diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index df059a6238e1..345a67655552 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -58,19 +58,35 @@ struct 
MultiBoxTransformLocAttrs } }; -/*! \brief Attributes used in non_maximum_suppression operators */ +/*! \brief Attributes used in get_valid_counts operator */ +struct GetValidCountsAttrs : public tvm::AttrsNode{ + double score_threshold; + + TVM_DECLARE_ATTRS(GetValidCountsAttrs, "relay.attrs.GetValidCountsAttrs") { + TVM_ATTR_FIELD(score_threshold).set_default(0.0) + .describe("Lower limit of score for valid bounding boxes."); + } +}; + +/*! \brief Attributes used in non_maximum_suppression operator */ struct NMSAttrs : public tvm::AttrsNode{ - double overlap_threshold; + double iou_threshold; bool force_suppress; int topk; + int id_index; + bool do_rearrange; TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { - TVM_ATTR_FIELD(overlap_threshold).set_default(0.5) - .describe("Non-maximum suppression threshold."); - TVM_ATTR_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); - TVM_ATTR_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + TVM_ATTR_FIELD(iou_threshold).set_default(0.5) + .describe("Non-maximum suppression threshold."); + TVM_ATTR_FIELD(force_suppress).set_default(false) + .describe("Suppress all detections regardless of class_id."); + TVM_ATTR_FIELD(topk).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + TVM_ATTR_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + TVM_ATTR_FIELD(do_rearrange).set_default(false) + .describe("Whether to move all valid bounding boxes to the top."); } }; diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 1513be122b41..82f3230b4931 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -402,9 +402,9 @@ struct LayoutTransformParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(LayoutTransformParam) { DMLC_DECLARE_FIELD(src_layout).set_default("__undef__") - .describe("Dimension ordering of data"); + .describe("Dimension ordering of data"); DMLC_DECLARE_FIELD(dst_layout).set_default("__undef__") - .describe("Dimension ordering of data."); + .describe("Dimension ordering of data."); } }; @@ -419,13 +419,13 @@ struct MultiBoxPriorParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(sizes).set_default(Tuple({1.0})) .describe("List of sizes of generated MultiBoxPriores."); DMLC_DECLARE_FIELD(ratios).set_default(Tuple({1.0})) - .describe("List of aspect ratios of generated MultiBoxPriores."); + .describe("List of aspect ratios of generated MultiBoxPriores."); DMLC_DECLARE_FIELD(steps).set_default(Tuple({-1.0, -1.0})) - .describe("Priorbox step across y and x, -1 for auto calculation."); + .describe("Priorbox step across y and x, -1 for auto calculation."); DMLC_DECLARE_FIELD(offsets).set_default(Tuple({0.5, 0.5})) - .describe("Priorbox center offsets, y and x respectively."); + .describe("Priorbox center offsets, y and x respectively."); DMLC_DECLARE_FIELD(clip).set_default(false) - .describe("Whether to clip out-of-boundary boxes."); + .describe("Whether to clip out-of-boundary boxes."); } }; @@ -461,11 +461,11 @@ struct NMSParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); + .describe("Suppress all detections regardless of class_id."); DMLC_DECLARE_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no 
limit."); - DMLC_DECLARE_FIELD(id_index).set_default(0) .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Axis index for id."); DMLC_DECLARE_FIELD(do_rearrange).set_default(false) .describe("Whether to move all valid bounding boxes to the top."); } diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc index a74a135175ba..71b4c3ff7860 100644 --- a/nnvm/src/top/vision/nms.cc +++ b/nnvm/src/top/vision/nms.cc @@ -11,7 +11,6 @@ #include #include #include "../op_common.h" -#include "../elemwise_op_common.h" namespace nnvm { namespace top { diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index aab81565c3ff..b84621128614 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -700,16 +700,8 @@ def test_slice_like(): def verify_slice_axis(dshape, axis, begin, end): data = sym.Variable("data") net = sym.slice_axis(data, axis=axis, begin=begin, end=end) - if axis < 0: - axis += len(dshape) - if begin < 0: - begin += dshape[axis] - if end <= 0: - end += dshape[axis] np_data = np.random.uniform(size=dshape) - slc = [slice(None)] * len(dshape) - slc[axis] = slice(begin, end) - np_out = np_data[slc] + np_out = topi.testing.slice_axis_python(np_data, axis, begin, end) dtype = "float32" for target, ctx in ctx_list(): diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index e046f39f02ca..c9d1c7795489 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -227,6 +227,7 @@ def test_forward_slice(): mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) +<<<<<<< HEAD def test_forward_maximum(): a = mx.sym.var('a') b = mx.sym.var('b') @@ -289,6 +290,15 @@ def test_forward_minimum(): tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) +def test_forward_slice_axis(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice_axis(data, axis=1, begin=-5) + verify_mxnet_frontend_impl(mx_sym, (1, 10, 6), (1, 5, 6)) + +def test_forward_l2_normalize(): + data = mx.sym.var('data') + mx_sym = mx.sym.L2Normalization(data, mode="channel") + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) if __name__ == '__main__': test_forward_mlp() @@ -315,4 +325,5 @@ def test_forward_minimum(): test_forward_slice() test_forward_maximum() test_forward_minimum() - + test_forward_slice_axis() + test_forward_l2_normalize() diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 2e0ccd07fdc1..d53a4f5f75a8 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -380,6 +380,47 @@ def _mx_proposal(inputs, attrs): return _op.vision.proposal(inputs[0], inputs[1], inputs[2], **new_attrs) +def _mx_box_nms(inputs, attrs): + force_suppress = attrs.get_bool("force_suppress", False) + overlap_thresh = attrs.get_float('overlap_thresh', 0.5) + topk = attrs.get_int('topk', -1) + valid_thresh = attrs.get_float('valid_thresh', 0) + coord_start = attrs.get_int('coord_start', 2) + score_index = attrs.get_int('score_index', 1) + id_index = attrs.get_int('id_index', -1) + in_format = attrs.get_str('in_format', 'corner') + out_format = attrs.get_str('out_format', 'corner') + if coord_start != 2: + raise 
RuntimeError('coord_start %s is not supported.' % coord_start) + if score_index != 1: + raise RuntimeError('score_index %s is not supported.' % score_index) + if id_index != -1 and int(id_index) != 0: + raise RuntimeError('id_index %s is not supported.' % id_index) + if in_format != 'corner': + raise RuntimeError('in_format %s is not supported.' % in_format) + if out_format != 'corner': + raise RuntimeError('out_format %s is not supported.' % out_format) + + valid_counts, inter_out = \ + _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + nms_out = _op.vision.nms(inter_out, valid_counts, + iou_threshold=overlap_thresh, + force_suppress=force_suppress, + topk=topk, id_index=id_index, + do_rearrange=True) + return nms_out + + +def _mx_l2_normalize(inputs, attrs): + new_attrs = {} + mode = attrs.get_str('mode', 'instance') + if mode != 'channel': + raise RuntimeError('mode %s is not supported.' % mode) + new_attrs['eps'] = attrs.get_float('eps', 1e-10) + new_attrs['axis'] = 1 + return _op.nn.l2_normalize(inputs[0], **new_attrs) + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -481,7 +522,9 @@ def _mx_proposal(inputs, attrs): "slice" : _mx_slice, "slice_like" : _mx_slice_like, "slice_axis" : _mx_slice_axis, + "L2Normalization" : _mx_l2_normalize,∂ "SliceChannel" : _mx_split, + "slice_axis" : _mx_slice_axis, "split" : _mx_split, "expand_dims" : _mx_expand_dims, "Concat" : _mx_concat, diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 1389f96b8325..83b5ce5a854f 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -21,6 +21,7 @@ _reg.register_schedule("arange", schedule_injective) _reg.register_schedule("cast", schedule_injective) _reg.register_schedule("strided_slice", schedule_injective) +_reg.register_schedule("slice_axis", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) _reg.register_schedule("split", schedule_injective) _reg.register_schedule("take", schedule_injective) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 845ee02b0582..f19aa19772b4 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -456,7 +456,7 @@ def strided_slice(data, begin, end, strides=None): The indices to begin with in the slicing. end: list of int - Indicies indicating end of the slice. + Indices indicating end of the slice. strides: list of int, optional Specifies the stride values, it can be negative in that case, @@ -471,6 +471,32 @@ def strided_slice(data, begin, end, strides=None): return _make.strided_slice(data, list(begin), list(end), list(strides)) +def slice_axis(data, axis, begin, end=None): + """Slice input array along specific axis. + + Parameters + ---------- + data : relay.Expr + The source array to be sliced. + + axis : int + Axis to be sliced. + + begin: int + The index to begin with in the slicing. + + end: int, optional + The index indicating end of the slice. + + Returns + ------- + ret : relay.Expr + The computed result. + """ + end = end or 0 + return _make.slice_axis(data, axis, begin, end) + + def slice_like(data, shape_like, axes=None): """Slice the first input with respect to the second input. 
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py index 10cf6c2fd3ee..0cee4e4faeec 100644 --- a/python/tvm/relay/op/vision/__init__.py +++ b/python/tvm/relay/op/vision/__init__.py @@ -6,6 +6,6 @@ from .nms import * from .rcnn import * from .yolo import * -from . import _multibox from . import _rcnn from . import _yolo +from . import _vision diff --git a/python/tvm/relay/op/vision/_multibox.py b/python/tvm/relay/op/vision/_vision.py similarity index 72% rename from python/tvm/relay/op/vision/_multibox.py rename to python/tvm/relay/op/vision/_vision.py index e9ef43f7e06f..2d15562995ec 100644 --- a/python/tvm/relay/op/vision/_multibox.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -54,6 +54,23 @@ def compute_multibox_transform_loc(attrs, inputs, _, target): reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE) +# Get counts of valid boxes +@reg.register_schedule("vision.get_valid_counts") +def schedule_get_valid_counts(_, outs, target): + """Schedule definition of get_valid_counts""" + with target: + return topi.generic.schedule_nms(outs) + + +@reg.register_compute("vision.get_valid_counts") +def compute_get_valid_counts(attrs, inputs, _, target): + """Compute definition of get_valid_counts""" + score_threshold = get_const_float(attrs.score_threshold) + return topi.vision.get_valid_counts(inputs[0], score_threshold) + +reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) + + # non-maximum suppression @reg.register_schedule("vision.nms") def schedule_nms(_, outs, target): @@ -65,12 +82,14 @@ def schedule_nms(_, outs, target): @reg.register_compute("vision.nms") def compute_nms(attrs, inputs, _, target): """Compute definition of nms""" - overlap_threshold = get_const_float(attrs.overlap_threshold) + iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) topk = get_const_int(attrs.topk) + id_index = get_const_int(attrs.id_index) + do_rearrange = bool(get_const_int(attrs.do_rearrange)) return [ - topi.vision.nms(inputs[0], inputs[1], overlap_threshold, - force_suppress, topk) + topi.vision.nms(inputs[0], inputs[1], iou_threshold, + force_suppress, topk, id_index, do_rearrange) ] diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 8035e3030b17..aecc111204b9 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -2,11 +2,37 @@ from __future__ import absolute_import as _abs from . import _make +def get_valid_counts(data, + score_threshold): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : relay.Expr + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + out_tensor : relay.Expr + Rearranged data tensor. + + valid_count : relay.Expr + 1-D tensor for valid number of boxes. + """ + return _make.get_valid_counts(data, score_threshold) + + def nms(data, valid_count, - overlap_threshold=0.5, + iou_threshold=0.5, force_suppress=False, - topk=-1): + topk=-1, + id_index=0, + do_rearrange=False): """Non-maximum suppression operator for object detection. Parameters @@ -19,7 +45,7 @@ def nms(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. - overlap_threshold : float, optional + iou_threshold : float, optional Non-maximum suppression threshold. 
force_suppress : bool, optional @@ -28,9 +54,16 @@ def nms(data, topk : int, optional Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + do_rearrange : optional, boolean + Whether to move all valid bounding boxes to the top. + Returns ------- out : relay.Expr 3-D tensor with shape [batch_size, num_anchors, 6]. """ - return _make.nms(data, valid_count, overlap_threshold, force_suppress, topk) + return _make.nms(data, valid_count, iou_threshold, + force_suppress, topk, id_index, do_rearrange) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index de3ac03977f4..dd0cbacb1e59 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1347,6 +1347,118 @@ RELAY_REGISTER_OP("broadcast_to_like") .set_attr("TOpPattern", kBroadcast); +// Adapter function to make int array. +Array GetIntArray(Array arr) { + for (size_t i = 0; i < arr.size(); ++i) { + CHECK(!arr[i].defined() || arr[i].as()) + << "Expect an int array"; + } + return Array(arr.node_); +} + +// slice_axis +TVM_REGISTER_NODE_TYPE(SliceAxisAttrs); + +bool SliceAxisRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + const SliceAxisAttrs *param = attrs.as(); + + auto src_shape = data->shape; + int axis = param->axis; + int begin = param->begin; + int end = param->end; + + if (axis < 0) { + axis += src_shape.size(); + } + if (begin < 0) { + begin += *as_const_int(src_shape[axis]); + } + if (end <= 0) { + end += *as_const_int(src_shape[axis]); + } + CHECK_LT(begin, end) + << "Begin index must be smaller than end index: " + << begin << " vs " << end; + + std::vector&& oshape = AsVector(data->shape); + oshape[axis] = IndexExpr(end - begin); + + // assign output type + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +Expr MakeSliceAxis(Expr data, + int axis, + int begin, + int end) { + auto attrs = make_node(); + attrs->axis = axis; + attrs->begin = begin; + attrs->end = end; + static const Op& op = Op::Get("slice_axis"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.slice_axis") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeSliceAxis, args, rv); +}); + +Array SliceAxisCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const SliceAxisAttrs *param = attrs.as(); + const Array src_shape = inputs[0]->shape; + Array begin_idx, end_idx, strides; + int axis = param->axis; + int begin = param->begin; + int end = param->end; + + if (axis < 0) { + axis += src_shape.size(); + } + if (begin < 0) { + begin += *as_const_int(src_shape[axis]); + } + if (end <= 0) { + end += *as_const_int(src_shape[axis]); + } + for (size_t i = 0; i < src_shape.size(); ++i) { + begin_idx.push_back(make_const(tvm::Int(32), 0)); + strides.push_back(make_const(tvm::Int(32), 1)); + } + end_idx = Array(src_shape); + begin_idx.Set(axis, make_const(tvm::Int(32), begin)); + end_idx.Set(axis, make_const(tvm::Int(32), end)); + + return Array{ + topi::strided_slice(inputs[0], + GetIntArray(begin_idx), + GetIntArray(end_idx), + GetIntArray(strides)) + }; +} + +RELAY_REGISTER_OP("relay.op._make.slice_axis") +.describe(R"doc(Slices along a given axis. +Returns an array slice along a given axis starting from +the begin index to the end index. 
+)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "Input data.") +.set_support_level(4) +.add_type_rel("SliceAxis", SliceAxisRel) +.set_attr("FTVMCompute", SliceAxisCompute) +.set_attr("TOpPattern", kInjective); + + // strided_slice TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); bool StridedSliceRel(const Array& types, @@ -1701,15 +1813,6 @@ Expr MakeSliceLike(Expr data, return CallNode::make(op, {data, shape_like}, Attrs(attrs), {}); } -// Adapter function to make int array. -Array GetIntArray(Array arr) { - for (size_t i = 0; i < arr.size(); ++i) { - CHECK(!arr[i].defined() || arr[i].as()) - << "Expect an int array"; - } - return Array(arr.node_); -} - Array SliceLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc index 55db8862e849..04f105c44744 100644 --- a/src/relay/op/vision/multibox_op.cc +++ b/src/relay/op/vision/multibox_op.cc @@ -70,8 +70,10 @@ RELAY_REGISTER_OP("vision.multibox_prior") TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs); -bool MultiBoxTransformLocRel(const Array& types, int num_inputs, - const Attrs& attrs, const TypeReporter& reporter) { +bool MultiBoxTransformLocRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 4); const auto* cls_prob = types[0].as(); diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 3e3f73bc6cb4..c284be7c3441 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -9,6 +9,53 @@ namespace tvm { namespace relay { +TVM_REGISTER_NODE_TYPE(GetValidCountsAttrs); + +bool GetValidCountRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + const auto& dshape = data->shape; + CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; + + std::vector oshape({data->shape[0]}); + std::vector fields; + fields.push_back(TensorTypeNode::make(data->shape, data->dtype)); + fields.push_back(TensorTypeNode::make(oshape, Int(32))); + + // assign output type + reporter->Assign(types[1], TupleTypeNode::make(Array(fields))); + return true; +} + +Expr MakeGetValidCounts(Expr data, + double score_threshold) { + auto attrs = make_node(); + attrs->score_threshold = score_threshold; + static const Op& op = Op::Get("vision.nms"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op.vision._make.get_valid_counts") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeGetValidCounts, args, rv); +}); + + +RELAY_REGISTER_OP("vision.get_valid_counts") +.describe(R"doc(Get valid count of bounding boxes given +a score threshold. Also moves valid boxes to the top of +input data. 
+)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "Input data.") +.set_support_level(5) +.add_type_rel("GetValidCount", GetValidCountRel); + + TVM_REGISTER_NODE_TYPE(NMSAttrs); bool NMSRel(const Array& types, @@ -31,13 +78,17 @@ bool NMSRel(const Array& types, Expr MakeNMS(Expr data, Expr valid_count, - double overlap_threshold, + double iou_threshold, bool force_suppress, - int topk) { + int topk, + int id_index, + bool do_rearrange) { auto attrs = make_node(); - attrs->overlap_threshold = overlap_threshold; + attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->topk = topk; + attrs->id_index = id_index; + attrs->do_rearrange = do_rearrange; static const Op& op = Op::Get("vision.nms"); return CallNode::make(op, {data, valid_count}, Attrs(attrs), {}); } @@ -45,12 +96,12 @@ Expr MakeNMS(Expr data, TVM_REGISTER_API("relay.op.vision._make.nms") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); + runtime::detail::unpack_call(MakeNMS, args, rv); }); RELAY_REGISTER_OP("vision.nms") -.describe(R"doc("Non-maximum suppression." +.describe(R"doc(Non-maximum suppression. )doc" TVM_ADD_FILELINE) .set_num_inputs(2) .add_argument("data", "Tensor", "Input data.") diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index ae7fe320940a..76ca67f56398 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -196,6 +196,29 @@ def _wrapper(data, axis=None, keepdims=False): verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1)) +def test_slice_axis(): + def verify(dshape, axis, begin, end): + x = relay.var("x", relay.TensorType(dshape, "float32")) + z = relay.slice_axis(x, axis=axis, begin=begin, end=end) + func = relay.Function([x], z) + func = relay.ir_pass.infer_type(func) + text = func.astext() + assert "begin=" in text + assert "end=" in text + x_data = np.random.uniform(size=dshape).astype("float32") + ref_res = topi.testing.slice_axis_python( + x_data, axis, begin, end) + for target, ctx in ctx_list(): + intrp = relay.create_executor("graph", ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + + verify((1, 2, 3, 4), 3, 0, 2) + verify((100, 50), -1, 1, -1) + verify((20,), -1, -9, -3) + verify((20, 30, 40), 1, 5, 0) + + def test_strided_slice(): def verify(dshape, begin, end, strides, output, test_ref=True): x = relay.var("x", relay.TensorType(dshape, "float32")) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 0ccc422010c1..90b8e8e0e58c 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -19,4 +19,8 @@ from .l2_normalize_python import l2_normalize_python from .gather_nd_python import gather_nd_python from .strided_slice_python import strided_slice_python +<<<<<<< HEAD from .batch_matmul import batch_matmul +======= +from .slice_axis_python import slice_axis_python +>>>>>>> Relay support diff --git a/topi/python/topi/testing/slice_axis_python.py b/topi/python/topi/testing/slice_axis_python.py new file mode 100644 index 000000000000..2db646c9e3a8 --- /dev/null +++ b/topi/python/topi/testing/slice_axis_python.py @@ -0,0 +1,34 @@ +"""Slice axis in python""" + +def slice_axis_python(data, axis, begin, end=None): + """Slice input array along specific axis. + + Parameters + ---------- + data : numpy.ndarray + The source array to be sliced. 
+ + axis : int + Axis to be sliced. + + begin: int + The index to begin with in the slicing. + + end: int, optional + The index indicating end of the slice. + + Returns + ------- + ret : numpy.ndarray + The computed result. + """ + dshape = data.shape + if axis < 0: + axis += len(dshape) + if begin < 0: + begin += dshape[axis] + if end <= 0: + end += dshape[axis] + slc = [slice(None)] * len(dshape) + slc[axis] = slice(begin, end) + return data[slc] diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 517d1f7ee80b..9c14c2c95051 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -59,7 +59,6 @@ def test_get_valid_counts(): verify_get_valid_counts((1, 2500, 6), -1) verify_get_valid_counts((3, 1000, 6), 0.55) verify_get_valid_counts((16, 500, 6), 0.95) ->>>>>>> Add test for get_valid_counts def test_nms(): From 31f09279e2c1c1086caa9fad6f5f490ed67ef25a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 8 Jan 2019 22:30:17 +0000 Subject: [PATCH 12/43] Add more ops to from_mxnet --- nnvm/tests/python/frontend/mxnet/test_forward.py | 2 +- python/tvm/relay/frontend/mxnet.py | 9 ++++----- python/tvm/relay/op/transform.py | 3 +-- python/tvm/relay/op/vision/nms.py | 9 +++++---- src/relay/op/vision/nms.cc | 4 ++-- tests/python/relay/test_op_level4.py | 1 + tests/python/relay/test_op_level5.py | 2 +- topi/python/topi/vision/nms.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index c9d1c7795489..67f1ad5ff27d 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -292,7 +292,7 @@ def test_forward_minimum(): def test_forward_slice_axis(): data = mx.sym.var('data') - mx_sym = mx.sym.slice_axis(data, axis=1, begin=-5) + mx_sym = mx.sym.slice_axis(data, axis=1, begin=-5, end=None) verify_mxnet_frontend_impl(mx_sym, (1, 10, 6), (1, 5, 6)) def test_forward_l2_normalize(): diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index d53a4f5f75a8..50e77e08809d 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -401,9 +401,8 @@ def _mx_box_nms(inputs, attrs): if out_format != 'corner': raise RuntimeError('out_format %s is not supported.' % out_format) - valid_counts, inter_out = \ - _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) - nms_out = _op.vision.nms(inter_out, valid_counts, + ret =_op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + nms_out = _op.vision.nms(ret[1], ret[0], iou_threshold=overlap_thresh, force_suppress=force_suppress, topk=topk, id_index=id_index, @@ -522,9 +521,8 @@ def _mx_l2_normalize(inputs, attrs): "slice" : _mx_slice, "slice_like" : _mx_slice_like, "slice_axis" : _mx_slice_axis, - "L2Normalization" : _mx_l2_normalize,∂ + "L2Normalization" : _mx_l2_normalize, "SliceChannel" : _mx_split, - "slice_axis" : _mx_slice_axis, "split" : _mx_split, "expand_dims" : _mx_expand_dims, "Concat" : _mx_concat, @@ -541,6 +539,7 @@ def _mx_l2_normalize(inputs, attrs): "_contrib_ROIAlign" : _mx_roi_align, "_contrib_Proposal" : _mx_proposal, "_contrib_MultiProposal" : _mx_proposal, + "_contrib_box_nms" : _mx_box_nms, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. 
# diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index f19aa19772b4..9a6f308d341a 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -471,7 +471,7 @@ def strided_slice(data, begin, end, strides=None): return _make.strided_slice(data, list(begin), list(end), list(strides)) -def slice_axis(data, axis, begin, end=None): +def slice_axis(data, axis, begin, end=0): """Slice input array along specific axis. Parameters @@ -493,7 +493,6 @@ def slice_axis(data, axis, begin, end=None): ret : relay.Expr The computed result. """ - end = end or 0 return _make.slice_axis(data, axis, begin, end) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index aecc111204b9..157008ec2174 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -1,6 +1,7 @@ """Non-maximum suppression operations.""" from __future__ import absolute_import as _abs from . import _make +from ...expr import TupleWrapper def get_valid_counts(data, score_threshold): @@ -17,13 +18,13 @@ def get_valid_counts(data, Returns ------- - out_tensor : relay.Expr - Rearranged data tensor. - valid_count : relay.Expr 1-D tensor for valid number of boxes. + + out_tensor : relay.Expr + Rearranged data tensor. """ - return _make.get_valid_counts(data, score_threshold) + return TupleWrapper(_make.get_valid_counts(data, score_threshold), 2) def nms(data, diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index c284be7c3441..e8a84734b3d7 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -22,8 +22,8 @@ bool GetValidCountRel(const Array& types, std::vector oshape({data->shape[0]}); std::vector fields; - fields.push_back(TensorTypeNode::make(data->shape, data->dtype)); fields.push_back(TensorTypeNode::make(oshape, Int(32))); + fields.push_back(TensorTypeNode::make(data->shape, data->dtype)); // assign output type reporter->Assign(types[1], TupleTypeNode::make(Array(fields))); @@ -34,7 +34,7 @@ Expr MakeGetValidCounts(Expr data, double score_threshold) { auto attrs = make_node(); attrs->score_threshold = score_threshold; - static const Op& op = Op::Get("vision.nms"); + static const Op& op = Op::Get("vision.get_valid_counts"); return CallNode::make(op, {data}, Attrs(attrs), {}); } diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 76ca67f56398..dcb5f985fcb2 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -260,3 +260,4 @@ def verify(dshape, begin, end, strides, output, test_ref=True): test_binary_int_broadcast() test_where() test_reduce_functions() + test_slice_axis() diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 003318f01a2f..c6cbfd921257 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -142,7 +142,7 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count, x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk) - assert "overlap_threshold" in z.astext() + assert "iou_threshold" in z.astext() zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 1dddffc0a2f4..66a4b0df9c30 100644 --- a/topi/python/topi/vision/nms.py +++ 
b/topi/python/topi/vision/nms.py @@ -166,7 +166,7 @@ def hybrid_nms(data, sorted_index, valid_count, if 0 < topk < valid_count[i]: for j in range(valid_count[i] - nkeep): for k in range(box_data_length): - output[i, j + nkeep, k] = data[i, j + nkeep, k] + output[i, j + nkeep, k] = -1.0 # Apply nms for j in range(valid_count[i]): if output[i, j, 0] >= 0: From fb43612cb1fe6cbc7eddf55302b1b02c3449ccec Mon Sep 17 00:00:00 2001 From: Wang Date: Thu, 10 Jan 2019 18:26:58 -0800 Subject: [PATCH 13/43] Support multibox op with hybrid script --- include/tvm/relay/attrs/transform.h | 8 +-- src/relay/op/tensor/transform.cc | 28 ++++----- topi/python/topi/cuda/nms.py | 35 ++++++----- topi/python/topi/vision/ssd/multibox.py | 79 ++++++++++++++++--------- topi/tests/python/test_topi_vision.py | 2 +- 5 files changed, 89 insertions(+), 63 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 44b910aaf0bf..d179ae46fade 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -172,16 +172,16 @@ struct StridedSliceAttrs : public tvm::AttrsNode { }; struct SliceAxisAttrs : public tvm::AttrsNode { - int axis; - int begin; - int end; + Integer axis; + Integer begin; + Integer end; TVM_DECLARE_ATTRS(SliceAxisAttrs, "relay.attrs.SliceAxisAttrs") { TVM_ATTR_FIELD(axis) .describe("Axis along which to be sliced."); TVM_ATTR_FIELD(begin) .describe("Index for begin of slice"); - TVM_ATTR_FIELD(end).set_default(0) + TVM_ATTR_FIELD(end) .describe("Index for end of the slice"); } }; diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index dd0cbacb1e59..abdd41e306ab 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1368,9 +1368,9 @@ bool SliceAxisRel(const Array& types, const SliceAxisAttrs *param = attrs.as(); auto src_shape = data->shape; - int axis = param->axis; - int begin = param->begin; - int end = param->end; + int64_t axis = param->axis; + int64_t begin = param->begin; + int64_t end = param->end; if (axis < 0) { axis += src_shape.size(); @@ -1386,7 +1386,7 @@ bool SliceAxisRel(const Array& types, << begin << " vs " << end; std::vector&& oshape = AsVector(data->shape); - oshape[axis] = IndexExpr(end - begin); + oshape[axis] = make_const(Int(64), end - begin); // assign output type reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); @@ -1394,9 +1394,9 @@ bool SliceAxisRel(const Array& types, } Expr MakeSliceAxis(Expr data, - int axis, - int begin, - int end) { + Integer axis, + Integer begin, + Integer end) { auto attrs = make_node(); attrs->axis = axis; attrs->begin = begin; @@ -1417,9 +1417,9 @@ Array SliceAxisCompute(const Attrs& attrs, const SliceAxisAttrs *param = attrs.as(); const Array src_shape = inputs[0]->shape; Array begin_idx, end_idx, strides; - int axis = param->axis; - int begin = param->begin; - int end = param->end; + int64_t axis = param->axis; + int64_t begin = param->begin; + int64_t end = param->end; if (axis < 0) { axis += src_shape.size(); @@ -1431,12 +1431,12 @@ Array SliceAxisCompute(const Attrs& attrs, end += *as_const_int(src_shape[axis]); } for (size_t i = 0; i < src_shape.size(); ++i) { - begin_idx.push_back(make_const(tvm::Int(32), 0)); - strides.push_back(make_const(tvm::Int(32), 1)); + begin_idx.push_back(make_const(Int(64), 0)); + strides.push_back(make_const(Int(64), 1)); } end_idx = Array(src_shape); - begin_idx.Set(axis, make_const(tvm::Int(32), begin)); - end_idx.Set(axis, make_const(tvm::Int(32), end)); + 
begin_idx.Set(axis, make_const(Int(64), begin)); + end_idx.Set(axis, make_const(Int(64), end)); return Array{ topi::strided_slice(inputs[0], diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 89c0da381aae..900a8e856ddb 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -182,13 +182,13 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): @nms.register(["cuda", "gpu"]) -def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1, - do_rearrange=False): +def nms_gpu(data, valid_count, iou_threshold=0.5, force_suppress=False, + topk=-1, id_index=0, do_rearrange=False): """Non-maximum suppression operator for object detection. Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -196,15 +196,21 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + iou_threshold : optional, float Non-maximum suppression threshold. - force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + topk : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + do_rearrange : optional, boolean + Whether to move all valid bounding boxes to the top. + Returns ------- out : tvm.Tensor @@ -217,14 +223,13 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder( - (dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + topk = -1 + out = nms(data, valid_count, iou_threshold, force_suppress, topk) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -264,8 +269,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk tvm.extern(data.shape, [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), + ins[0], ins[1], ins[2], outs[0], iou_threshold, + force_suppress, topk), dtype="float32", in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], tag="nms") diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 87a4a84c5ab5..907f6ac40346 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -19,29 +19,31 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): data : tvm.Tensor or numpy NDArray 4-D tensor with shape [batch, channel, height, width]] - sizes : tvm.ndarray - 1-D tensor of sizes for anchor boxes. + sizes : tvm ConsExpr + Sizes for anchor boxes. - ratios : tvm.ndarray - 1-D tensor of ratios for anchor boxes. + ratios : tvm ConsExpr + Ratios for anchor boxes. 
- steps : tvm.ndarray - 1-D tensor of priorbox step across y and x, -1 for auto calculation. + steps : tvm ConsExpr + Priorbox step across y and x, -1 for auto calculation. - offsets : tvm.ndarray - 1-D tensor priorbox center offsets, y and x respectively. + offsets : tvm ConsExpr + Priorbox center offsets, y and x respectively. Returns ------- output : tvm.Tensor or numpy NDArray 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - in_height, in_width = data.shape[2], data.shape[3] - num_sizes, num_ratios = sizes.shape[0], ratios.shape[0] + in_height = data.shape[2] + in_width = data.shape[3] + num_sizes = len(sizes) + num_ratios = len(ratios) num_boxes = in_height * in_width * (num_sizes + num_ratios - 1) - output = output_tensor((1, num_boxes, 4), data.dtype) - steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height - steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + output = output_tensor((1, num_boxes, 4), "float32") + steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] @@ -49,7 +51,7 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): center_h = (i + offset_h) * steps_h for j in range(in_width): center_w = (j + offset_w) * steps_w - for k in range(num_sizes + num_ratios - 1): + for k in const_range(num_sizes + num_ratios - 1): if k < num_sizes: w = sizes[k] * in_height / in_width / 2.0 h = sizes[k] / 2.0 @@ -96,7 +98,8 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, out : tvm.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - out = hybrid_multibox_prior(data, sizes, ratios, steps, offsets) + out = hybrid_multibox_prior(data, tvm.convert(sizes), tvm.convert(ratios), + tvm.convert(steps), tvm.convert(offsets)) if clip: out = topi.clip(out, 0, 1) return out @@ -105,11 +108,23 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, def _hybridy_transform_loc(box, pred_loc, variance, clip): """Transform prior anchor box to output box through location predictions. """ - al, at, ar, ab = box[0], box[1], box[2], box[3] - px, py, pw, ph = pred_loc[0], pred_loc[1], \ - pred_loc[2], pred_loc[3] - vx, vy, vw, vh = variance[0], variance[1], \ - variance[2], variance[3] + al = box[0] + at = box[1] + ar = box[2] + ab = box[3] + + px = pred_loc[0] + py = pred_loc[1] + pw = pred_loc[2] + ph = pred_loc[3] + + vx = variance[0] + vy = variance[1] + vw = variance[2] + vh = variance[3] + + output = output_tensor((4,), pred_loc.dtype) + aw = ar - al ah = ab - at ax = (al + ar) / 2.0 @@ -118,11 +133,11 @@ def _hybridy_transform_loc(box, pred_loc, variance, clip): oy = py * vy * ah + ay ow = exp(pw * vw) * aw / 2.0 oh = exp(ph * vh) * ah / 2.0 - out_l = max(0, min(1, ox - ow)) if clip else ox - ow - out_t = max(0, min(1, oy - oh)) if clip else oy - oh - out_r = max(0, min(1, ox + ow)) if clip else ox + ow - out_b = max(0, min(1, oy + oh)) if clip else oy + oh - return out_l, out_t, out_r, out_b + output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow + output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh + output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow + output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh + return output @hybrid.script def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, @@ -135,7 +150,7 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, 3-D tensor of class probabilities. 
loc_pred : tvm.Tensor or numpy NDArray - 3-D tensor of location regression predictions. + 2-D tensor of location regression predictions. anchor : tvm.Tensor or numpy NDArray 3-D tensor of prior anchor boxes. @@ -189,6 +204,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, batch_size = cls_prob.shape[0] num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] + box_coord = allocate((4,), loc_pred.dtype) + pred_coord = allocate((4,), loc_pred.dtype) out_loc = output_tensor((batch_size, num_anchors, 6), loc_pred.dtype) valid_count = output_tensor((batch_size,), "int32") @@ -215,7 +232,7 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, for k in range(num_classes): if k > 0: temp = cls_prob[i, k, j] - cls_id = j if temp > score else cls_id + cls_id = k if temp > score else cls_id score = max(temp, score) if cls_id > 0 and score < threshold: cls_id = 0 @@ -225,12 +242,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, if cls_id > 0: out_loc[i, valid_count[i], 0] = cls_id - 1.0 out_loc[i, valid_count[i], 1] = score - out_coord = _hybridy_transform_loc(anchor[j], loc_pred[i, j], + for l in range(4): + box_coord[l] = anchor[0, j, l] + pred_coord[l] = loc_pred[i, j * 4 + l] + out_coord = _hybridy_transform_loc(box_coord, pred_coord, variances, clip) out_loc[i, valid_count[i], 2] = out_coord[0] out_loc[i, valid_count[i], 3] = out_coord[1] out_loc[i, valid_count[i], 4] = out_coord[2] out_loc[i, valid_count[i], 5] = out_coord[3] + valid_count[i] += 1 return out_loc, valid_count @@ -266,7 +287,7 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 out, valid_count = hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, tvm.const(clip, "bool"), tvm.const(threshold, "float32"), - variances) + tvm.convert(variances)) return out, valid_count @tvm.target.generic_func diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 9c14c2c95051..fd52f13795a1 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -74,7 +74,7 @@ def test_nms(): [1, 0.5, 100, 60, 70, 110]]]).astype(data.dtype) np_valid_count = np.array([4]).astype(valid_count.dtype) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) def check_device(device): From 717e61c6cec1d91d4ddaae77c2d7952c8e6a06bd Mon Sep 17 00:00:00 2001 From: Wang Date: Fri, 11 Jan 2019 17:18:18 -0800 Subject: [PATCH 14/43] Fix slice_axis relay register issue --- include/tvm/relay/attrs/transform.h | 6 ++--- src/relay/op/tensor/transform.cc | 24 +++++++++---------- tests/python/relay/test_op_level4.py | 4 ++-- topi/python/topi/testing/slice_axis_python.py | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index d179ae46fade..720d6b9d3690 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -172,9 +172,9 @@ struct StridedSliceAttrs : public tvm::AttrsNode { }; struct SliceAxisAttrs : public tvm::AttrsNode { - Integer axis; - Integer begin; - Integer end; + int axis; + int begin; + int end; TVM_DECLARE_ATTRS(SliceAxisAttrs, "relay.attrs.SliceAxisAttrs") { TVM_ATTR_FIELD(axis) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 
abdd41e306ab..73b74a60d756 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1368,9 +1368,9 @@ bool SliceAxisRel(const Array& types, const SliceAxisAttrs *param = attrs.as(); auto src_shape = data->shape; - int64_t axis = param->axis; - int64_t begin = param->begin; - int64_t end = param->end; + int axis = param->axis; + int begin = param->begin; + int end = param->end; if (axis < 0) { axis += src_shape.size(); @@ -1386,7 +1386,7 @@ bool SliceAxisRel(const Array& types, << begin << " vs " << end; std::vector&& oshape = AsVector(data->shape); - oshape[axis] = make_const(Int(64), end - begin); + oshape[axis] = make_const(Int(32), end - begin); // assign output type reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); @@ -1417,9 +1417,9 @@ Array SliceAxisCompute(const Attrs& attrs, const SliceAxisAttrs *param = attrs.as(); const Array src_shape = inputs[0]->shape; Array begin_idx, end_idx, strides; - int64_t axis = param->axis; - int64_t begin = param->begin; - int64_t end = param->end; + int axis = param->axis; + int begin = param->begin; + int end = param->end; if (axis < 0) { axis += src_shape.size(); @@ -1431,12 +1431,12 @@ Array SliceAxisCompute(const Attrs& attrs, end += *as_const_int(src_shape[axis]); } for (size_t i = 0; i < src_shape.size(); ++i) { - begin_idx.push_back(make_const(Int(64), 0)); - strides.push_back(make_const(Int(64), 1)); + begin_idx.push_back(make_const(Int(32), 0)); + strides.push_back(make_const(Int(32), 1)); } end_idx = Array(src_shape); - begin_idx.Set(axis, make_const(Int(64), begin)); - end_idx.Set(axis, make_const(Int(64), end)); + begin_idx.Set(axis, make_const(Int(32), begin)); + end_idx.Set(axis, make_const(Int(32), end)); return Array{ topi::strided_slice(inputs[0], @@ -1446,7 +1446,7 @@ Array SliceAxisCompute(const Attrs& attrs, }; } -RELAY_REGISTER_OP("relay.op._make.slice_axis") +RELAY_REGISTER_OP("slice_axis") .describe(R"doc(Slices along a given axis. Returns an array slice along a given axis starting from the begin index to the end index. 
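The shape rule implemented by SliceAxisRel can be restated as a small python sketch, which may help when reading the Int(64) to Int(32) changes above. This is illustrative only; a negative axis or begin and a non-positive end wrap around exactly as in the C++ relation.

def slice_axis_out_shape(src_shape, axis, begin, end):
    # Normalize the indices the same way SliceAxisRel does, then compute
    # the length of the sliced axis.
    ndim = len(src_shape)
    if axis < 0:
        axis += ndim
    if begin < 0:
        begin += src_shape[axis]
    if end <= 0:
        end += src_shape[axis]
    assert begin < end, "begin should be smaller than end"
    oshape = list(src_shape)
    oshape[axis] = end - begin
    return oshape

print(slice_axis_out_shape((1, 10, 6), axis=1, begin=-5, end=0))  # [1, 5, 6]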
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index dcb5f985fcb2..4ba7e8cd1e72 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -203,8 +203,8 @@ def verify(dshape, axis, begin, end): func = relay.Function([x], z) func = relay.ir_pass.infer_type(func) text = func.astext() - assert "begin=" in text - assert "end=" in text + assert "begin" in text + assert "end" in text x_data = np.random.uniform(size=dshape).astype("float32") ref_res = topi.testing.slice_axis_python( x_data, axis, begin, end) diff --git a/topi/python/topi/testing/slice_axis_python.py b/topi/python/topi/testing/slice_axis_python.py index 2db646c9e3a8..589e5914a36c 100644 --- a/topi/python/topi/testing/slice_axis_python.py +++ b/topi/python/topi/testing/slice_axis_python.py @@ -31,4 +31,4 @@ def slice_axis_python(data, axis, begin, end=None): end += dshape[axis] slc = [slice(None)] * len(dshape) slc[axis] = slice(begin, end) - return data[slc] + return data[tuple(slc)] From b9681eeed67faa85f1827441a8df0de6b4ddb187 Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 13 Jan 2019 11:21:09 -0800 Subject: [PATCH 15/43] Add get_valid_counts relay test --- nnvm/tests/python/compiler/test_top_level4.py | 2 +- tests/python/relay/test_op_level5.py | 56 ++++++++++++++++--- topi/python/topi/vision/ssd/multibox.py | 52 +++-------------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index b84621128614..9e44fa0e5b1c 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -629,7 +629,7 @@ def test_nms(): [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) target = "llvm" diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c6cbfd921257..1e7fe76c0a9e 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -135,8 +135,47 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True) +def test_get_valid_counts(): + def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score >= score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1 + + x = relay.var("x", relay.ty.TensorType(dshape, dtype)) + z = relay.vision.get_valid_counts(x, score_threshold) + assert "score_threshold" in z.astext() + func = relay.Function([x], z.astuple()) + func = relay.ir_pass.infer_type(func) + ctx_list = [("llvm", tvm.cpu(0))] + for target, ctx in ctx_list: + intrp = relay.create_executor("debug", ctx=ctx, target=target) + out = intrp.evaluate(func)(np_data) + tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3) + 
tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3) + + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + + def test_nms(): - def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count, + def verify_nms(x0_data, x1_data, dshape, ref_res, overlap_threshold=0.5, force_suppress=False, topk=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) @@ -165,26 +204,24 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count, [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) num_anchors = 5 dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], + verify_nms(np_data, np_valid_count, dshape, np_result, force_suppress=True, topk=2, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], + verify_nms(np_data, np_valid_count, dshape, np_result, force_suppress=True, topk=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [1, 0.7, 30, 60, 50, 80], [-1, 0.9, 35, 61, 52, 79], + [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - topk=3) + verify_nms(np_data, np_valid_count, dshape, np_result, topk=3) def test_multibox_transform_loc(): @@ -411,6 +448,7 @@ def verify_yolo_reorg(shape, stride): test_resize() test_multibox_prior() test_multibox_transform_loc() + test_get_valid_counts() test_nms() test_roi_align() test_proposal() diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 907f6ac40346..c63874750e04 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -47,6 +47,10 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): offset_h = offsets[0] offset_w = offsets[1] + # Need to define var out of const_range + if + w = 0.0 + h = 0.0 + for i in parallel(in_height): center_h = (i + offset_h) * steps_h for j in range(in_width): @@ -57,8 +61,8 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): h = sizes[k] / 2.0 else: w = sizes[0] * in_height / in_width \ - * sqrt(ratios[k - num_sizes + 1]) / 2.0 - h = sizes[0] * sqrt(ratios[k - num_sizes + 1]) / 2.0 + * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 + h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 count = i * in_width * (num_sizes + num_ratios - 1) \ + j * (num_sizes + num_ratios - 1) + k output[0, count, 0] = center_w - w @@ -104,6 +108,7 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, out = topi.clip(out, 0, 1) return out + @hybrid.script def _hybridy_transform_loc(box, pred_loc, variance, clip): """Transform prior anchor box to output box through location predictions. 
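The decoding that _hybridy_transform_loc performs can be summarized with a short numpy sketch; corner-format anchors and the (vx, vy, vw, vh) variances are assumed from the hybrid implementation earlier in this series, and the names below are illustrative only.

import numpy as np

def transform_loc_ref(box, pred, var, clip=True):
    al, at, ar, ab = box   # anchor corners: left, top, right, bottom
    px, py, pw, ph = pred  # predicted offsets
    vx, vy, vw, vh = var   # variances
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2.0, (at + ab) / 2.0
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = np.exp(pw * vw) * aw / 2.0
    oh = np.exp(ph * vh) * ah / 2.0
    out = np.array([ox - ow, oy - oh, ox + ow, oy + oh])
    return np.clip(out, 0.0, 1.0) if clip else out

# With zero offsets the anchor is returned unchanged.
print(transform_loc_ref([0.1, 0.1, 0.5, 0.5], [0.0, 0.0, 0.0, 0.0],
                        [0.1, 0.1, 0.2, 0.2]))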
@@ -166,37 +171,8 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, Returns ------- -<<<<<<< HEAD - stmt : Stmt - The result IR statement. - """ - def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh): - """Transform prior anchor box to output box through location predictions. - """ - al = anchor[anchor_base_idx] - at = anchor[anchor_base_idx + 1] - ar = anchor[anchor_base_idx + 2] - ab = anchor[anchor_base_idx + 3] - aw = ar - al - ah = ab - at - ax = (al + ar) / 2.0 - ay = (at + ab) / 2.0 - px = loc[loc_base_idx] - py = loc[loc_base_idx + 1] - pw = loc[loc_base_idx + 2] - ph = loc[loc_base_idx + 3] - ox = px * vx * aw + ax - oy = py * vy * ah + ay - ow = tvm.exp(pw * vw) * aw / 2.0 - oh = tvm.exp(ph * vh) * ah / 2.0 - return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh) -======= out_loc : tvm.Tensor or numpy NDArray 3-D tensor of transformed location. ->>>>>>> Modify SSD tutorial valid_count : tvm.Tensor or numpy NDArray 1_d tensor of valid counts for boxes. @@ -214,19 +190,6 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, valid_count[i] = 0 for j in range(num_anchors): # Find the predicted class id and probability -<<<<<<< HEAD - score = ib.allocate('float32', (1,), name="score", scope="local") - cls_id = ib.allocate('int32', (1,), name="id", scope="local") - score[0] = -1.0 - cls_id[0] = 0 - with ib.for_range(0, num_classes, name="j") as j: - with ib.if_scope(j > 0): - temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] - cls_id[0] = tvm.if_then_else(temp > score[0], j, cls_id[0]) - score[0] = tvm.max(temp, score[0]) - with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): - cls_id[0] = 0 -======= score = -1.0 cls_id = 0 for k in range(num_classes): @@ -236,7 +199,6 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, score = max(temp, score) if cls_id > 0 and score < threshold: cls_id = 0 ->>>>>>> Modify SSD tutorial # [id, prob, xmin, ymin, xmax, ymax] # Remove background, restore original id if cls_id > 0: From e277a552c03b378e00c7f2286efa44914201dac7 Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 13 Jan 2019 22:06:46 -0800 Subject: [PATCH 16/43] Fix multibox_transform_loc --- nnvm/python/nnvm/top/vision.py | 2 +- nnvm/tests/python/compiler/test_top_level4.py | 3 ++- topi/python/topi/vision/ssd/multibox.py | 9 ++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index f5f41d33e363..cb69b897a7be 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -58,7 +58,7 @@ def compute_multibox_transform_loc(attrs, inputs, _): return topi.vision.ssd.multibox_transform_loc(inputs[0], inputs[1], inputs[2], clip, threshold, variance) -reg.register_pattern("multibox_detection", OpPattern.OPAQUE) +reg.register_pattern("multibox_transform_loc", OpPattern.OPAQUE) # Get valid number of anchor boxes @reg.register_schedule("get_valid_counts") diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index 9e44fa0e5b1c..38646b01a4c9 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -3,6 +3,7 @@ import tvm from tvm.contrib 
import graph_runtime import topi +import topi.testing import nnvm.symbol as sym import nnvm.compiler from nnvm.testing.config import ctx_list @@ -657,7 +658,7 @@ def np_slice_like(np_data, np_shape_like, axis=[]): slice_idx = [] for b, e in zip(begin_idx, end_idx): slice_idx.append(slice(b, e)) - np_result = np_data[slice_idx] + np_result = np_data[tuple(slice_idx)] return np_result def verify_slice_like(np_data, np_shape_like, axis=[]): diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index c63874750e04..f20a286960cc 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -246,11 +246,10 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 ------- ret : tuple of tvm.Tensor """ - out, valid_count = hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, - tvm.const(clip, "bool"), - tvm.const(threshold, "float32"), - tvm.convert(variances)) - return out, valid_count + return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + tvm.const(clip, "bool"), + tvm.const(threshold, "float32"), + tvm.convert(variances)) @tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, From ef00b7fd7c762a6be3c21d96f7a55d5d4e908f16 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Jan 2019 20:58:50 +0000 Subject: [PATCH 17/43] Fix relay from_mxnet --- python/tvm/relay/frontend/mxnet.py | 2 ++ python/tvm/relay/op/transform.py | 3 ++- python/tvm/relay/op/vision/_vision.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 50e77e08809d..6f6639c5df9a 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -682,6 +682,8 @@ def from_mxnet(symbol, params[k] = _nd.array(v.data().asnumpy()) data = mx.sym.Variable("data") sym = symbol(data) + if isinstance(sym, (list, tuple)): + sym = mx.sym.Group(sym) shape, dtype = _update_shape_dtype(shape, dtype, params) sym = _from_mxnet_impl(sym, shape, dtype) elif isinstance(symbol, mx.gluon.Block): diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 9a6f308d341a..f19aa19772b4 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -471,7 +471,7 @@ def strided_slice(data, begin, end, strides=None): return _make.strided_slice(data, list(begin), list(end), list(strides)) -def slice_axis(data, axis, begin, end=0): +def slice_axis(data, axis, begin, end=None): """Slice input array along specific axis. Parameters @@ -493,6 +493,7 @@ def slice_axis(data, axis, begin, end=0): ret : relay.Expr The computed result. 
""" + end = end or 0 return _make.slice_axis(data, axis, begin, end) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 2d15562995ec..57ca6f2da2ce 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -59,7 +59,7 @@ def compute_multibox_transform_loc(attrs, inputs, _, target): def schedule_get_valid_counts(_, outs, target): """Schedule definition of get_valid_counts""" with target: - return topi.generic.schedule_nms(outs) + return topi.generic.schedule_get_valid_counts(outs) @reg.register_compute("vision.get_valid_counts") From fa89a2ab6a69e4305f51b45229d6eef0f5d00cd9 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 14 Jan 2019 14:18:27 -0800 Subject: [PATCH 18/43] Fix l2_normalize --- python/tvm/relay/frontend/mxnet.py | 2 +- tests/python/frontend/mxnet/test_forward.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 6f6639c5df9a..c5a6ae8e8b4d 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -416,7 +416,7 @@ def _mx_l2_normalize(inputs, attrs): if mode != 'channel': raise RuntimeError('mode %s is not supported.' % mode) new_attrs['eps'] = attrs.get_float('eps', 1e-10) - new_attrs['axis'] = 1 + new_attrs['axis'] = [1] return _op.nn.l2_normalize(inputs[0], **new_attrs) diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 2dfe20c503e6..b6646b3c2a2f 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -257,6 +257,7 @@ def verify(start, stop, step): verify(20, 1, -1) verify(20, 1, -1.5) +<<<<<<< HEAD def _mx_symbol(F, op_name, inputs): op = getattr(F, op_name) return op(*inputs) @@ -374,6 +375,11 @@ def verify(x_shape, y_shape, axes): verify((3, 4), (2, 3), (0)) verify((3, 4), (2, 3), (-1)) +def test_forward_l2_normalize(): + data = mx.sym.var('data') + mx_sym = mx.sym.L2Normalization(data, mode="channel") + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) + if __name__ == '__main__': test_forward_mlp() @@ -401,5 +407,6 @@ def verify(x_shape, y_shape, axes): test_forward_broadcast_ops() test_forward_elemwise_ops() test_forward_scalar_ops() - test_forward_slice_axis() test_forward_slice_like() + test_forward_slice_axis() + test_forward_l2_normalize() From 925c14021a995cdc1e7c367a9514006778bd86ab Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 14 Jan 2019 16:44:08 -0800 Subject: [PATCH 19/43] Fix lint --- nnvm/src/top/tensor/elemwise.cc | 19 ++++++------------- python/tvm/relay/frontend/mxnet.py | 2 +- topi/include/topi/nn/l2_normalize.h | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc index 9c1687beab35..5a39f3ecc392 100644 --- a/nnvm/src/top/tensor/elemwise.cc +++ b/nnvm/src/top/tensor/elemwise.cc @@ -815,12 +815,10 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__equal_scalar__) "FTVMCompute", [](const NodeAttrs& attrs, const Array& inputs, const Array& out_info) { - Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x == y; }), - out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); @@ -837,8 +835,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__not_equal_scalar__) Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x != y; }), - 
out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); @@ -855,8 +852,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_scalar__) Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x > y; }), - out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); @@ -873,8 +869,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_equal_scalar__) Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x >= y; }), - out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); @@ -891,8 +886,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_scalar__) Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x < y; }), - out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); @@ -909,8 +903,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_equal_scalar__) Tensor out = topi::cast( binary_scalar_op(attrs, inputs[0], [](Expr x, Expr y) { return x <= y; }), - out_info[0]->dtype - ); + out_info[0]->dtype); return Array{ out }; }) .set_attr("FGradient", MakeZeroGradNodes); diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index c5a6ae8e8b4d..727fa5828aef 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -401,7 +401,7 @@ def _mx_box_nms(inputs, attrs): if out_format != 'corner': raise RuntimeError('out_format %s is not supported.' % out_format) - ret =_op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) nms_out = _op.vision.nms(ret[1], ret[0], iou_threshold=overlap_thresh, force_suppress=force_suppress, diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index e022d76871a0..4f9bdb61ab70 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -33,7 +33,7 @@ inline Tensor l2_normalize(const Tensor& data, for (size_t i = 0; i < axis.size(); ++i) { int ax = topi::detail::GetConstInt(axis[i]); CHECK_LT(ax, data->shape.size()) << - "Axis " << ax << " exceeds input data dim " << + "Axis " << ax << " exceeds input data dim " << data->shape.size(); } auto input_shape = data->shape; From 6819dc38a5f915850b434b2a1b852a382380c069 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 14 Jan 2019 17:43:59 -0800 Subject: [PATCH 20/43] Add cuda schedule for get_valid_counts --- topi/python/topi/cuda/vision.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 17497abc0d8b..e3bc0fb9d547 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -162,3 +162,20 @@ def traverse(op): scheduled_ops.append(op) traverse(outs[0].op) return s + +@generic.schedule_get_valid_counts.register(["cuda", "gpu"]) +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of get_valid_counts + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs) From 8eaff5c82e732d83bfe416f81b6cccd24e04d5e7 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 15 Jan 2019 10:10:20 -0800 Subject: [PATCH 21/43] Fix tutorial --- tutorials/nnvm/deploy_ssd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py index f7e3b19f9767..1a71c96eaa0c 100644 --- a/tutorials/nnvm/deploy_ssd.py +++ b/tutorials/nnvm/deploy_ssd.py @@ -165,4 +165,4 @@ def display(img, out, thresh=0.5): plt.show() image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) -display(image, tvm_output.asnumpy()[0], thresh=0.45) \ No newline at end of file +display(image, tvm_output.asnumpy()[0], thresh=0.45) From 742376221ba4a79730e834a6a4b4023033e5b9da Mon Sep 17 00:00:00 2001 From: Wang Date: Thu, 17 Jan 2019 22:16:24 -0800 Subject: [PATCH 22/43] Fix ctx_list --- nnvm/tests/python/compiler/test_top_level4.py | 82 ++++++++++--------- tutorials/nnvm/deploy_ssd.py | 2 +- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index 38646b01a4c9..87620c8b3acf 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -528,14 +528,13 @@ def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), if clip: np_out = np.clip(np_out, 0, 1) - target = "llvm" - ctx = tvm.cpu() - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) - m.run() - out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) - tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) + m.run() + out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) def test_multibox_prior(): verify_multibox_prior((1, 3, 50, 50)) @@ -562,17 +561,18 @@ def test_multibox_transform_loc(): [0, 0.44999999, 1, 1, 1, 1], [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]]) - target = "llvm" dtype = "float32" - ctx = tvm.cpu() - graph, lib, _ = nnvm.compiler.build(out, target, {"cls_prob": (batch_size, num_anchors, num_classes), - "loc_preds": (batch_size, num_anchors * 4), - "anchors": (1, num_anchors, 4)}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)}) - m.run() - out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) - tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) + for target, ctx in ctx_list(): + if target == "cuda": + continue + graph, lib, _ = nnvm.compiler.build(out, target, {"cls_prob": (batch_size, num_anchors, num_classes), + "loc_preds": (batch_size, num_anchors * 4), + "anchors": (1, num_anchors, 4)}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)}) + m.run() + out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) def verify_get_valid_counts(dshape, 
score_threshold): dtype = "float32" @@ -594,19 +594,20 @@ def verify_get_valid_counts(dshape, score_threshold): for k in range(elem_length): np_out2[i, j, k] = -1 - target = "llvm" - ctx = tvm.cpu() - data = sym.Variable("data", dtype=dtype) - valid_counts, inter_data = sym.get_valid_counts(data, score_threshold=score_threshold) - out = sym.Group([valid_counts, inter_data]) - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input("data", np_data) - m.run() - out1 = m.get_output(0, tvm.nd.empty(np_out1.shape, "int32")) - out2 = m.get_output(1, tvm.nd.empty(dshape, dtype)) - tvm.testing.assert_allclose(out1.asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(out2.asnumpy(), np_out2, rtol=1e-3) + for target, ctx in ctx_list(): + if target == "cuda": + continue + data = sym.Variable("data", dtype=dtype) + valid_counts, inter_data = sym.get_valid_counts(data, score_threshold=score_threshold) + out = sym.Group([valid_counts, inter_data]) + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np_data) + m.run() + out1 = m.get_output(0, tvm.nd.empty(np_out1.shape, "int32")) + out2 = m.get_output(1, tvm.nd.empty(dshape, dtype)) + tvm.testing.assert_allclose(out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(out2.asnumpy(), np_out2, rtol=1e-3) def test_get_valid_counts(): @@ -633,15 +634,16 @@ def test_nms(): [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - target = "llvm" - ctx = tvm.cpu() - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape, "valid_count": (dshape[0],)}, - dtype={"data": "float32", "valid_count": "int32"}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**{"data": np_data, "valid_count": np_valid_count}) - m.run() - out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32")) - tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) + for target, ctx in ctx_list(): + if target == "cuda": + continue + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape, "valid_count": (dshape[0],)}, + dtype={"data": "float32", "valid_count": "int32"}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"data": np_data, "valid_count": np_valid_count}) + m.run() + out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32")) + tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) def np_slice_like(np_data, np_shape_like, axis=[]): begin_idx = [0 for _ in np_data.shape] diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py index 1a71c96eaa0c..f7e3b19f9767 100644 --- a/tutorials/nnvm/deploy_ssd.py +++ b/tutorials/nnvm/deploy_ssd.py @@ -165,4 +165,4 @@ def display(img, out, thresh=0.5): plt.show() image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) -display(image, tvm_output.asnumpy()[0], thresh=0.45) +display(image, tvm_output.asnumpy()[0], thresh=0.45) \ No newline at end of file From b3c8a7c32a34b34883d32978b38ef758cd76fbea Mon Sep 17 00:00:00 2001 From: Wang Date: Fri, 18 Jan 2019 10:27:49 -0800 Subject: [PATCH 23/43] Add install gluoncv --- nnvm/tests/python/compiler/test_top_level4.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index 87620c8b3acf..f3c297e5f6e9 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -529,6 +529,8 @@ 
def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), np_out = np.clip(np_out, 0, 1) for target, ctx in ctx_list(): + if target == "cuda": + continue graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) m = graph_runtime.create(graph, lib, ctx) m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) From 986c4f7b4cf02b2317dcea4c8a5c9b194e45515f Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 27 Jan 2019 21:38:50 -0800 Subject: [PATCH 24/43] Disable box_nms in frontend test --- tests/python/frontend/mxnet/test_forward.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index b6646b3c2a2f..fb975c11add0 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -257,6 +257,7 @@ def verify(start, stop, step): verify(20, 1, -1) verify(20, 1, -1.5) +<<<<<<< HEAD <<<<<<< HEAD def _mx_symbol(F, op_name, inputs): op = getattr(F, op_name) From e7df94c3d024fe842bfea63af8c265a20592e600 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 28 Jan 2019 11:59:24 -0800 Subject: [PATCH 25/43] Fix test get_valid_counts numpy result --- topi/tests/python/test_topi_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index fd52f13795a1..d7dbab0bca4e 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -22,7 +22,7 @@ def verify_get_valid_counts(dshape, score_threshold): inter_idx = 0 for j in range(num_anchor): score = np_data[i, j, 1] - if score >= score_threshold: + if score > score_threshold: for k in range(elem_length): np_out2[i, inter_idx, k] = np_data[i, j, k] np_out1[i] += 1 From 45b6aacf3b9ed6229105d2df31edb97b2dc8fa8a Mon Sep 17 00:00:00 2001 From: Wang Date: Thu, 31 Jan 2019 12:07:48 -0800 Subject: [PATCH 26/43] Rename ssd tutorial --- tutorials/nnvm/{deploy_ssd.py => deploy_ssd_gluoncv.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tutorials/nnvm/{deploy_ssd.py => deploy_ssd_gluoncv.py} (100%) diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd_gluoncv.py similarity index 100% rename from tutorials/nnvm/deploy_ssd.py rename to tutorials/nnvm/deploy_ssd_gluoncv.py From 26ece34043173adf184591c6409d4d1f2b14847a Mon Sep 17 00:00:00 2001 From: Wang Date: Thu, 31 Jan 2019 12:14:58 -0800 Subject: [PATCH 27/43] Fix rebase --- tutorials/nnvm/deploy_ssd_gluoncv.py | 200 ++++++++++----------------- tutorials/nnvm/deploy_ssd_mxnet.py | 168 ++++++++++++++++++++++ 2 files changed, 240 insertions(+), 128 deletions(-) create mode 100644 tutorials/nnvm/deploy_ssd_mxnet.py diff --git a/tutorials/nnvm/deploy_ssd_gluoncv.py b/tutorials/nnvm/deploy_ssd_gluoncv.py index f7e3b19f9767..d83d1f86b75e 100644 --- a/tutorials/nnvm/deploy_ssd_gluoncv.py +++ b/tutorials/nnvm/deploy_ssd_gluoncv.py @@ -1,26 +1,20 @@ """ Deploy Single Shot Multibox Detector(SSD) model =============================================== -**Author**: `Yao Wang `_, \ -`Leyuan Wang `_ +**Author**: `Yao Wang `_ This article is an introductory tutorial to deploy SSD models with TVM. -We will use mxnet pretrained SSD model with Resnet50 as body network and -convert it to NNVM graph; +We will use GluonCV pre-trained SSD model and convert it to NNVM graph. 
""" -import os -import zipfile import tvm -import mxnet as mx -import cv2 -import numpy as np +from matplotlib import pyplot as plt from nnvm import compiler from nnvm.frontend import from_mxnet +from nnvm.testing.config import ctx_list from tvm import relay -from tvm.contrib.download import download from tvm.contrib import graph_runtime -from mxnet.model import load_checkpoint +from gluoncv import model_zoo, data, utils ###################################################################### @@ -33,136 +27,86 @@ # echo "set(USE_SORT ON)" > config.mk # make -j8 # - -model_name = "ssd_resnet50_512" -model_file = "%s.zip" % model_name -test_image = "dog.jpg" +# .. note:: +# +# Currently we support compiling SSD on CPU only. +# GPU support is in progress. +# +# To get best inference performance on CPU, change +# target argument according to your device and +# follow the :ref:`tune_nnvm_x86` to tune x86 CPU and +# :ref:`tune_nnvm_arm` for arm cpu. +# +# SSD with VGG as body network is not supported yet since +# x86 conv2d schedule doesn't support dilation. + +supported_model = [ + 'ssd_512_resnet18_v1_voc', + 'ssd_512_resnet18_v1_coco', + 'ssd_512_resnet50_v1_voc', + 'ssd_512_resnet50_v1_coco', + 'ssd_512_resnet101_v2_voc', + 'ssd_512_mobilenet1_0_voc', + 'ssd_512_mobilenet1_0_coco', +] + +model_name = "ssd_512_resnet50_v1_voc" dshape = (1, 3, 512, 512) dtype = "float32" - -# Target settings -# Use these commented settings to build for cuda. -#target = 'cuda' -#ctx = tvm.gpu(0) -# Use these commented settings to build for opencl. -#target = 'opencl' -#ctx = tvm.opencl(0) -target = "llvm" -ctx = tvm.cpu() +target_list = ctx_list() +frontend_list = ["nnvm", "relay"] ###################################################################### -# Download MXNet SSD pre-trained model and demo image -# --------------------------------------------------- -# Pre-trained model available at -# https://github.com/apache/incubator-\mxnet/tree/master/example/ssd - -model_url = "https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/" \ - "resnet50_ssd_512_voc0712_trainval.zip" -image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \ - "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg" -inference_symbol_folder = \ - "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" -inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \ - "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip" - -dir = "ssd_model" -if not os.path.exists(dir): - os.makedirs(dir) -model_file_path = "%s/%s" % (dir, model_file) -test_image_path = "%s/%s" % (dir, test_image) -inference_symbol_path = "%s/inference_model.zip" % dir -download(model_url, model_file_path) -download(image_url, test_image_path) -download(inference_symbol_url, inference_symbol_path) - -zip_ref = zipfile.ZipFile(model_file_path, 'r') -zip_ref.extractall(dir) -zip_ref.close() -zip_ref = zipfile.ZipFile(inference_symbol_path) -zip_ref.extractall(dir) -zip_ref.close() +# Download and pre-process demo image + +im_fname = utils.download('https://github.com/dmlc/web-data/blob/master/' + + 'gluoncv/detection/street_small.jpg?raw=true', + path='street_small.jpg') +x, img = data.transforms.presets.ssd.load_test(im_fname, short=512) ###################################################################### # Convert and compile model with NNVM or Relay for CPU. 
-sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder)) -_, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0) - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument( - "-f", "--frontend", - help="Frontend for compilation, nnvm or relay", - type=str, - default="nnvm") -args = parser.parse_args() -if args.frontend == "relay": - net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params, \ - aux_params=aux_params) - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(net, target, params=params) -elif args.frontend == "nnvm": - net, params = from_mxnet(sym, arg_params, aux_params) - with compiler.build_config(opt_level=3): - graph, lib, params = compiler.build( - net, target, {"data": dshape}, params=params) -else: - parser.print_help() - parser.exit() +block = model_zoo.get_model(model_name, pretrained=True) + +def compile(frontend, target): + if frontend == "relay": + net, params = relay.frontend.from_mxnet(block, {"data": dshape}) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) + else: + net, params = from_mxnet(block) + with compiler.build_config(opt_level=3): + graph, lib, params = compiler.build( + net, target, {"data": dshape}, params=params) + return graph, lib, params ###################################################################### # Create TVM runtime and do inference -# Preprocess image -image = cv2.imread(test_image_path) -img_data = cv2.resize(image, (dshape[2], dshape[3])) -img_data = img_data[:, :, (2, 1, 0)].astype(np.float32) -img_data -= np.array([123, 117, 104]) -img_data = np.transpose(np.array(img_data), (2, 0, 1)) -img_data = np.expand_dims(img_data, axis=0) -# Build TVM runtime -m = graph_runtime.create(graph, lib, ctx) -m.set_input('data', tvm.nd.array(img_data.astype(dtype))) -m.set_input(**params) -# execute -m.run() -# get outputs -tvm_output = m.get_output(0) - +def run(graph, lib, params, ctx): + # Build TVM runtime + m = graph_runtime.create(graph, lib, ctx) + tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx) + m.set_input('data', tvm_input) + m.set_input(**params) + # execute + m.run() + # get outputs + class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2) + return class_IDs, scores, bounding_boxs + +for target, ctx in target_list: + if target == "cuda": + print("GPU not supported yet, skip.") + continue + for frontend in frontend_list: + graph, lib, params = compile(frontend, target) + class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) ###################################################################### # Display result -class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", - "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", - "sheep", "sofa", "train", "tvmonitor"] -def display(img, out, thresh=0.5): - import random - import matplotlib as mpl - import matplotlib.pyplot as plt - mpl.rcParams['figure.figsize'] = (10, 10) - pens = dict() - plt.clf() - plt.imshow(img) - for det in out: - cid = int(det[0]) - if cid < 0: - continue - score = det[1] - if score < thresh: - continue - if cid not in pens: - pens[cid] = (random.random(), random.random(), random.random()) - scales = [img.shape[1], img.shape[0]] * 2 - xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)] - rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, - 
edgecolor=pens[cid], linewidth=3) - plt.gca().add_patch(rect) - text = class_names[cid] - plt.gca().text(xmin, ymin-2, '{:s} {:.3f}'.format(text, score), - bbox=dict(facecolor=pens[cid], alpha=0.5), - fontsize=12, color='white') - plt.show() - -image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) -display(image, tvm_output.asnumpy()[0], thresh=0.45) \ No newline at end of file +ax = utils.viz.plot_bbox(img, bounding_boxs.asnumpy()[0], scores.asnumpy()[0], + class_IDs.asnumpy()[0], class_names=block.classes) +plt.show() diff --git a/tutorials/nnvm/deploy_ssd_mxnet.py b/tutorials/nnvm/deploy_ssd_mxnet.py new file mode 100644 index 000000000000..1a71c96eaa0c --- /dev/null +++ b/tutorials/nnvm/deploy_ssd_mxnet.py @@ -0,0 +1,168 @@ +""" +Deploy Single Shot Multibox Detector(SSD) model +=============================================== +**Author**: `Yao Wang `_, \ +`Leyuan Wang `_ + +This article is an introductory tutorial to deploy SSD models with TVM. +We will use mxnet pretrained SSD model with Resnet50 as body network and +convert it to NNVM graph; +""" +import os +import zipfile +import tvm +import mxnet as mx +import cv2 +import numpy as np + +from nnvm import compiler +from nnvm.frontend import from_mxnet +from tvm import relay +from tvm.contrib.download import download +from tvm.contrib import graph_runtime +from mxnet.model import load_checkpoint + + +###################################################################### +# Preliminary and Set parameters +# ------------------------------ +# We should build TVM with sort support, in TVM root directory +# +# .. code-block:: bash +# +# echo "set(USE_SORT ON)" > config.mk +# make -j8 +# + +model_name = "ssd_resnet50_512" +model_file = "%s.zip" % model_name +test_image = "dog.jpg" +dshape = (1, 3, 512, 512) +dtype = "float32" + +# Target settings +# Use these commented settings to build for cuda. +#target = 'cuda' +#ctx = tvm.gpu(0) +# Use these commented settings to build for opencl. +#target = 'opencl' +#ctx = tvm.opencl(0) +target = "llvm" +ctx = tvm.cpu() + +###################################################################### +# Download MXNet SSD pre-trained model and demo image +# --------------------------------------------------- +# Pre-trained model available at +# https://github.com/apache/incubator-\mxnet/tree/master/example/ssd + +model_url = "https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/" \ + "resnet50_ssd_512_voc0712_trainval.zip" +image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \ + "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg" +inference_symbol_folder = \ + "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" +inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \ + "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip" + +dir = "ssd_model" +if not os.path.exists(dir): + os.makedirs(dir) +model_file_path = "%s/%s" % (dir, model_file) +test_image_path = "%s/%s" % (dir, test_image) +inference_symbol_path = "%s/inference_model.zip" % dir +download(model_url, model_file_path) +download(image_url, test_image_path) +download(inference_symbol_url, inference_symbol_path) + +zip_ref = zipfile.ZipFile(model_file_path, 'r') +zip_ref.extractall(dir) +zip_ref.close() +zip_ref = zipfile.ZipFile(inference_symbol_path) +zip_ref.extractall(dir) +zip_ref.close() + +###################################################################### +# Convert and compile model with NNVM or Relay for CPU. 
+ +sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder)) +_, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0) + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument( + "-f", "--frontend", + help="Frontend for compilation, nnvm or relay", + type=str, + default="nnvm") +args = parser.parse_args() +if args.frontend == "relay": + net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params, \ + aux_params=aux_params) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) +elif args.frontend == "nnvm": + net, params = from_mxnet(sym, arg_params, aux_params) + with compiler.build_config(opt_level=3): + graph, lib, params = compiler.build( + net, target, {"data": dshape}, params=params) +else: + parser.print_help() + parser.exit() + +###################################################################### +# Create TVM runtime and do inference + +# Preprocess image +image = cv2.imread(test_image_path) +img_data = cv2.resize(image, (dshape[2], dshape[3])) +img_data = img_data[:, :, (2, 1, 0)].astype(np.float32) +img_data -= np.array([123, 117, 104]) +img_data = np.transpose(np.array(img_data), (2, 0, 1)) +img_data = np.expand_dims(img_data, axis=0) +# Build TVM runtime +m = graph_runtime.create(graph, lib, ctx) +m.set_input('data', tvm.nd.array(img_data.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0) + + +###################################################################### +# Display result + +class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"] +def display(img, out, thresh=0.5): + import random + import matplotlib as mpl + import matplotlib.pyplot as plt + mpl.rcParams['figure.figsize'] = (10, 10) + pens = dict() + plt.clf() + plt.imshow(img) + for det in out: + cid = int(det[0]) + if cid < 0: + continue + score = det[1] + if score < thresh: + continue + if cid not in pens: + pens[cid] = (random.random(), random.random(), random.random()) + scales = [img.shape[1], img.shape[0]] * 2 + xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)] + rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, + edgecolor=pens[cid], linewidth=3) + plt.gca().add_patch(rect) + text = class_names[cid] + plt.gca().text(xmin, ymin-2, '{:s} {:.3f}'.format(text, score), + bbox=dict(facecolor=pens[cid], alpha=0.5), + fontsize=12, color='white') + plt.show() + +image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) +display(image, tvm_output.asnumpy()[0], thresh=0.45) From 2f8aef8653e3afec343dbf729b579ea8fbc6f04f Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 19 Feb 2019 13:01:45 +0800 Subject: [PATCH 28/43] Refactor nms --- topi/python/topi/cuda/nms.py | 9 ++++-- topi/python/topi/vision/nms.py | 39 ++++++++++++++++--------- topi/python/topi/vision/ssd/multibox.py | 2 +- topi/tests/python/test_topi_vision.py | 15 ++++++++-- 4 files changed, 46 insertions(+), 19 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 900a8e856ddb..26dc5704d9d9 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -182,8 +182,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): @nms.register(["cuda", "gpu"]) -def nms_gpu(data, valid_count, iou_threshold=0.5, 
force_suppress=False, - topk=-1, id_index=0, do_rearrange=False): +def nms_gpu(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, + topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -196,6 +196,9 @@ def nms_gpu(data, valid_count, iou_threshold=0.5, force_suppress=False, valid_count : tvm.Tensor 1-D tensor for valid number of boxes. + return_indices : boolean + Whether to return box indices in input data. + iou_threshold : optional, float Non-maximum suppression threshold. @@ -208,7 +211,7 @@ def nms_gpu(data, valid_count, iou_threshold=0.5, force_suppress=False, id_index : optional, int index of the class categories, -1 to disable. - do_rearrange : optional, boolean + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 66a4b0df9c30..9c8fae23fa93 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -143,16 +143,21 @@ def hybrid_nms(data, sorted_index, valid_count, Returns ------- - valid_count : tvm.Tensor or numpy NDArray - 1-D tensor for valid number of boxes. + output : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + + box_indices: tvm.Tensor + 2-D tensor with shape [batch_size, num_anchors]. """ batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] + box_indices = output_tensor((batch_size, num_anchors), "int32") output = output_tensor((batch_size, num_anchors, box_data_length,), data.dtype) + for i in parallel(batch_size): if iou_threshold > 0: if valid_count[i] > 0: @@ -163,10 +168,12 @@ def hybrid_nms(data, sorted_index, valid_count, for j in range(nkeep): for k in range(box_data_length): output[i, j, k] = data[i, sorted_index[i, j], k] + box_indices[i, j] = sorted_index[i, j] if 0 < topk < valid_count[i]: for j in range(valid_count[i] - nkeep): for k in range(box_data_length): output[i, j + nkeep, k] = -1.0 + box_indices[i, j + nkeep] = -1 # Apply nms for j in range(valid_count[i]): if output[i, j, 0] >= 0: @@ -197,20 +204,23 @@ def hybrid_nms(data, sorted_index, valid_count, iou = 0.0 if u <= 0.0 else area / u if iou >= iou_threshold: output[i, k, 0] = -1.0 + box_indices[i, k] = -1 else: for j in range(valid_count[i]): for k in range(box_data_length): output[i, j, k] = data[i, j, k] + box_indices[i, j] = j # Set invalid entry to be -1 for j in range(num_anchors - valid_count[i]): for k in range(box_data_length): output[i, j + valid_count[i], k] = -1.0 - return output + box_indices[i, j + valid_count[i]] = -1 + return output, box_indices @tvm.target.generic_func -def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, - topk=-1, id_index=0, do_rearrange=False): +def nms(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, + topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -223,6 +233,9 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, valid_count : tvm.Tensor 1-D tensor for valid number of boxes. + return_indices : boolean + Whether to return box indices in input data. + iou_threshold : optional, float Non-maximum suppression threshold. @@ -235,7 +248,7 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, id_index : optional, int index of the class categories, -1 to disable. 
- do_rearrange : optional, boolean + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns @@ -288,12 +301,12 @@ def nms(data, valid_count, iou_threshold=0.5, force_suppress=False, in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") - out = hybrid_nms(data, sort_tensor, valid_count, - tvm.const(iou_threshold, dtype="float32"), - tvm.const(force_suppress, dtype="bool"), - tvm.const(topk, dtype="int32"), - tvm.const(id_index, dtype="int32")) - if do_rearrange: + out, box_indices = hybrid_nms(data, sort_tensor, valid_count, + tvm.const(iou_threshold, dtype="float32"), + tvm.const(force_suppress, dtype="bool"), + tvm.const(topk, dtype="int32"), + tvm.const(id_index, dtype="int32")) + if not return_indices and invalid_to_bottom: out = hybrid_rearrange_out(out) - return out + return box_indices if return_indices else out diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index f20a286960cc..c663d3873587 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -292,5 +292,5 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = nms(inter_out[0], inter_out[1], False, nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index d7dbab0bca4e..1f50199fcfba 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -63,6 +63,7 @@ def test_get_valid_counts(): def test_nms(): dshape = (1, 5, 6) + indices_dshape = (1, 5) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") nms_threshold = 0.7 @@ -76,6 +77,7 @@ def test_nms(): np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) def check_device(device): ctx = tvm.context(device, 0) @@ -85,18 +87,27 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = nms(data, valid_count, False, nms_threshold, force_suppress, nms_topk) + indices_out = nms(data, valid_count, True, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.nms(data, valid_count, False, nms_threshold, force_suppress, nms_topk) + indices_out = topi.cuda.nms(data, valid_count, True, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) + indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f = tvm.build(s, [data, valid_count, out], device) f(tvm_data, tvm_valid_count, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) + tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) + f = tvm.build(indices_s, [data, valid_count, indices_out], device) + f(tvm_data, tvm_valid_count, tvm_indices_out) + 
tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + for device in ['llvm']: check_device(device) From d30be86f743208839ef4bcd1918bce9198a0e6bb Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 19 Feb 2019 14:11:43 +0800 Subject: [PATCH 29/43] Rollback nnvm --- nnvm/include/nnvm/top/nn.h | 33 ++--- nnvm/include/nnvm/top/tensor.h | 15 -- nnvm/python/nnvm/frontend/mxnet.py | 77 ++-------- nnvm/python/nnvm/top/transform.py | 4 - nnvm/python/nnvm/top/vision.py | 35 ++--- nnvm/src/top/tensor/elemwise.cc | 102 ------------- nnvm/src/top/tensor/transform.cc | 85 ----------- nnvm/src/top/vision/nms.cc | 66 ++------- nnvm/tests/python/compiler/test_top_level4.py | 136 +++++------------- .../python/frontend/mxnet/test_forward.py | 1 + topi/python/topi/cuda/nms.py | 4 +- topi/python/topi/cuda/ssd/multibox.py | 4 +- topi/python/topi/vision/nms.py | 4 +- topi/python/topi/vision/ssd/multibox.py | 5 +- topi/tests/python/test_topi_vision.py | 14 +- 15 files changed, 89 insertions(+), 496 deletions(-) diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 82f3230b4931..69d81a98cb4c 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -402,9 +402,9 @@ struct LayoutTransformParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(LayoutTransformParam) { DMLC_DECLARE_FIELD(src_layout).set_default("__undef__") - .describe("Dimension ordering of data"); + .describe("Dimension ordering of data"); DMLC_DECLARE_FIELD(dst_layout).set_default("__undef__") - .describe("Dimension ordering of data."); + .describe("Dimension ordering of data."); } }; @@ -419,13 +419,13 @@ struct MultiBoxPriorParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(sizes).set_default(Tuple({1.0})) .describe("List of sizes of generated MultiBoxPriores."); DMLC_DECLARE_FIELD(ratios).set_default(Tuple({1.0})) - .describe("List of aspect ratios of generated MultiBoxPriores."); + .describe("List of aspect ratios of generated MultiBoxPriores."); DMLC_DECLARE_FIELD(steps).set_default(Tuple({-1.0, -1.0})) - .describe("Priorbox step across y and x, -1 for auto calculation."); + .describe("Priorbox step across y and x, -1 for auto calculation."); DMLC_DECLARE_FIELD(offsets).set_default(Tuple({0.5, 0.5})) - .describe("Priorbox center offsets, y and x respectively."); + .describe("Priorbox center offsets, y and x respectively."); DMLC_DECLARE_FIELD(clip).set_default(false) - .describe("Whether to clip out-of-boundary boxes."); + .describe("Whether to clip out-of-boundary boxes."); } }; @@ -443,31 +443,20 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { - float score_threshold; - DMLC_DECLARE_PARAMETER(GetValidCountsParam) { - DMLC_DECLARE_FIELD(score_threshold).set_default(0.0) - .describe("Lower limit of score for valid bounding boxes."); - } -}; - struct NMSParam : public dmlc::Parameter { + bool return_indices; float iou_threshold; bool force_suppress; int topk; - int id_index; - bool do_rearrange; DMLC_DECLARE_PARAMETER(NMSParam) { + DMLC_DECLARE_FIELD(return_indices) + .describe("Whether to return box indices in input data."); DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); + .describe("Suppress all detections regardless of class_id."); DMLC_DECLARE_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); - DMLC_DECLARE_FIELD(id_index).set_default(0) - 
.describe("Axis index for id."); - DMLC_DECLARE_FIELD(do_rearrange).set_default(false) - .describe("Whether to move all valid bounding boxes to the top."); + .describe("Keep maximum top k detections before nms, -1 for no limit."); } }; diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h index dc3c23a6198b..bed1b05984da 100644 --- a/nnvm/include/nnvm/top/tensor.h +++ b/nnvm/include/nnvm/top/tensor.h @@ -74,21 +74,6 @@ struct StridedSliceParam : public dmlc::Parameter { } }; -struct SliceAxisParam : public dmlc::Parameter { - int axis; - int begin; - int end; - - DMLC_DECLARE_PARAMETER(SliceAxisParam) { - DMLC_DECLARE_FIELD(axis) - .describe("Axis along which to be sliced."); - DMLC_DECLARE_FIELD(begin) - .describe("Index for begin of slice"); - DMLC_DECLARE_FIELD(end).set_default(0) - .describe("Index for end of the slice"); - } -}; - enum TypeFlag { kFloat32 = 0, kFloat64 = 1, diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index dffc8d960c88..2df67d9967ca 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -238,18 +238,18 @@ def _clip(inputs, attrs): def _contrib_multibox_detection(inputs, attrs): clip = _parse_bool_str(attrs, 'clip', default='True') - threshold = attrs.get('threshold', 0.01) - iou_threshold = attrs.get('nms_threshold', 0.5) + threshold = attrs.get('threshold') or 0.01 + nms_threshold = attrs.get('nms_threshold') or 0.5 force_suppress = _parse_bool_str(attrs, 'force_suppress', default='False') variances = tuple([float(x.strip()) for x in attrs.get('variances').strip('()').split(',')]) \ if attrs.get('variances') is not None else (0.1, 0.1, 0.2, 0.2) - topk = attrs.get('nms_topk', -1) + nms_topk = attrs.get('nms_topk') or -1 new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} - new_attrs1 = {'iou_threshold': float(iou_threshold), 'force_suppress': force_suppress, - 'topk': int(topk)} + new_attrs1 = {'return_indices': False, 'iou_threshold': float(nms_threshold), + 'force_suppress': force_suppress, 'topk': int(nms_topk)} data, valid_count = _get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _get_nnvm_op('nms')(data, valid_count, **new_attrs1) + return _get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) def _elemwise_sum(inputs, _): new_attrs = {'num_args':len(inputs)} @@ -314,57 +314,6 @@ def _argmin(inputs, attrs): new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False") return _get_nnvm_op(op_name)(*inputs, **new_attrs) -def _contrib_box_nms(inputs, attrs): - force_suppress = _parse_bool_str(attrs, 'force_suppress', default="False") - overlap_thresh = attrs.get('overlap_thresh', 0.5) - topk = attrs.get('topk', -1) - valid_thresh = attrs.get('valid_thresh', 0) - coord_start = attrs.get('coord_start', 2) - score_index = attrs.get('score_index', 1) - id_index = attrs.get('id_index', -1) - in_format = attrs.get('in_format', 'corner') - out_format = attrs.get('out_format', 'corner') - if int(coord_start) != 2: - _raise_not_supported('coord_start: %s' % coord_start, 'box_nms') - if int(score_index) != 1: - _raise_not_supported('score_index: %s' % score_index, 'box_nms') - if int(id_index) != -1 and int(id_index) != 0: - _raise_not_supported('id_index: %s' % id_index, 'box_nms') - if in_format != 'corner': - _raise_not_supported('in_format: %s' % in_format, 'box_nms') - if out_format != 'corner': - _raise_not_supported('out_format: %s' % out_format, 'box_nms') - - 
valid_counts, inter_out = \ - _get_nnvm_op('get_valid_counts')(inputs[0], score_threshold=valid_thresh) - nms_out = _get_nnvm_op('nms')(inter_out, valid_counts, - iou_threshold=overlap_thresh, - force_suppress=force_suppress, - topk=topk, id_index=id_index, - do_rearrange=True) - return nms_out - -def _slice_like(inputs, attrs): - op_name = 'slice_like' - axis = attrs.get('axes', ()) - return _get_nnvm_op(op_name)(inputs[0], inputs[1], axis=axis) - -def _slice_axis(inputs, attrs): - op_name, new_attrs = 'slice_axis', {} - new_attrs['axis'] = attrs.get('axis') - new_attrs['begin'] = attrs.get('begin') - new_attrs['end'] = 0 if attrs.get('end') == "None" else attrs.get('end') - return _get_nnvm_op(op_name)(inputs[0], **new_attrs) - -def _l2_normalize(inputs, attrs): - op_name, new_attrs = 'l2_normalize', {} - mode = attrs.get('mode', 'instance') - if mode != 'channel': - _raise_not_supported('mode: %s' % mode, 'L2Normalization') - new_attrs['eps'] = attrs.get('eps', 1e-10) - new_attrs['axis'] = 1 - return _get_nnvm_op(op_name)(inputs[0], **new_attrs) - _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__', '__div_symbol__', '__mul_scalar__', '__mul_symbol__', '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__', @@ -373,9 +322,9 @@ def _l2_normalize(inputs, attrs): 'broadcast_sub', 'broadcast_to', 'cast', 'elemwise_add', 'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp', 'flatten', 'log', 'log_softmax', 'max', 'min', 'negative', - 'ones_like', 'relu', 'sigmoid', 'softmax', + 'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd', - 'reshape_like', 'where'] + 'reshape_like'] _convert_map = { '_copy' : _rename('copy'), @@ -385,13 +334,6 @@ def _l2_normalize(inputs, attrs): '_plus_scalar' : _rename('__add_scalar__'), '_rdiv_scalar' : _rename('__rdiv_scalar__'), '_rminus_scalar': _rename('__rsub_scalar__'), - '_equal_scalar' : _rename('__equal_scalar__'), - '_not_equal_scalar': _rename('__not_equal_scalar__'), - '_greater_scalar': _rename('__greater_scalar__'), - '_greater_equal_scalar': _rename('__greater_equal_scalar__'), - '_less_scalar': _rename('__less_scalar__'), - '_less_equal_scalar': _rename('__less_equal_scalar__'), - '_contrib_box_nms' : _contrib_box_nms, '_contrib_MultiBoxPrior' : _rename('multibox_prior'), '_contrib_MultiBoxDetection' : _contrib_multibox_detection, '_minimum' : _minimum, @@ -413,14 +355,11 @@ def _l2_normalize(inputs, attrs): 'Flatten' : _rename('flatten'), 'FullyConnected': _dense, 'LeakyReLU' : _leaky_relu, - 'L2Normalization' : _l2_normalize, 'Pooling' : _pooling, 'Pooling_v1' : _pooling, 'Reshape' : _reshape, 'slice' : _slice, 'SliceChannel' : _split, - 'slice_axis' : _slice_axis, - 'slice_like' : _slice_like, 'split' : _split, 'Softmax' : _rename('softmax'), 'SoftmaxActivation' : _softmax_activation, diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py index d6c85ea283d3..8fde9632a8af 100644 --- a/nnvm/python/nnvm/top/transform.py +++ b/nnvm/python/nnvm/top/transform.py @@ -83,10 +83,6 @@ def schedule_concatenate(_, outs, target): reg.register_pattern("slice_like", OpPattern.INJECTIVE) reg.register_schedule("slice_like", _fschedule_injective) -# slice_axis -reg.register_pattern("slice_axis", OpPattern.INJECTIVE) -reg.register_schedule("slice_axis", _fschedule_injective) - # where reg.register_pattern("where", OpPattern.INJECTIVE) reg.register_schedule("where", _fschedule_injective) diff --git a/nnvm/python/nnvm/top/vision.py 
b/nnvm/python/nnvm/top/vision.py index cb69b897a7be..c369fee5f6e7 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -58,41 +58,24 @@ def compute_multibox_transform_loc(attrs, inputs, _): return topi.vision.ssd.multibox_transform_loc(inputs[0], inputs[1], inputs[2], clip, threshold, variance) -reg.register_pattern("multibox_transform_loc", OpPattern.OPAQUE) - -# Get valid number of anchor boxes -@reg.register_schedule("get_valid_counts") -def schedule_get_valid_counts(_, outs, target): - """Schedule definition of get_valid_counts""" - with tvm.target.create(target): - return topi.generic.schedule_get_valid_counts(outs) - -@reg.register_compute("get_valid_counts") -def compute_get_valid_counts(attrs, inputs, _): - """Compute definition of get_valid_counts""" - score_threshold = attrs.get_float("score_threshold") - return topi.vision.get_valid_counts(inputs[0], score_threshold) - -reg.register_pattern("get_valid_counts", OpPattern.OPAQUE) +reg.register_pattern("multibox_detection", OpPattern.OPAQUE) # non-maximum suppression -@reg.register_schedule("nms") +@reg.register_schedule("non_max_suppression") def schedule_nms(_, outs, target): - """Schedule definition of nms""" + """Schedule definition of non_max_suppression""" with tvm.target.create(target): return topi.generic.schedule_nms(outs) -@reg.register_compute("nms") +@reg.register_compute("non_max_suppression") def compute_nms(attrs, inputs, _): - """Compute definition of nms""" + """Compute definition of non_max_suppression""" + return_indices = attrs.get_bool('return_indices') iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') topk = attrs.get_int('topk') - id_index = attrs.get_int('id_index') - do_rearrange = attrs.get_bool('do_rearrange') - return topi.vision.nms(inputs[0], inputs[1], iou_threshold, - force_suppress, topk, id_index, - do_rearrange) + return topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, + iou_threshold, force_suppress, topk) -reg.register_pattern("nms", OpPattern.OPAQUE) +reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc index 5a39f3ecc392..2d9813e22131 100644 --- a/nnvm/src/top/tensor/elemwise.cc +++ b/nnvm/src/top/tensor/elemwise.cc @@ -806,108 +806,6 @@ NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rpow_scalar__) }; }); -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__equal_scalar__) -.describe(R"code(Tensor equal scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x == y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__not_equal_scalar__) -.describe(R"code(Tensor not equal scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x != y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_scalar__) -.describe(R"code(Tensor greater scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const 
Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x > y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__greater_equal_scalar__) -.describe(R"code(Tensor greater equal scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x >= y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_scalar__) -.describe(R"code(Tensor less scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x < y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - -NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__less_equal_scalar__) -.describe(R"code(Tensor less equal scalar - -)code" NNVM_ADD_FILELINE) -.set_support_level(3) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - Tensor out = topi::cast( - binary_scalar_op(attrs, inputs[0], - [](Expr x, Expr y) { return x <= y; }), - out_info[0]->dtype); - return Array{ out }; -}) -.set_attr("FGradient", MakeZeroGradNodes); - DMLC_REGISTER_PARAMETER(ElementWiseReduceParam); NNVM_REGISTER_ELEMWISE_REDUCE_OP(elemwise_sum) diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 4f09062ac607..9d259ae77d9b 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -1283,91 +1283,6 @@ NNVM_REGISTER_OP(slice_like) }) .set_support_level(4); -// SliceAxis -DMLC_REGISTER_PARAMETER(SliceAxisParam); - -inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - const SliceAxisParam& param = nnvm::get(attrs.parsed); - const TShape& src_shape = in_attrs->at(0); - int axis = param.axis; - int begin = param.begin; - int end = param.end; - - if (axis < 0) { - axis += src_shape.ndim(); - } - if (begin < 0) { - begin += src_shape[axis]; - } - if (end <= 0) { - end += src_shape[axis]; - } - CHECK_LT(begin, end) - << "Begin index must be smaller than end index: " - << begin << " vs " << end; - - TShape out_shape(src_shape); - out_shape[axis] = end - begin; - NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, out_shape); - return true; -} - -NNVM_REGISTER_OP(slice_axis) -.describe(R"code(Slices along a given axis. -Returns an array slice along a given axis starting from -the begin index to the end index. 
-)code" NNVM_ADD_FILELINE) -.add_argument("data", "Tensor", "Input data to be sliced.") -.set_num_outputs(1) -.set_num_inputs(1) -.add_arguments(SliceAxisParam::__FIELDS__()) -.set_attr_parser(ParamParser) -.set_attr("FGetAttrDict", ParamGetAttrDict) -.set_attr("FInferShape", SliceAxisShape) -.set_attr("FInferType", ElemwiseType<1, 1>) -.set_attr("FCorrectLayout", ElemwiseArbitraryLayout<1, 1>) -.set_attr( - "FTVMCompute", [](const NodeAttrs& attrs, - const Array& inputs, - const Array& out_info) { - const SliceAxisParam& param = nnvm::get(attrs.parsed); - const Array src_shape = inputs[0]->shape; - Array begin_idx, end_idx, strides; - int axis = param.axis; - int begin = param.begin; - int end = param.end; - - if (axis < 0) { - axis += src_shape.size(); - } - if (begin < 0) { - begin += topi::GetConstInt(src_shape[axis]); - } - if (end <= 0) { - end += topi::GetConstInt(src_shape[axis]); - } - for (size_t i = 0; i < src_shape.size(); ++i) { - begin_idx.push_back(make_const(tvm::Int(32), 0)); - strides.push_back(make_const(tvm::Int(32), 1)); - } - end_idx = Array(src_shape); - begin_idx.Set(axis, make_const(tvm::Int(32), begin)); - end_idx.Set(axis, make_const(tvm::Int(32), end)); - - return Array{ - topi::strided_slice(inputs[0], - GetIntArray(begin_idx), - GetIntArray(end_idx), - GetIntArray(strides)) - }; -}) -.set_attr("FListInputNames", [](const NodeAttrs& attrs) { - return std::vector{"data"}; -}) -.set_support_level(4); - // where inline bool WhereShape(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc index 71b4c3ff7860..315e06e22ee5 100644 --- a/nnvm/src/top/vision/nms.cc +++ b/nnvm/src/top/vision/nms.cc @@ -11,6 +11,7 @@ #include #include #include "../op_common.h" +#include "../elemwise_op_common.h" namespace nnvm { namespace top { @@ -18,64 +19,12 @@ using compiler::FTVMCompute; using tvm::Tensor; using tvm::Array; -DMLC_REGISTER_PARAMETER(GetValidCountsParam); - -bool GetValidCountsShape(const NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - TShape dshape = in_attrs->at(0); - TShape vshape = TShape({dshape[0]}); - CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3-D."; - out_attrs->clear(); - NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, vshape); - NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 1, dshape); - return true; -} - -inline bool GetValidCountsInferType(const NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - DTYPE_ASSIGN(out_attrs->at(0), static_cast(kInt32)); - DTYPE_ASSIGN(out_attrs->at(1), in_attrs->at(0)) - return true; -} - -inline bool GetValidCountsInferLayout(const NodeAttrs& attrs, - std::vector *ilayouts, - const std::vector *last_ilayouts, - std::vector *olayouts) { - static const Layout kNCHW("NCHW"); - CHECK_EQ(ilayouts->size(), 1U); - CHECK_EQ(olayouts->size(), 2U); - NNVM_ASSIGN_LAYOUT(*ilayouts, 0, kNCHW); - return true; -} - -NNVM_REGISTER_OP(get_valid_counts) -.describe(R"doc("Get valid count of bounding boxes given -a score threshold. Also moves valid boxes to the top of -input data." 
-)doc" NNVM_ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FGetAttrDict", - ParamGetAttrDict) -.add_arguments(GetValidCountsParam::__FIELDS__()) -.add_argument("data", "Tensor", "Input data.") -.set_attr("FListInputNames", [](const NodeAttrs& attrs) { - return std::vector{"data"}; -}) -.set_attr("FInferShape", GetValidCountsShape) -.set_attr("FInferType", GetValidCountsInferType) -.set_attr("FCorrectLayout", GetValidCountsInferLayout) -.set_support_level(4); - DMLC_REGISTER_PARAMETER(NMSParam); bool NMSShape(const NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { + const NMSParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 2U) << "Inputs: [data, valid_count]"; TShape dshape = in_attrs->at(0); TShape vshape = in_attrs->at(1); @@ -85,7 +34,14 @@ bool NMSShape(const NodeAttrs& attrs, "(batch_size, num_anchors, 6)."; CHECK_EQ(dshape[0], vshape[0]) << "batch_size mismatch."; out_attrs->clear(); - NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape); + if (param.return_indices) { + TShape oshape = TShape(2); + oshape[0] = dshape[0]; + oshape[1] = dshape[1]; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape); + } else { + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape); + } return true; } @@ -108,7 +64,7 @@ inline bool NMSInferLayout(const NodeAttrs& attrs, return true; } -NNVM_REGISTER_OP(nms) +NNVM_REGISTER_OP(non_max_suppression) .describe(R"doc("Non-maximum suppression." )doc" NNVM_ADD_FILELINE) .set_num_inputs(2) diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index f3c297e5f6e9..991b9c2b15be 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -3,7 +3,6 @@ import tvm from tvm.contrib import graph_runtime import topi -import topi.testing import nnvm.symbol as sym import nnvm.compiler from nnvm.testing.config import ctx_list @@ -528,15 +527,14 @@ def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), if clip: np_out = np.clip(np_out, 0, 1) - for target, ctx in ctx_list(): - if target == "cuda": - continue - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) - m.run() - out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) - tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) + target = "llvm" + ctx = tvm.cpu() + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) + m.run() + out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) def test_multibox_prior(): verify_multibox_prior((1, 3, 50, 50)) @@ -552,7 +550,7 @@ def test_multibox_transform_loc(): anchors = sym.Variable("anchors") transform_loc_data, valid_count = sym.multibox_transform_loc(cls_prob=cls_prob, loc_pred=loc_preds, anchor=anchors) - out = sym.nms(data=transform_loc_data, valid_count=valid_count) + out = sym.non_max_suppression(data=transform_loc_data, valid_count=valid_count, return_indices=False) # Manually create test case np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) @@ -563,70 +561,27 @@ def test_multibox_transform_loc(): [0, 0.44999999, 1, 1, 1, 1], [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]]) + target 
= "llvm" dtype = "float32" - for target, ctx in ctx_list(): - if target == "cuda": - continue - graph, lib, _ = nnvm.compiler.build(out, target, {"cls_prob": (batch_size, num_anchors, num_classes), - "loc_preds": (batch_size, num_anchors * 4), - "anchors": (1, num_anchors, 4)}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)}) - m.run() - out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) - tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) - -def verify_get_valid_counts(dshape, score_threshold): - dtype = "float32" - batch_size, num_anchor, elem_length = dshape - np_data = np.random.uniform(size=dshape).astype(dtype) - np_out1 = np.zeros(shape=(batch_size,)) - np_out2 = np.zeros(shape=dshape).astype(dtype) - for i in range(batch_size): - np_out1[i] = 0 - inter_idx = 0 - for j in range(num_anchor): - score = np_data[i, j, 1] - if score >= score_threshold: - for k in range(elem_length): - np_out2[i, inter_idx, k] = np_data[i, j, k] - np_out1[i] += 1 - inter_idx += 1 - if j >= np_out1[i]: - for k in range(elem_length): - np_out2[i, j, k] = -1 - - for target, ctx in ctx_list(): - if target == "cuda": - continue - data = sym.Variable("data", dtype=dtype) - valid_counts, inter_data = sym.get_valid_counts(data, score_threshold=score_threshold) - out = sym.Group([valid_counts, inter_data]) - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input("data", np_data) - m.run() - out1 = m.get_output(0, tvm.nd.empty(np_out1.shape, "int32")) - out2 = m.get_output(1, tvm.nd.empty(dshape, dtype)) - tvm.testing.assert_allclose(out1.asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(out2.asnumpy(), np_out2, rtol=1e-3) - - -def test_get_valid_counts(): - verify_get_valid_counts((1, 2500, 6), 0) - verify_get_valid_counts((1, 2500, 6), -1) - verify_get_valid_counts((3, 1000, 6), 0.55) - verify_get_valid_counts((16, 500, 6), 0.95) + ctx = tvm.cpu() + graph, lib, _ = nnvm.compiler.build(out, target, {"cls_prob": (batch_size, num_anchors, num_classes), + "loc_preds": (batch_size, num_anchors * 4), + "anchors": (1, num_anchors, 4)}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)}) + m.run() + out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) -def test_nms(): +def test_non_max_suppression(): dshape = (1, 5, 6) data = sym.Variable("data") valid_count = sym.Variable("valid_count", dtype="int32") iou_threshold = 0.7 force_suppress = True topk = 2 - out = sym.nms(data=data, valid_count=valid_count, iou_threshold=iou_threshold, - force_suppress=force_suppress, topk=topk) + out = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False, + iou_threshold=iou_threshold, force_suppress=force_suppress, topk=topk) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -636,16 +591,15 @@ def test_nms(): [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - for target, ctx in ctx_list(): - if target == "cuda": - continue - graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape, "valid_count": (dshape[0],)}, - dtype={"data": 
"float32", "valid_count": "int32"}) - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**{"data": np_data, "valid_count": np_valid_count}) - m.run() - out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32")) - tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) + target = "llvm" + ctx = tvm.cpu() + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape, "valid_count": (dshape[0],)}, + dtype={"data": "float32", "valid_count": "int32"}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"data": np_data, "valid_count": np_valid_count}) + m.run() + out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32")) + tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) def np_slice_like(np_data, np_shape_like, axis=[]): begin_idx = [0 for _ in np_data.shape] @@ -662,7 +616,7 @@ def np_slice_like(np_data, np_shape_like, axis=[]): slice_idx = [] for b, e in zip(begin_idx, end_idx): slice_idx.append(slice(b, e)) - np_result = np_data[tuple(slice_idx)] + np_result = np_data[slice_idx] return np_result def verify_slice_like(np_data, np_shape_like, axis=[]): @@ -702,27 +656,6 @@ def test_slice_like(): axis = (2, 3) verify_slice_like(np_data, np_shape_like, axis) -def verify_slice_axis(dshape, axis, begin, end): - data = sym.Variable("data") - net = sym.slice_axis(data, axis=axis, begin=begin, end=end) - np_data = np.random.uniform(size=dshape) - np_out = topi.testing.slice_axis_python(np_data, axis, begin, end) - - dtype = "float32" - for target, ctx in ctx_list(): - graph, lib, _ = nnvm.compiler.build(net, target, {"data": dshape}, dtype=dtype) - m = graph_runtime.create(graph, lib, ctx) - m.set_input("data", np_data) - m.run() - out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) - tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) - -def test_slice_axis(): - verify_slice_axis((1, 2, 3, 4), 3, 0, 2) - verify_slice_axis((100, 50), -1, 1, -1) - verify_slice_axis((20,), -1, -9, -3) - verify_slice_axis((20, 30, 40), 1, 5, 0) - def verify_where(condition, x, y): dtype = "float32" if len(condition.shape) == 1: @@ -777,7 +710,6 @@ def test_argmax(): np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5) if __name__ == "__main__": - test_get_valid_counts() test_reshape() test_broadcast() test_reduce() @@ -794,10 +726,8 @@ def test_argmax(): test_flip() test_multibox_prior() test_multibox_transform_loc() - test_get_valid_counts() - test_nms() + test_non_max_suppression() test_slice_like() - test_slice_axis() test_where() test_argmax() print(nnvm.compiler.engine.dump()) diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 67f1ad5ff27d..8992799528e7 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -300,6 +300,7 @@ def test_forward_l2_normalize(): mx_sym = mx.sym.L2Normalization(data, mode="channel") verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 26dc5704d9d9..5f79de25e835 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -4,7 +4,7 @@ import tvm from tvm import api -from topi.vision import nms +from topi.vision import non_max_suppression from ..util import get_const_tuple def sort_ir(data, index, output): @@ -181,7 +181,7 @@ def calculate_overlap(out_tensor, 
box_a_idx, box_b_idx): return body -@nms.register(["cuda", "gpu"]) +@non_max_suppression.register(["cuda", "gpu"]) def nms_gpu(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 746be092ebbe..11062824deb0 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -11,7 +11,7 @@ from topi.vision.ssd import multibox_prior from topi.vision.ssd import multibox_detection from topi.vision.ssd import multibox_transform_loc -from ..nms import nms +from ..nms import non_max_suppression def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @@ -437,6 +437,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms( + out = non_max_suppression( inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 9c8fae23fa93..60715c91f0dc 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -219,8 +219,8 @@ def hybrid_nms(data, sorted_index, valid_count, @tvm.target.generic_func -def nms(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, - topk=-1, id_index=0, invalid_to_bottom=False): +def non_max_suppression(data, valid_count, return_indices, iou_threshold=0.5, + force_suppress=False, topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index c663d3873587..3c8cc6c07417 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -8,7 +8,7 @@ import topi -from ..nms import nms +from ..nms import non_max_suppression @hybrid.script def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): @@ -292,5 +292,6 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], False, nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], False, nms_threshold, + force_suppress, nms_topk) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 1f50199fcfba..489f2abb92fd 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -8,7 +8,7 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from topi.vision import ssd, nms, get_valid_counts +from topi.vision import ssd, non_max_suppression, get_valid_counts def verify_get_valid_counts(dshape, score_threshold): @@ -61,7 +61,7 @@ def test_get_valid_counts(): verify_get_valid_counts((16, 500, 6), 0.95) -def test_nms(): +def test_non_max_suppression(): dshape = (1, 5, 6) indices_dshape = (1, 5) data = tvm.placeholder(dshape, name="data") @@ -87,11 +87,11 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = nms(data, valid_count, False, nms_threshold, force_suppress, nms_topk) - indices_out = nms(data, valid_count, True, 
nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(data, valid_count, False, nms_threshold, force_suppress, nms_topk) + indices_out = non_max_suppression(data, valid_count, True, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.nms(data, valid_count, False, nms_threshold, force_suppress, nms_topk) - indices_out = topi.cuda.nms(data, valid_count, True, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.non_max_suppression(data, valid_count, False, nms_threshold, force_suppress, nms_topk) + indices_out = topi.cuda.non_max_suppression(data, valid_count, True, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) indices_s = topi.generic.schedule_nms(indices_out) @@ -336,7 +336,7 @@ def test_proposal(): if __name__ == "__main__": test_get_valid_counts() - test_nms() + test_non_max_suppression() test_multibox_prior() test_multibox_detection() test_roi_align() From 6b1fd7a5e95bb69495ababaaaf2ff23f698fd07e Mon Sep 17 00:00:00 2001 From: Wang Date: Wed, 20 Feb 2019 17:31:00 +0800 Subject: [PATCH 30/43] Refactor relay nms --- include/tvm/relay/attrs/vision.h | 9 +++-- nnvm/include/nnvm/top/nn.h | 10 ++++-- nnvm/python/nnvm/top/vision.py | 5 ++- python/tvm/relay/frontend/mxnet.py | 20 ++++++----- python/tvm/relay/op/_transform.py | 2 +- python/tvm/relay/op/transform.py | 2 +- python/tvm/relay/op/vision/_vision.py | 14 ++++---- python/tvm/relay/op/vision/nms.py | 25 ++++++++------ src/relay/op/tensor/transform.cc | 8 ++--- src/relay/op/vision/nms.cc | 22 ++++++++---- tests/python/relay/test_op_level10.py | 24 +++++++++++++ tests/python/relay/test_op_level4.py | 24 ------------- tests/python/relay/test_op_level5.py | 34 +++++++++++++------ .../{nnvm => relay}/deploy_ssd_gluoncv.py | 28 ++++++--------- 14 files changed, 132 insertions(+), 95 deletions(-) rename tutorials/{nnvm => relay}/deploy_ssd_gluoncv.py (76%) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 345a67655552..12523bd0c8ea 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -70,13 +70,16 @@ struct GetValidCountsAttrs : public tvm::AttrsNode{ /*! 
\brief Attributes used in non_maximum_suppression operator */ struct NMSAttrs : public tvm::AttrsNode{ + bool return_indices; double iou_threshold; bool force_suppress; int topk; int id_index; - bool do_rearrange; + bool invalid_to_bottom; TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { + TVM_ATTR_FIELD(return_indices) + .describe("Whether to return box indices in input data."); TVM_ATTR_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); TVM_ATTR_FIELD(force_suppress).set_default(false) @@ -85,8 +88,8 @@ struct NMSAttrs : public tvm::AttrsNode{ .describe("Keep maximum top k detections before nms, -1 for no limit."); TVM_ATTR_FIELD(id_index).set_default(0) .describe("Axis index of id."); - TVM_ATTR_FIELD(do_rearrange).set_default(false) - .describe("Whether to move all valid bounding boxes to the top."); + TVM_ATTR_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 69d81a98cb4c..6fd283aeb14e 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -448,15 +448,21 @@ struct NMSParam : public dmlc::Parameter { float iou_threshold; bool force_suppress; int topk; + int id_index; + bool invalid_to_bottom; DMLC_DECLARE_PARAMETER(NMSParam) { DMLC_DECLARE_FIELD(return_indices) .describe("Whether to return box indices in input data."); DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); + .describe("Suppress all detections regardless of class_id."); DMLC_DECLARE_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + DMLC_DECLARE_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index c369fee5f6e7..1beb9be2564b 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -74,8 +74,11 @@ def compute_nms(attrs, inputs, _): iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') topk = attrs.get_int('topk') + id_index = attrs.get_int('id_index') + invalid_to_bottom = attrs.get_bool('invalid_to_bottom') return topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, - iou_threshold, force_suppress, topk) + iou_threshold, force_suppress, topk, + id_index, invalid_to_bottom) reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 727fa5828aef..f00a7f551605 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -324,13 +324,14 @@ def _mx_multibox_detection(inputs, attrs): 0.2, 0.2)) new_attrs1 = {} - new_attrs1["overlap_threshold"] = attrs.get_float("nms_threshold", 0.5) + new_attrs1["return_indices"] = False + new_attrs1["iou_threshold"] = attrs.get_float("nms_threshold", 0.5) new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False) new_attrs1["topk"] = attrs.get_int("nms_topk", -1) ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0) 
- return _op.vision.nms(ret[0], ret[1], **new_attrs1) + return _op.vision.non_max_suppression(ret[0], ret[1], **new_attrs1) def _mx_batch_dot(inputs, attrs): @@ -382,7 +383,7 @@ def _mx_proposal(inputs, attrs): def _mx_box_nms(inputs, attrs): force_suppress = attrs.get_bool("force_suppress", False) - overlap_thresh = attrs.get_float('overlap_thresh', 0.5) + iou_thresh = attrs.get_float('overlap_thresh', 0.5) topk = attrs.get_int('topk', -1) valid_thresh = attrs.get_float('valid_thresh', 0) coord_start = attrs.get_int('coord_start', 2) @@ -402,11 +403,14 @@ def _mx_box_nms(inputs, attrs): raise RuntimeError('out_format %s is not supported.' % out_format) ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) - nms_out = _op.vision.nms(ret[1], ret[0], - iou_threshold=overlap_thresh, - force_suppress=force_suppress, - topk=topk, id_index=id_index, - do_rearrange=True) + nms_out = _op.vision.non_max_suppression(ret[1], + ret[0], + return_indices=False, + iou_threshold=iou_thresh, + force_suppress=force_suppress, + topk=topk, + id_index=id_index, + invalid_to_bottom=True) return nms_out diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 83b5ce5a854f..315b448cdc6e 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -21,7 +21,7 @@ _reg.register_schedule("arange", schedule_injective) _reg.register_schedule("cast", schedule_injective) _reg.register_schedule("strided_slice", schedule_injective) -_reg.register_schedule("slice_axis", schedule_injective) +_reg.register_schedule("_contrib_slice_axis", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) _reg.register_schedule("split", schedule_injective) _reg.register_schedule("take", schedule_injective) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index f19aa19772b4..9dc42861a4cf 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -494,7 +494,7 @@ def slice_axis(data, axis, begin, end=None): The computed result. 
""" end = end or 0 - return _make.slice_axis(data, axis, begin, end) + return _make._contrib_slice_axis(data, axis, begin, end) def slice_like(data, shape_like, axes=None): diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 57ca6f2da2ce..45569a9814af 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -72,25 +72,27 @@ def compute_get_valid_counts(attrs, inputs, _, target): # non-maximum suppression -@reg.register_schedule("vision.nms") +@reg.register_schedule("vision.non_max_suppression") def schedule_nms(_, outs, target): """Schedule definition of nms""" with target: return topi.generic.schedule_nms(outs) -@reg.register_compute("vision.nms") +@reg.register_compute("vision.non_max_suppression") def compute_nms(attrs, inputs, _, target): """Compute definition of nms""" + return_indices = bool(get_const_int(attrs.return_indices)) iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) topk = get_const_int(attrs.topk) id_index = get_const_int(attrs.id_index) - do_rearrange = bool(get_const_int(attrs.do_rearrange)) + invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ - topi.vision.nms(inputs[0], inputs[1], iou_threshold, - force_suppress, topk, id_index, do_rearrange) + topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, + iou_threshold, force_suppress, topk, + id_index, invalid_to_bottom) ] -reg.register_pattern("vision.nms", OpPattern.OPAQUE) +reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 157008ec2174..5dd2c5a74555 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -27,13 +27,14 @@ def get_valid_counts(data, return TupleWrapper(_make.get_valid_counts(data, score_threshold), 2) -def nms(data, - valid_count, - iou_threshold=0.5, - force_suppress=False, - topk=-1, - id_index=0, - do_rearrange=False): +def non_max_suppression(data, + valid_count, + return_indices, + iou_threshold=0.5, + force_suppress=False, + topk=-1, + id_index=0, + invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -46,6 +47,9 @@ def nms(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. + return_indices : bool + Whether to return box indices in input data. + iou_threshold : float, optional Non-maximum suppression threshold. @@ -58,7 +62,7 @@ def nms(data, id_index : optional, int index of the class categories, -1 to disable. - do_rearrange : optional, boolean + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns @@ -66,5 +70,6 @@ def nms(data, out : relay.Expr 3-D tensor with shape [batch_size, num_anchors, 6]. 
""" - return _make.nms(data, valid_count, iou_threshold, - force_suppress, topk, id_index, do_rearrange) + return _make.non_max_suppression(data, valid_count, return_indices, + iou_threshold, force_suppress, topk, + id_index, invalid_to_bottom) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 73b74a60d756..c0f279a6b72c 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1401,11 +1401,11 @@ Expr MakeSliceAxis(Expr data, attrs->axis = axis; attrs->begin = begin; attrs->end = end; - static const Op& op = Op::Get("slice_axis"); + static const Op& op = Op::Get("_contrib_slice_axis"); return CallNode::make(op, {data}, Attrs(attrs), {}); } -TVM_REGISTER_API("relay.op._make.slice_axis") +TVM_REGISTER_API("relay.op._make._contrib_slice_axis") .set_body([](const TVMArgs& args, TVMRetValue* rv) { runtime::detail::unpack_call(MakeSliceAxis, args, rv); }); @@ -1446,14 +1446,14 @@ Array SliceAxisCompute(const Attrs& attrs, }; } -RELAY_REGISTER_OP("slice_axis") +RELAY_REGISTER_OP("_contrib_slice_axis") .describe(R"doc(Slices along a given axis. Returns an array slice along a given axis starting from the begin index to the end index. )doc" TVM_ADD_FILELINE) .set_num_inputs(1) .add_argument("data", "Tensor", "Input data.") -.set_support_level(4) +.set_support_level(10) .add_type_rel("SliceAxis", SliceAxisRel) .set_attr("FTVMCompute", SliceAxisCompute) .set_attr("TOpPattern", kInjective); diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index e8a84734b3d7..229983158262 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -65,42 +65,50 @@ bool NMSRel(const Array& types, CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* valid_count = types[1].as(); + const NMSAttrs* param = attrs.as(); const auto& dshape = data->shape; const auto& vshape = valid_count->shape; CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D."; // assign output type - reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + if (param->return_indices) { + std::vector oshape({dshape[0], dshape[1]}); + reporter->Assign(types[2], TensorTypeNode::make(oshape, Int(32))); + } else { + reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + } return true; } Expr MakeNMS(Expr data, Expr valid_count, + bool return_indices, double iou_threshold, bool force_suppress, int topk, int id_index, - bool do_rearrange) { + bool invalid_to_bottom) { auto attrs = make_node(); + attrs->return_indices = return_indices; attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->topk = topk; attrs->id_index = id_index; - attrs->do_rearrange = do_rearrange; - static const Op& op = Op::Get("vision.nms"); + attrs->invalid_to_bottom = invalid_to_bottom; + static const Op& op = Op::Get("vision.non_max_suppression"); return CallNode::make(op, {data, valid_count}, Attrs(attrs), {}); } -TVM_REGISTER_API("relay.op.vision._make.nms") +TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); + runtime::detail::unpack_call(MakeNMS, args, rv); }); -RELAY_REGISTER_OP("vision.nms") +RELAY_REGISTER_OP("vision.non_max_suppression") .describe(R"doc(Non-maximum suppression. 
)doc" TVM_ADD_FILELINE) .set_num_inputs(2) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 34285d2b18dd..e3c331c6a1d0 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -2,6 +2,7 @@ """ import numpy as np import tvm +import topi.testing from tvm import relay from tvm.relay.testing import ctx_list import topi @@ -145,6 +146,7 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) +<<<<<<< HEAD def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): x = relay.var("x", relay.TensorType(x_shape, dtype)) y = relay.var("y", relay.TensorType(y_shape, dtype)) @@ -176,6 +178,27 @@ def test_batch_matmul(): verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20)) +def test_contrib_slice_axis(): + def verify(dshape, axis, begin, end): + x = relay.var("x", relay.TensorType(dshape, "float32")) + z = relay.slice_axis(x, axis=axis, begin=begin, end=end) + func = relay.Function([x], z) + func = relay.ir_pass.infer_type(func) + text = func.astext() + assert "begin" in text + assert "end" in text + x_data = np.random.uniform(size=dshape).astype("float32") + ref_res = topi.testing.slice_axis_python( + x_data, axis, begin, end) + for target, ctx in ctx_list(): + intrp = relay.create_executor("graph", ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + + verify((1, 2, 3, 4), 3, 0, 2) + verify((100, 50), -1, 1, -1) + verify((20,), -1, -9, -3) + verify((20, 30, 40), 1, 5, 0) if __name__ == "__main__": test_collapse_sum_like() @@ -183,3 +206,4 @@ def test_batch_matmul(): test_slice_like() test_reverse_reshape() test_batch_matmul() + test_contrib_slice_axis() diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 4ba7e8cd1e72..ae7fe320940a 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -196,29 +196,6 @@ def _wrapper(data, axis=None, keepdims=False): verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1)) -def test_slice_axis(): - def verify(dshape, axis, begin, end): - x = relay.var("x", relay.TensorType(dshape, "float32")) - z = relay.slice_axis(x, axis=axis, begin=begin, end=end) - func = relay.Function([x], z) - func = relay.ir_pass.infer_type(func) - text = func.astext() - assert "begin" in text - assert "end" in text - x_data = np.random.uniform(size=dshape).astype("float32") - ref_res = topi.testing.slice_axis_python( - x_data, axis, begin, end) - for target, ctx in ctx_list(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) - op_res = intrp.evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - - verify((1, 2, 3, 4), 3, 0, 2) - verify((100, 50), -1, 1, -1) - verify((20,), -1, -9, -3) - verify((20, 30, 40), 1, 5, 0) - - def test_strided_slice(): def verify(dshape, begin, end, strides, output, test_ref=True): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -260,4 +237,3 @@ def verify(dshape, begin, end, strides, output, test_ref=True): test_binary_int_broadcast() test_where() test_reduce_functions() - test_slice_axis() diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 1e7fe76c0a9e..09a5d37d51f8 100644 --- a/tests/python/relay/test_op_level5.py +++ 
b/tests/python/relay/test_op_level5.py @@ -174,30 +174,40 @@ def verify_get_valid_counts(dshape, score_threshold): verify_get_valid_counts((16, 500, 6), 0.95) -def test_nms(): - def verify_nms(x0_data, x1_data, dshape, ref_res, - overlap_threshold=0.5, force_suppress=False, topk=-1, +def test_non_max_suppression(): + def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, + iou_threshold=0.5, force_suppress=False, topk=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk) + z = relay.vision.non_max_suppression(x0, x1, False, iou_threshold, force_suppress, topk) + z_indices = relay.vision.non_max_suppression(x0, x1, True, iou_threshold, force_suppress, topk) assert "iou_threshold" in z.astext() + assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) + zz_indices = relay.ir_pass.infer_type(z_indices) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") + assert zz_indices.checked_type == relay.ty.TensorType((dshape[0], dshape[1]), "int32") if check_type_only: return func = relay.Function([x0, x1], z) func = relay.ir_pass.infer_type(func) + func_indices = relay.Function([x0, x1], z_indices) + func_indices = relay.ir_pass.infer_type(func_indices) ctx_list = [("llvm", tvm.cpu(0))] for target, ctx in ctx_list: intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res1.asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res2.asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -206,22 +216,26 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, force_suppress=True, topk=2, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, force_suppress=True, topk=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, 1, -1, -1]]) dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, + np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, topk=3) + verify_nms(np_data, np_valid_count, dshape, np_result, + 
np_indices_result, topk=3) def test_multibox_transform_loc(): @@ -263,7 +277,7 @@ def test_default_value(): assert ret.checked_type == ref_type - nms = relay.vision.nms(mtl[0], mtl[1]) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = relay.ir_pass.infer_type(func) ctx_list = [("llvm", tvm.cpu(0))] @@ -449,8 +463,8 @@ def verify_yolo_reorg(shape, stride): test_multibox_prior() test_multibox_transform_loc() test_get_valid_counts() - test_nms() test_roi_align() test_proposal() test_yolo_reorg_infer_shape() test_yolo_reorg() + test_non_max_suppression() diff --git a/tutorials/nnvm/deploy_ssd_gluoncv.py b/tutorials/relay/deploy_ssd_gluoncv.py similarity index 76% rename from tutorials/nnvm/deploy_ssd_gluoncv.py rename to tutorials/relay/deploy_ssd_gluoncv.py index d83d1f86b75e..6a5d63b9f8cf 100644 --- a/tutorials/nnvm/deploy_ssd_gluoncv.py +++ b/tutorials/relay/deploy_ssd_gluoncv.py @@ -4,7 +4,7 @@ **Author**: `Yao Wang `_ This article is an introductory tutorial to deploy SSD models with TVM. -We will use GluonCV pre-trained SSD model and convert it to NNVM graph. +We will use GluonCV pre-trained SSD model and convert it to Relay IR """ import tvm @@ -34,8 +34,8 @@ # # To get best inference performance on CPU, change # target argument according to your device and -# follow the :ref:`tune_nnvm_x86` to tune x86 CPU and -# :ref:`tune_nnvm_arm` for arm cpu. +# follow the :ref:`tune_relay_x86` to tune x86 CPU and +# :ref:`tune_relay_arm` for arm cpu. # # SSD with VGG as body network is not supported yet since # x86 conv2d schedule doesn't support dilation. @@ -54,7 +54,6 @@ dshape = (1, 3, 512, 512) dtype = "float32" target_list = ctx_list() -frontend_list = ["nnvm", "relay"] ###################################################################### # Download and pre-process demo image @@ -65,20 +64,14 @@ x, img = data.transforms.presets.ssd.load_test(im_fname, short=512) ###################################################################### -# Convert and compile model with NNVM or Relay for CPU. +# Convert and compile model for CPU. 
block = model_zoo.get_model(model_name, pretrained=True) -def compile(frontend, target): - if frontend == "relay": - net, params = relay.frontend.from_mxnet(block, {"data": dshape}) - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(net, target, params=params) - else: - net, params = from_mxnet(block) - with compiler.build_config(opt_level=3): - graph, lib, params = compiler.build( - net, target, {"data": dshape}, params=params) +def compile(target): + net, params = relay.frontend.from_mxnet(block, {"data": dshape}) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) return graph, lib, params ###################################################################### @@ -100,9 +93,8 @@ def run(graph, lib, params, ctx): if target == "cuda": print("GPU not supported yet, skip.") continue - for frontend in frontend_list: - graph, lib, params = compile(frontend, target) - class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) + graph, lib, params = compile(target) + class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) ###################################################################### # Display result From 42571cf4b4098b5775d7bcfcbc73338ec6a3c4ff Mon Sep 17 00:00:00 2001 From: Wang Date: Fri, 22 Feb 2019 14:03:17 +0800 Subject: [PATCH 31/43] Add max_output_size arg --- include/tvm/relay/attrs/vision.h | 4 ++++ nnvm/include/nnvm/top/nn.h | 4 ++++ nnvm/python/nnvm/top/vision.py | 5 +++-- python/tvm/relay/op/vision/_vision.py | 5 +++-- python/tvm/relay/op/vision/nms.py | 9 ++++++-- src/relay/op/vision/nms.cc | 4 +++- tests/python/relay/test_op_level5.py | 4 ++-- topi/python/topi/vision/nms.py | 29 +++++++++++++++++++++---- topi/python/topi/vision/ssd/multibox.py | 4 ++-- topi/tests/python/test_topi_vision.py | 8 +++---- 10 files changed, 57 insertions(+), 19 deletions(-) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 12523bd0c8ea..d5aad412a9dc 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -71,6 +71,7 @@ struct GetValidCountsAttrs : public tvm::AttrsNode{ /*! \brief Attributes used in non_maximum_suppression operator */ struct NMSAttrs : public tvm::AttrsNode{ bool return_indices; + int max_output_size; double iou_threshold; bool force_suppress; int topk; @@ -80,6 +81,9 @@ struct NMSAttrs : public tvm::AttrsNode{ TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { TVM_ATTR_FIELD(return_indices) .describe("Whether to return box indices in input data."); + TVM_ATTR_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance." + "By default all valid boxes are returned."); TVM_ATTR_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); TVM_ATTR_FIELD(force_suppress).set_default(false) diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 6fd283aeb14e..b4ec5950964b 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -445,6 +445,7 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { bool return_indices; + int max_output_size; float iou_threshold; bool force_suppress; int topk; @@ -453,6 +454,9 @@ struct NMSParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(NMSParam) { DMLC_DECLARE_FIELD(return_indices) .describe("Whether to return box indices in input data."); + DMLC_DECLARE_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance." 
+ "By default all valid boxes are returned."); DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 1beb9be2564b..1f166e2a00d7 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -71,6 +71,7 @@ def schedule_nms(_, outs, target): def compute_nms(attrs, inputs, _): """Compute definition of non_max_suppression""" return_indices = attrs.get_bool('return_indices') + max_output_size = attrs.get_int('max_output_size') iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') topk = attrs.get_int('topk') @@ -78,7 +79,7 @@ def compute_nms(attrs, inputs, _): invalid_to_bottom = attrs.get_bool('invalid_to_bottom') return topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, - iou_threshold, force_suppress, topk, - id_index, invalid_to_bottom) + max_output_size, iou_threshold, force_suppress, + topk, id_index, invalid_to_bottom) reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 45569a9814af..40a47eb7b366 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -83,6 +83,7 @@ def schedule_nms(_, outs, target): def compute_nms(attrs, inputs, _, target): """Compute definition of nms""" return_indices = bool(get_const_int(attrs.return_indices)) + max_output_size = get_const_int(attrs.max_output_size) iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) topk = get_const_int(attrs.topk) @@ -90,8 +91,8 @@ def compute_nms(attrs, inputs, _, target): invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, - iou_threshold, force_suppress, topk, - id_index, invalid_to_bottom) + max_output_size, iou_threshold, force_suppress, + topk, id_index, invalid_to_bottom) ] diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 5dd2c5a74555..a8a9f677872f 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -30,6 +30,7 @@ def get_valid_counts(data, def non_max_suppression(data, valid_count, return_indices, + max_output_size=-1, iou_threshold=0.5, force_suppress=False, topk=-1, @@ -47,6 +48,10 @@ def non_max_suppression(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. + max_output_size : int, optional + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + return_indices : bool Whether to return box indices in input data. @@ -71,5 +76,5 @@ def non_max_suppression(data, 3-D tensor with shape [batch_size, num_anchors, 6]. 
""" return _make.non_max_suppression(data, valid_count, return_indices, - iou_threshold, force_suppress, topk, - id_index, invalid_to_bottom) + max_output_size, iou_threshold, force_suppress, + topk, id_index, invalid_to_bottom) diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 229983158262..a35394cfc216 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -85,6 +85,7 @@ bool NMSRel(const Array& types, Expr MakeNMS(Expr data, Expr valid_count, bool return_indices, + int max_output_size, double iou_threshold, bool force_suppress, int topk, @@ -92,6 +93,7 @@ Expr MakeNMS(Expr data, bool invalid_to_bottom) { auto attrs = make_node(); attrs->return_indices = return_indices; + attrs->max_output_size = max_output_size; attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->topk = topk; @@ -104,7 +106,7 @@ Expr MakeNMS(Expr data, TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); + runtime::detail::unpack_call(MakeNMS, args, rv); }); diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 09a5d37d51f8..ce7bcd1d9abc 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -180,8 +180,8 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.non_max_suppression(x0, x1, False, iou_threshold, force_suppress, topk) - z_indices = relay.vision.non_max_suppression(x0, x1, True, iou_threshold, force_suppress, topk) + z = relay.vision.non_max_suppression(x0, x1, False, -1, iou_threshold, force_suppress, topk) + z_indices = relay.vision.non_max_suppression(x0, x1, True, -1, iou_threshold, force_suppress, topk) assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 60715c91f0dc..daac25114663 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements """Non-maximum suppression operator""" import tvm @@ -112,7 +112,7 @@ def get_valid_counts(data, score_threshold=0): @hybrid.script def hybrid_nms(data, sorted_index, valid_count, - iou_threshold, force_suppress, + max_output_size, iou_threshold, force_suppress, topk, id_index): """Hybrid routing for non-maximum suppression. @@ -129,6 +129,10 @@ def hybrid_nms(data, sorted_index, valid_count, valid_count : tvm.Tensor or numpy NDArray 1-D tensor for valid number of boxes. + max_output_size : tvm.const + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + iou_threshold : tvm.const Overlapping(IoU) threshold to suppress object with smaller score. 
@@ -215,12 +219,24 @@ def hybrid_nms(data, sorted_index, valid_count, for k in range(box_data_length): output[i, j + valid_count[i], k] = -1.0 box_indices[i, j + valid_count[i]] = -1 + # Only return max_output_size valid boxes + num_valid_boxes = 0 + if max_output_size > 0: + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + if num_valid_boxes == max_output_size: + for k in range(box_data_length): + output[i, j, k] = -1.0 + box_indices[i, j] = -1 + else: + num_valid_boxes += 1 return output, box_indices @tvm.target.generic_func -def non_max_suppression(data, valid_count, return_indices, iou_threshold=0.5, - force_suppress=False, topk=-1, id_index=0, invalid_to_bottom=False): +def non_max_suppression(data, valid_count, return_indices, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, topk=-1, + id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -236,6 +252,10 @@ def non_max_suppression(data, valid_count, return_indices, iou_threshold=0.5, return_indices : boolean Whether to return box indices in input data. + max_output_size : optional, int + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + iou_threshold : optional, float Non-maximum suppression threshold. @@ -302,6 +322,7 @@ def non_max_suppression(data, valid_count, return_indices, iou_threshold=0.5, out_buffers=sort_tensor_buf, name="nms_sort") out, box_indices = hybrid_nms(data, sort_tensor, valid_count, + tvm.const(max_output_size, dtype="int32"), tvm.const(iou_threshold, dtype="float32"), tvm.const(force_suppress, dtype="bool"), tvm.const(topk, dtype="int32"), diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 3c8cc6c07417..64a4a94f7f06 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -292,6 +292,6 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], False, nms_threshold, - force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], False, -1, + nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 489f2abb92fd..337e5e5e665c 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -87,11 +87,11 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = non_max_suppression(data, valid_count, False, nms_threshold, force_suppress, nms_topk) - indices_out = non_max_suppression(data, valid_count, True, nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(data, valid_count, False, -1, nms_threshold, force_suppress, nms_topk) + indices_out = non_max_suppression(data, valid_count, True, -1, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.non_max_suppression(data, valid_count, False, nms_threshold, force_suppress, nms_topk) - indices_out = topi.cuda.non_max_suppression(data, valid_count, True, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.non_max_suppression(data, valid_count, False, -1, nms_threshold, force_suppress, nms_topk) + indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, True, nms_threshold, force_suppress, nms_topk) s = 
topi.generic.schedule_nms(out) indices_s = topi.generic.schedule_nms(indices_out) From f8fecec5539df37d4dadc6ba0f18cd3bc5d8c280 Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 24 Feb 2019 19:58:58 +0800 Subject: [PATCH 32/43] Make return_indices optional --- include/tvm/relay/attrs/vision.h | 6 +++--- nnvm/include/nnvm/top/nn.h | 6 +++--- nnvm/python/nnvm/top/vision.py | 6 +++--- python/tvm/relay/frontend/mxnet.py | 2 +- python/tvm/relay/op/vision/_vision.py | 6 +++--- python/tvm/relay/op/vision/nms.py | 18 +++++++++--------- src/relay/op/vision/nms.cc | 8 +++++--- tests/python/relay/test_op_level5.py | 6 +++--- topi/python/topi/vision/nms.py | 10 +++++----- topi/python/topi/vision/ssd/multibox.py | 5 +++-- topi/tests/python/test_topi_vision.py | 8 ++++---- 11 files changed, 42 insertions(+), 39 deletions(-) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index d5aad412a9dc..2733e8554e9b 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -70,17 +70,15 @@ struct GetValidCountsAttrs : public tvm::AttrsNode{ /*! \brief Attributes used in non_maximum_suppression operator */ struct NMSAttrs : public tvm::AttrsNode{ - bool return_indices; int max_output_size; double iou_threshold; bool force_suppress; int topk; int id_index; + bool return_indices; bool invalid_to_bottom; TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { - TVM_ATTR_FIELD(return_indices) - .describe("Whether to return box indices in input data."); TVM_ATTR_FIELD(max_output_size).set_default(-1) .describe("Max number of output valid boxes for each instance." "By default all valid boxes are returned."); @@ -92,6 +90,8 @@ struct NMSAttrs : public tvm::AttrsNode{ .describe("Keep maximum top k detections before nms, -1 for no limit."); TVM_ATTR_FIELD(id_index).set_default(0) .describe("Axis index of id."); + TVM_ATTR_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); TVM_ATTR_FIELD(invalid_to_bottom).set_default(false) .describe("Whether to move all invalid bounding boxes to the bottom."); } diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index b4ec5950964b..0f75096eb75d 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -445,15 +445,13 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { bool return_indices; - int max_output_size; float iou_threshold; bool force_suppress; int topk; int id_index; + int max_output_size; bool invalid_to_bottom; DMLC_DECLARE_PARAMETER(NMSParam) { - DMLC_DECLARE_FIELD(return_indices) - .describe("Whether to return box indices in input data."); DMLC_DECLARE_FIELD(max_output_size).set_default(-1) .describe("Max number of output valid boxes for each instance." 
"By default all valid boxes are returned."); @@ -465,6 +463,8 @@ struct NMSParam : public dmlc::Parameter { .describe("Keep maximum top k detections before nms, -1 for no limit."); DMLC_DECLARE_FIELD(id_index).set_default(0) .describe("Axis index of id."); + DMLC_DECLARE_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); DMLC_DECLARE_FIELD(invalid_to_bottom).set_default(false) .describe("Whether to move all invalid bounding boxes to the bottom."); } diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 1f166e2a00d7..8bec66d7b8f4 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -78,8 +78,8 @@ def compute_nms(attrs, inputs, _): id_index = attrs.get_int('id_index') invalid_to_bottom = attrs.get_bool('invalid_to_bottom') - return topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, - max_output_size, iou_threshold, force_suppress, - topk, id_index, invalid_to_bottom) + return topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, + iou_threshold, force_suppress, topk, + id_index, return_indices, invalid_to_bottom) reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index f00a7f551605..47d14cc395aa 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -405,11 +405,11 @@ def _mx_box_nms(inputs, attrs): ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) nms_out = _op.vision.non_max_suppression(ret[1], ret[0], - return_indices=False, iou_threshold=iou_thresh, force_suppress=force_suppress, topk=topk, id_index=id_index, + return_indices=False, invalid_to_bottom=True) return nms_out diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 40a47eb7b366..0205d6f3c2b6 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -90,9 +90,9 @@ def compute_nms(attrs, inputs, _, target): id_index = get_const_int(attrs.id_index) invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ - topi.vision.non_max_suppression(inputs[0], inputs[1], return_indices, - max_output_size, iou_threshold, force_suppress, - topk, id_index, invalid_to_bottom) + topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, + iou_threshold, force_suppress, topk, + id_index, return_indices, invalid_to_bottom) ] diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index a8a9f677872f..e8586866c025 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -29,12 +29,12 @@ def get_valid_counts(data, def non_max_suppression(data, valid_count, - return_indices, max_output_size=-1, iou_threshold=0.5, force_suppress=False, topk=-1, id_index=0, + return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. @@ -52,9 +52,6 @@ def non_max_suppression(data, Max number of output valid boxes for each instance. By default all valid boxes are returned. - return_indices : bool - Whether to return box indices in input data. - iou_threshold : float, optional Non-maximum suppression threshold. @@ -64,10 +61,13 @@ def non_max_suppression(data, topk : int, optional Keep maximum top k detections before nms, -1 for no limit. - id_index : optional, int + id_index : int, optional index of the class categories, -1 to disable. 
- invalid_to_bottom : optional, boolean + return_indices : bool, optional + Whether to return box indices in input data. + + invalid_to_bottom : bool, optional Whether to move all valid bounding boxes to the top. Returns @@ -75,6 +75,6 @@ def non_max_suppression(data, out : relay.Expr 3-D tensor with shape [batch_size, num_anchors, 6]. """ - return _make.non_max_suppression(data, valid_count, return_indices, - max_output_size, iou_threshold, force_suppress, - topk, id_index, invalid_to_bottom) + return _make.non_max_suppression(data, valid_count, max_output_size, + iou_threshold, force_suppress, topk, + id_index, return_indices, invalid_to_bottom) diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index a35394cfc216..fca40f9426db 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -84,20 +84,20 @@ bool NMSRel(const Array& types, Expr MakeNMS(Expr data, Expr valid_count, - bool return_indices, int max_output_size, double iou_threshold, bool force_suppress, int topk, int id_index, + bool return_indices, bool invalid_to_bottom) { auto attrs = make_node(); - attrs->return_indices = return_indices; attrs->max_output_size = max_output_size; attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->topk = topk; attrs->id_index = id_index; + attrs->return_indices = return_indices; attrs->invalid_to_bottom = invalid_to_bottom; static const Op& op = Op::Get("vision.non_max_suppression"); return CallNode::make(op, {data, valid_count}, Attrs(attrs), {}); @@ -111,7 +111,9 @@ TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") RELAY_REGISTER_OP("vision.non_max_suppression") -.describe(R"doc(Non-maximum suppression. +.describe(R"doc(Non-maximum suppression. The input boxes should +be in the format of [class_id, score, left, top, right, bottom]. +Set id_index to be -1 to ignore class_id axis. 
)doc" TVM_ADD_FILELINE) .set_num_inputs(2) .add_argument("data", "Tensor", "Input data.") diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index ce7bcd1d9abc..6e027ff232f1 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -180,8 +180,8 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.non_max_suppression(x0, x1, False, -1, iou_threshold, force_suppress, topk) - z_indices = relay.vision.non_max_suppression(x0, x1, True, -1, iou_threshold, force_suppress, topk) + z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, topk, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, topk) assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) @@ -277,7 +277,7 @@ def test_default_value(): assert ret.checked_type == ref_type - nms = relay.vision.non_max_suppression(mtl[0], mtl[1], False) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = relay.ir_pass.infer_type(func) ctx_list = [("llvm", tvm.cpu(0))] diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index daac25114663..36ab0ed00510 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -234,9 +234,9 @@ def hybrid_nms(data, sorted_index, valid_count, @tvm.target.generic_func -def non_max_suppression(data, valid_count, return_indices, max_output_size=-1, +def non_max_suppression(data, valid_count, max_output_size=-1, iou_threshold=0.5, force_suppress=False, topk=-1, - id_index=0, invalid_to_bottom=False): + id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -249,9 +249,6 @@ def non_max_suppression(data, valid_count, return_indices, max_output_size=-1, valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - return_indices : boolean - Whether to return box indices in input data. - max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. @@ -268,6 +265,9 @@ def non_max_suppression(data, valid_count, return_indices, max_output_size=-1, id_index : optional, int index of the class categories, -1 to disable. + return_indices : optional, boolean + Whether to return box indices in input data. + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. 
diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 64a4a94f7f06..2de1723dbd7b 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -292,6 +292,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], False, -1, - nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], -1, + nms_threshold, force_suppress, nms_topk, + return_indices=False) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 337e5e5e665c..02e04212b63e 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -87,11 +87,11 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = non_max_suppression(data, valid_count, False, -1, nms_threshold, force_suppress, nms_topk) - indices_out = non_max_suppression(data, valid_count, True, -1, nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.non_max_suppression(data, valid_count, False, -1, nms_threshold, force_suppress, nms_topk) - indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, True, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) indices_s = topi.generic.schedule_nms(indices_out) From 11c8bba7e6cc2f54ff65046088fe9d3b88d7952b Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 24 Feb 2019 20:05:12 +0800 Subject: [PATCH 33/43] Minor fix --- nnvm/tests/python/frontend/mxnet/test_forward.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 8992799528e7..67f1ad5ff27d 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -300,7 +300,6 @@ def test_forward_l2_normalize(): mx_sym = mx.sym.L2Normalization(data, mode="channel") verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) - if __name__ == '__main__': test_forward_mlp() test_forward_vgg() From 908eedba34760394fbe9b1d52c8b8e187108aa6b Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 24 Feb 2019 21:29:12 +0800 Subject: [PATCH 34/43] Resolve conflict --- nnvm/python/nnvm/frontend/mxnet.py | 2 +- nnvm/tests/python/frontend/mxnet/test_forward.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 2df67d9967ca..56505cfabb69 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -324,7 +324,7 @@ def _argmin(inputs, attrs): 'flatten', 'log', 'log_softmax', 'max', 'min', 'negative', 'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd', - 'reshape_like'] + 'reshape_like', 'where'] _convert_map = { '_copy' : 
_rename('copy'), diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 67f1ad5ff27d..8992799528e7 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -300,6 +300,7 @@ def test_forward_l2_normalize(): mx_sym = mx.sym.L2Normalization(data, mode="channel") verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() From 9743f15c34d5c7d684902cfe8a0cddf7cf317b02 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 25 Feb 2019 17:05:23 +0800 Subject: [PATCH 35/43] Rename topk to top_k --- include/tvm/relay/attrs/vision.h | 4 ++-- nnvm/include/nnvm/top/nn.h | 4 ++-- nnvm/python/nnvm/frontend/mxnet.py | 2 +- nnvm/python/nnvm/top/vision.py | 4 ++-- nnvm/tests/python/compiler/test_top_level4.py | 4 ++-- python/tvm/relay/frontend/mxnet.py | 6 +++--- python/tvm/relay/op/vision/_vision.py | 4 ++-- python/tvm/relay/op/vision/nms.py | 6 +++--- src/relay/op/vision/nms.cc | 4 ++-- tests/python/relay/test_op_level5.py | 12 +++++------ topi/python/topi/vision/nms.py | 20 +++++++++---------- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 2733e8554e9b..1b2fb6d9c997 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -73,7 +73,7 @@ struct NMSAttrs : public tvm::AttrsNode{ int max_output_size; double iou_threshold; bool force_suppress; - int topk; + int top_k; int id_index; bool return_indices; bool invalid_to_bottom; @@ -86,7 +86,7 @@ struct NMSAttrs : public tvm::AttrsNode{ .describe("Non-maximum suppression threshold."); TVM_ATTR_FIELD(force_suppress).set_default(false) .describe("Suppress all detections regardless of class_id."); - TVM_ATTR_FIELD(topk).set_default(-1) + TVM_ATTR_FIELD(top_k).set_default(-1) .describe("Keep maximum top k detections before nms, -1 for no limit."); TVM_ATTR_FIELD(id_index).set_default(0) .describe("Axis index of id."); diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 0f75096eb75d..806e497727c4 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -447,7 +447,7 @@ struct NMSParam : public dmlc::Parameter { bool return_indices; float iou_threshold; bool force_suppress; - int topk; + int top_k; int id_index; int max_output_size; bool invalid_to_bottom; @@ -459,7 +459,7 @@ struct NMSParam : public dmlc::Parameter { .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) .describe("Suppress all detections regardless of class_id."); - DMLC_DECLARE_FIELD(topk).set_default(-1) + DMLC_DECLARE_FIELD(top_k).set_default(-1) .describe("Keep maximum top k detections before nms, -1 for no limit."); DMLC_DECLARE_FIELD(id_index).set_default(0) .describe("Axis index of id."); diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 56505cfabb69..47d7ede96e5f 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -246,7 +246,7 @@ def _contrib_multibox_detection(inputs, attrs): nms_topk = attrs.get('nms_topk') or -1 new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} new_attrs1 = {'return_indices': False, 'iou_threshold': float(nms_threshold), - 'force_suppress': force_suppress, 'topk': int(nms_topk)} + 'force_suppress': force_suppress, 'top_k': int(nms_topk)} data, valid_count = 
_get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], inputs[2], **new_attrs0) return _get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 8bec66d7b8f4..ab32838e10ff 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -74,12 +74,12 @@ def compute_nms(attrs, inputs, _): max_output_size = attrs.get_int('max_output_size') iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') - topk = attrs.get_int('topk') + top_k = attrs.get_int('top_k') id_index = attrs.get_int('id_index') invalid_to_bottom = attrs.get_bool('invalid_to_bottom') return topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, - iou_threshold, force_suppress, topk, + iou_threshold, force_suppress, top_k, id_index, return_indices, invalid_to_bottom) reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index 991b9c2b15be..6a42047151e5 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -579,9 +579,9 @@ def test_non_max_suppression(): valid_count = sym.Variable("valid_count", dtype="int32") iou_threshold = 0.7 force_suppress = True - topk = 2 + top_k = 2 out = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False, - iou_threshold=iou_threshold, force_suppress=force_suppress, topk=topk) + iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 47d14cc395aa..0dfab69340b4 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -327,7 +327,7 @@ def _mx_multibox_detection(inputs, attrs): new_attrs1["return_indices"] = False new_attrs1["iou_threshold"] = attrs.get_float("nms_threshold", 0.5) new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False) - new_attrs1["topk"] = attrs.get_int("nms_topk", -1) + new_attrs1["top_k"] = attrs.get_int("nms_topk", -1) ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0) @@ -384,7 +384,7 @@ def _mx_proposal(inputs, attrs): def _mx_box_nms(inputs, attrs): force_suppress = attrs.get_bool("force_suppress", False) iou_thresh = attrs.get_float('overlap_thresh', 0.5) - topk = attrs.get_int('topk', -1) + top_k = attrs.get_int('topk', -1) valid_thresh = attrs.get_float('valid_thresh', 0) coord_start = attrs.get_int('coord_start', 2) score_index = attrs.get_int('score_index', 1) @@ -407,7 +407,7 @@ def _mx_box_nms(inputs, attrs): ret[0], iou_threshold=iou_thresh, force_suppress=force_suppress, - topk=topk, + top_k=top_k, id_index=id_index, return_indices=False, invalid_to_bottom=True) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 0205d6f3c2b6..c887076e6af8 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -86,12 +86,12 @@ def compute_nms(attrs, inputs, _, target): max_output_size = get_const_int(attrs.max_output_size) iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) - topk = get_const_int(attrs.topk) + top_k = get_const_int(attrs.top_k) id_index = 
get_const_int(attrs.id_index) invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, - iou_threshold, force_suppress, topk, + iou_threshold, force_suppress, top_k, id_index, return_indices, invalid_to_bottom) ] diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index e8586866c025..0124ee29ab9e 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -32,7 +32,7 @@ def non_max_suppression(data, max_output_size=-1, iou_threshold=0.5, force_suppress=False, - topk=-1, + top_k=-1, id_index=0, return_indices=True, invalid_to_bottom=False): @@ -58,7 +58,7 @@ def non_max_suppression(data, force_suppress : bool, optional Suppress all detections regardless of class_id. - topk : int, optional + top_k : int, optional Keep maximum top k detections before nms, -1 for no limit. id_index : int, optional @@ -76,5 +76,5 @@ def non_max_suppression(data, 3-D tensor with shape [batch_size, num_anchors, 6]. """ return _make.non_max_suppression(data, valid_count, max_output_size, - iou_threshold, force_suppress, topk, + iou_threshold, force_suppress, top_k, id_index, return_indices, invalid_to_bottom) diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index fca40f9426db..6ebc2ca49b4d 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -87,7 +87,7 @@ Expr MakeNMS(Expr data, int max_output_size, double iou_threshold, bool force_suppress, - int topk, + int top_k, int id_index, bool return_indices, bool invalid_to_bottom) { @@ -95,7 +95,7 @@ Expr MakeNMS(Expr data, attrs->max_output_size = max_output_size; attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; - attrs->topk = topk; + attrs->top_k = top_k; attrs->id_index = id_index; attrs->return_indices = return_indices; attrs->invalid_to_bottom = invalid_to_bottom; diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 6e027ff232f1..eceedc760d4b 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -176,12 +176,12 @@ def verify_get_valid_counts(dshape, score_threshold): def test_non_max_suppression(): def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, - iou_threshold=0.5, force_suppress=False, topk=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, topk, return_indices=False) - z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, topk) + z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k) assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) @@ -221,10 +221,10 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, dshape = (tvm.var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, - force_suppress=True, topk=2, check_type_only=True) + force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, - force_suppress=True, 
topk=2, check_type_only=False) + force_suppress=True, top_k=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], @@ -235,7 +235,7 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, - np_indices_result, topk=3) + np_indices_result, top_k=3) def test_multibox_transform_loc(): diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 36ab0ed00510..af982cba8652 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -113,7 +113,7 @@ def get_valid_counts(data, score_threshold=0): @hybrid.script def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, force_suppress, - topk, id_index): + top_k, id_index): """Hybrid routing for non-maximum suppression. Parameters @@ -139,7 +139,7 @@ def hybrid_nms(data, sorted_index, valid_count, force_suppress : tvm.const Whether to suppress all detections regardless of class_id. - topk : tvm.const + top_k : tvm.const Keep maximum top k detections before nms, -1 for no limit. id_index : tvm.const @@ -167,13 +167,13 @@ def hybrid_nms(data, sorted_index, valid_count, if valid_count[i] > 0: # Reorder output nkeep = valid_count[i] - if 0 < topk < nkeep: - nkeep = topk + if 0 < top_k < nkeep: + nkeep = top_k for j in range(nkeep): for k in range(box_data_length): output[i, j, k] = data[i, sorted_index[i, j], k] box_indices[i, j] = sorted_index[i, j] - if 0 < topk < valid_count[i]: + if 0 < top_k < valid_count[i]: for j in range(valid_count[i] - nkeep): for k in range(box_data_length): output[i, j + nkeep, k] = -1.0 @@ -235,7 +235,7 @@ def hybrid_nms(data, sorted_index, valid_count, @tvm.target.generic_func def non_max_suppression(data, valid_count, max_output_size=-1, - iou_threshold=0.5, force_suppress=False, topk=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. @@ -259,7 +259,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - topk : optional, int + top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. 
id_index : optional, int @@ -286,8 +286,8 @@ def non_max_suppression(data, valid_count, max_output_size=-1, valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True - topk = -1 - out = nms(data, valid_count, iou_threshold, force_suppress, topk) + top_k = -1 + out = nms(data, valid_count, iou_threshold, force_suppress, top_k) np_data = np.random.uniform(dshape) np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) @@ -325,7 +325,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, tvm.const(max_output_size, dtype="int32"), tvm.const(iou_threshold, dtype="float32"), tvm.const(force_suppress, dtype="bool"), - tvm.const(topk, dtype="int32"), + tvm.const(top_k, dtype="int32"), tvm.const(id_index, dtype="int32")) if not return_indices and invalid_to_bottom: out = hybrid_rearrange_out(out) From 4bd6fecb4dd05379529ed3f61dfd28b42603d040 Mon Sep 17 00:00:00 2001 From: Wang Date: Wed, 27 Feb 2019 18:49:17 +0800 Subject: [PATCH 36/43] Fix example code --- topi/python/topi/vision/nms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index af982cba8652..169daea2d4d3 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -280,14 +280,15 @@ def non_max_suppression(data, valid_count, max_output_size=-1, -------- .. code-block:: python - # An example to use nms + # An example to use non_max_suppression dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True top_k = -1 - out = nms(data, valid_count, iou_threshold, force_suppress, top_k) + out = non_max_suppression(data, valid_count, iou_threshold=iou_threshold, + force_suppress=force_suppress, top_k=top_k) np_data = np.random.uniform(dshape) np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) From d1e95f959d8a68612042fe86a07ddf46f0e2ff79 Mon Sep 17 00:00:00 2001 From: Wang Date: Sat, 2 Mar 2019 21:12:07 -0800 Subject: [PATCH 37/43] Fix lint --- nnvm/tests/python/frontend/mxnet/test_forward.py | 1 - tests/python/relay/test_op_level10.py | 1 - topi/python/topi/testing/__init__.py | 3 --- 3 files changed, 5 deletions(-) diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 8992799528e7..8a0b10f0eb2a 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -227,7 +227,6 @@ def test_forward_slice(): mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) -<<<<<<< HEAD def test_forward_maximum(): a = mx.sym.var('a') b = mx.sym.var('b') diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index e3c331c6a1d0..f5c9410f132a 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -146,7 +146,6 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) -<<<<<<< HEAD def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): x = relay.var("x", relay.TensorType(x_shape, dtype)) y = relay.var("y", relay.TensorType(y_shape, dtype)) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 90b8e8e0e58c..1743de13fd85 
100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -19,8 +19,5 @@ from .l2_normalize_python import l2_normalize_python from .gather_nd_python import gather_nd_python from .strided_slice_python import strided_slice_python -<<<<<<< HEAD from .batch_matmul import batch_matmul -======= from .slice_axis_python import slice_axis_python ->>>>>>> Relay support From 292130d89308903cea611cfe36f503303805f86c Mon Sep 17 00:00:00 2001 From: Wang Date: Sat, 2 Mar 2019 23:01:01 -0800 Subject: [PATCH 38/43] Minor fix --- nnvm/tests/python/frontend/mxnet/test_forward.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 8a0b10f0eb2a..581ae75a4bbc 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -289,16 +289,6 @@ def test_forward_minimum(): tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) -def test_forward_slice_axis(): - data = mx.sym.var('data') - mx_sym = mx.sym.slice_axis(data, axis=1, begin=-5, end=None) - verify_mxnet_frontend_impl(mx_sym, (1, 10, 6), (1, 5, 6)) - -def test_forward_l2_normalize(): - data = mx.sym.var('data') - mx_sym = mx.sym.L2Normalization(data, mode="channel") - verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) - if __name__ == '__main__': test_forward_mlp() @@ -325,5 +315,3 @@ def test_forward_l2_normalize(): test_forward_slice() test_forward_maximum() test_forward_minimum() - test_forward_slice_axis() - test_forward_l2_normalize() From fb94ffeaf5a7582c74ab93b9a0aa1b9adfa55f40 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 4 Mar 2019 12:46:51 -0800 Subject: [PATCH 39/43] Remove contrib_slice_axis --- include/tvm/relay/attrs/transform.h | 15 --- python/tvm/relay/op/_transform.py | 1 - python/tvm/relay/op/transform.py | 26 ----- src/relay/op/tensor/transform.cc | 102 -------------------- tests/python/frontend/mxnet/test_forward.py | 2 - tests/python/relay/test_op_level10.py | 22 ----- 6 files changed, 168 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 720d6b9d3690..fea2c960d032 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -171,21 +171,6 @@ struct StridedSliceAttrs : public tvm::AttrsNode { } }; -struct SliceAxisAttrs : public tvm::AttrsNode { - int axis; - int begin; - int end; - - TVM_DECLARE_ATTRS(SliceAxisAttrs, "relay.attrs.SliceAxisAttrs") { - TVM_ATTR_FIELD(axis) - .describe("Axis along which to be sliced."); - TVM_ATTR_FIELD(begin) - .describe("Index for begin of slice"); - TVM_ATTR_FIELD(end) - .describe("Index for end of the slice"); - } -}; - struct SliceLikeAttrs : public tvm::AttrsNode { Array axes; diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 315b448cdc6e..1389f96b8325 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -21,7 +21,6 @@ _reg.register_schedule("arange", schedule_injective) _reg.register_schedule("cast", schedule_injective) _reg.register_schedule("strided_slice", schedule_injective) -_reg.register_schedule("_contrib_slice_axis", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) _reg.register_schedule("split", schedule_injective) _reg.register_schedule("take", schedule_injective) diff --git 
From fb94ffeaf5a7582c74ab93b9a0aa1b9adfa55f40 Mon Sep 17 00:00:00 2001
From: Wang
Date: Mon, 4 Mar 2019 12:46:51 -0800
Subject: [PATCH 39/43] Remove contrib_slice_axis

---
 include/tvm/relay/attrs/transform.h         |  15 ---
 python/tvm/relay/op/_transform.py           |   1 -
 python/tvm/relay/op/transform.py            |  26 -----
 src/relay/op/tensor/transform.cc            | 102 --------------------
 tests/python/frontend/mxnet/test_forward.py |   2 -
 tests/python/relay/test_op_level10.py       |  22 -----
 6 files changed, 168 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 720d6b9d3690..fea2c960d032 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -171,21 +171,6 @@ struct StridedSliceAttrs : public tvm::AttrsNode<StridedSliceAttrs> {
   }
 };

-struct SliceAxisAttrs : public tvm::AttrsNode<SliceAxisAttrs> {
-  int axis;
-  int begin;
-  int end;
-
-  TVM_DECLARE_ATTRS(SliceAxisAttrs, "relay.attrs.SliceAxisAttrs") {
-    TVM_ATTR_FIELD(axis)
-      .describe("Axis along which to be sliced.");
-    TVM_ATTR_FIELD(begin)
-      .describe("Index for begin of slice");
-    TVM_ATTR_FIELD(end)
-      .describe("Index for end of the slice");
-  }
-};
-
 struct SliceLikeAttrs : public tvm::AttrsNode<SliceLikeAttrs> {
   Array<Integer> axes;
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 315b448cdc6e..1389f96b8325 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -21,7 +21,6 @@
 _reg.register_schedule("arange", schedule_injective)
 _reg.register_schedule("cast", schedule_injective)
 _reg.register_schedule("strided_slice", schedule_injective)
-_reg.register_schedule("_contrib_slice_axis", schedule_injective)
 _reg.register_schedule("slice_like", schedule_injective)
 _reg.register_schedule("split", schedule_injective)
 _reg.register_schedule("take", schedule_injective)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 9dc42861a4cf..725f57f54bd8 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -471,32 +471,6 @@ def strided_slice(data, begin, end, strides=None):
     return _make.strided_slice(data, list(begin), list(end), list(strides))


-def slice_axis(data, axis, begin, end=None):
-    """Slice input array along specific axis.
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source array to be sliced.
-
-    axis : int
-        Axis to be sliced.
-
-    begin: int
-        The index to begin with in the slicing.
-
-    end: int, optional
-        The index indicating end of the slice.
-
-    Returns
-    -------
-    ret : relay.Expr
-        The computed result.
-    """
-    end = end or 0
-    return _make._contrib_slice_axis(data, axis, begin, end)
-
-
 def slice_like(data, shape_like, axes=None):
     """Slice the first input with respect to the second input.

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index c0f279a6b72c..0c26e3da742e 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1356,108 +1356,6 @@ Array<Integer> GetIntArray(Array<IndexExpr> arr) {
   return Array<Integer>(arr.node_);
 }

-// slice_axis
-TVM_REGISTER_NODE_TYPE(SliceAxisAttrs);
-
-bool SliceAxisRel(const Array<Type>& types,
-                  int num_inputs,
-                  const Attrs& attrs,
-                  const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
-  const auto* data = types[0].as<TensorTypeNode>();
-  const SliceAxisAttrs *param = attrs.as<SliceAxisAttrs>();
-
-  auto src_shape = data->shape;
-  int axis = param->axis;
-  int begin = param->begin;
-  int end = param->end;
-
-  if (axis < 0) {
-    axis += src_shape.size();
-  }
-  if (begin < 0) {
-    begin += *as_const_int(src_shape[axis]);
-  }
-  if (end <= 0) {
-    end += *as_const_int(src_shape[axis]);
-  }
-  CHECK_LT(begin, end)
-    << "Begin index must be smaller than end index: "
-    << begin << " vs " << end;
-
-  std::vector<IndexExpr>&& oshape = AsVector(data->shape);
-  oshape[axis] = make_const(Int(32), end - begin);
-
-  // assign output type
-  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
-  return true;
-}
-
-Expr MakeSliceAxis(Expr data,
-                   Integer axis,
-                   Integer begin,
-                   Integer end) {
-  auto attrs = make_node<SliceAxisAttrs>();
-  attrs->axis = axis;
-  attrs->begin = begin;
-  attrs->end = end;
-  static const Op& op = Op::Get("_contrib_slice_axis");
-  return CallNode::make(op, {data}, Attrs(attrs), {});
-}
-
-TVM_REGISTER_API("relay.op._make._contrib_slice_axis")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 4>(MakeSliceAxis, args, rv);
-});
-
-Array<Tensor> SliceAxisCompute(const Attrs& attrs,
-                               const Array<Tensor>& inputs,
-                               const Type& out_type,
-                               const Target& target) {
-  const SliceAxisAttrs *param = attrs.as<SliceAxisAttrs>();
-  const Array<Expr> src_shape = inputs[0]->shape;
-  Array<Expr> begin_idx, end_idx, strides;
-  int axis = param->axis;
-  int begin = param->begin;
-  int end = param->end;
-
-  if (axis < 0) {
-    axis += src_shape.size();
-  }
-  if (begin < 0) {
-    begin += *as_const_int(src_shape[axis]);
-  }
-  if (end <= 0) {
-    end += *as_const_int(src_shape[axis]);
-  }
-  for (size_t i = 0; i < src_shape.size(); ++i) {
-    begin_idx.push_back(make_const(Int(32), 0));
-    strides.push_back(make_const(Int(32), 1));
-  }
-  end_idx = Array<Expr>(src_shape);
-  begin_idx.Set(axis, make_const(Int(32), begin));
-  end_idx.Set(axis, make_const(Int(32), end));
-
-  return Array<Tensor>{
-    topi::strided_slice(inputs[0],
-                        GetIntArray(begin_idx),
-                        GetIntArray(end_idx),
-                        GetIntArray(strides))
-  };
-}
-
-RELAY_REGISTER_OP("_contrib_slice_axis")
-.describe(R"doc(Slices along a given axis.
-Returns an array slice along a given axis starting from
-the begin index to the end index.
-)doc" TVM_ADD_FILELINE)
-.set_num_inputs(1)
-.add_argument("data", "Tensor", "Input data.")
-.set_support_level(10)
-.add_type_rel("SliceAxis", SliceAxisRel)
-.set_attr<FTVMCompute>("FTVMCompute", SliceAxisCompute)
-.set_attr<TOpPattern>("TOpPattern", kInjective);
-
 // strided_slice
 TVM_REGISTER_NODE_TYPE(StridedSliceAttrs);
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index fb975c11add0..4679876c181b 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -257,8 +257,6 @@ def verify(start, stop, step):
     verify(20, 1, -1)
     verify(20, 1, -1.5)

-<<<<<<< HEAD
-<<<<<<< HEAD
 def _mx_symbol(F, op_name, inputs):
     op = getattr(F, op_name)
     return op(*inputs)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index f5c9410f132a..7237cfbc3b87 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -177,27 +177,6 @@ def test_batch_matmul():
     verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20))
     verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20))

-def test_contrib_slice_axis():
-    def verify(dshape, axis, begin, end):
-        x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.slice_axis(x, axis=axis, begin=begin, end=end)
-        func = relay.Function([x], z)
-        func = relay.ir_pass.infer_type(func)
-        text = func.astext()
-        assert "begin" in text
-        assert "end" in text
-        x_data = np.random.uniform(size=dshape).astype("float32")
-        ref_res = topi.testing.slice_axis_python(
-            x_data, axis, begin, end)
-        for target, ctx in ctx_list():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
-            op_res = intrp.evaluate(func)(x_data)
-            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
-
-    verify((1, 2, 3, 4), 3, 0, 2)
-    verify((100, 50), -1, 1, -1)
-    verify((20,), -1, -9, -3)
-    verify((20, 30, 40), 1, 5, 0)

 if __name__ == "__main__":
     test_collapse_sum_like()
@@ -205,4 +184,3 @@ def verify(dshape, axis, begin, end):
     test_slice_like()
     test_reverse_reshape()
     test_batch_matmul()
-    test_contrib_slice_axis()
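With the dedicated _contrib_slice_axis operator removed, a single-axis slice can be expressed directly with relay.strided_slice, mirroring the negative-index normalization that the deleted SliceAxisCompute performed. The helper below is an illustrative sketch rather than code from the patch; the helper name and the use of a known static shape are assumptions.

    from tvm import relay

    def slice_one_axis(data, shape, axis, begin, end=None):
        # Normalize negative axis/begin/end against the static shape,
        # the same way the removed C++ compute did.
        ndim = len(shape)
        if axis < 0:
            axis += ndim
        if begin < 0:
            begin += shape[axis]
        if end is None:
            end = 0
        if end <= 0:
            end += shape[axis]
        b = [0] * ndim
        e = list(shape)
        b[axis] = begin
        e[axis] = end
        return relay.strided_slice(data, begin=b, end=e)

    x = relay.var("x", shape=(1, 10, 6))
    y = slice_one_axis(x, (1, 10, 6), axis=1, begin=-5)  # keeps rows 5..9 along axis 1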
From 5c0cee95acd7ad8f4d415d6687a602045344c3c8 Mon Sep 17 00:00:00 2001
From: Wang
Date: Mon, 4 Mar 2019 14:15:57 -0800
Subject: [PATCH 40/43] Resolve conflict

---
 python/tvm/relay/frontend/mxnet.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 0dfab69340b4..58de937fdbb8 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -434,6 +434,7 @@ def _mx_l2_normalize(inputs, attrs):
     "exp",
     "negative",
     "reshape_like",
+    "slice_like",
     "zeros_like",
     "ones_like",
     "where",
@@ -522,10 +523,10 @@ def _mx_l2_normalize(inputs, attrs):
     "BatchNorm"       : _mx_batch_norm,
     "BatchNorm_v1"    : _mx_batch_norm,
     "LRN"             : _mx_lrn,
+    "L2Normalization" : _mx_l2_normalize,
     "slice"           : _mx_slice,
     "slice_like"      : _mx_slice_like,
     "slice_axis"      : _mx_slice_axis,
-    "L2Normalization" : _mx_l2_normalize,
     "SliceChannel"    : _mx_split,
     "split"           : _mx_split,
     "expand_dims"     : _mx_expand_dims,
From 28d479a2db98ef7bd2d4db6b3aa7dd352a6fffd5 Mon Sep 17 00:00:00 2001
From: Wang
Date: Mon, 4 Mar 2019 17:14:14 -0800
Subject: [PATCH 41/43] Address minor comments

---
 include/tvm/relay/attrs/vision.h |  6 +++---
 nnvm/include/nnvm/top/nn.h       |  4 ++--
 nnvm/src/top/vision/nms.cc       | 11 ++++++-----
 src/relay/op/vision/nms.cc       |  7 ++++---
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
index 1b2fb6d9c997..20b80f33a2a3 100644
--- a/include/tvm/relay/attrs/vision.h
+++ b/include/tvm/relay/attrs/vision.h
@@ -59,7 +59,7 @@ struct MultiBoxTransformLocAttrs
 };

 /*! \brief Attributes used in get_valid_counts operator */
-struct GetValidCountsAttrs : public tvm::AttrsNode<GetValidCountsAttrs>{
+struct GetValidCountsAttrs : public tvm::AttrsNode<GetValidCountsAttrs> {
   double score_threshold;

   TVM_DECLARE_ATTRS(GetValidCountsAttrs, "relay.attrs.GetValidCountsAttrs") {
@@ -69,7 +69,7 @@ struct GetValidCountsAttrs : public tvm::AttrsNode<GetValidCountsAttrs>{
 };

 /*! \brief Attributes used in non_maximum_suppression operator */
-struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{
+struct NonMaximumSuppressionAttrs : public tvm::AttrsNode<NonMaximumSuppressionAttrs> {
   int max_output_size;
   double iou_threshold;
   bool force_suppress;
@@ -78,7 +78,7 @@ struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{
   bool return_indices;
   bool invalid_to_bottom;

-  TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") {
+  TVM_DECLARE_ATTRS(NonMaximumSuppressionAttrs, "relay.attrs.NonMaximumSuppressionAttrs") {
     TVM_ATTR_FIELD(max_output_size).set_default(-1)
       .describe("Max number of output valid boxes for each instance."
                 "By default all valid boxes are returned.");
diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
index 806e497727c4..578f928c5b9f 100644
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -443,7 +443,7 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter<MultiBoxTransformLocParam> {
-struct NMSParam : public dmlc::Parameter<NMSParam> {
+struct NonMaximumSuppressionParam : public dmlc::Parameter<NonMaximumSuppressionParam> {
   bool return_indices;
   float iou_threshold;
   bool force_suppress;
@@ -451,7 +451,7 @@ struct NMSParam : public dmlc::Parameter<NMSParam> {
   int id_index;
   int max_output_size;
   bool invalid_to_bottom;
-  DMLC_DECLARE_PARAMETER(NMSParam) {
+  DMLC_DECLARE_PARAMETER(NonMaximumSuppressionParam) {
     DMLC_DECLARE_FIELD(max_output_size).set_default(-1)
       .describe("Max number of output valid boxes for each instance."
                 "By default all valid boxes are returned.");
diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc
index 315e06e22ee5..e69a7cb2f036 100644
--- a/nnvm/src/top/vision/nms.cc
+++ b/nnvm/src/top/vision/nms.cc
@@ -19,12 +19,13 @@ using compiler::FTVMCompute;
 using tvm::Tensor;
 using tvm::Array;

-DMLC_REGISTER_PARAMETER(NMSParam);
+DMLC_REGISTER_PARAMETER(NonMaximumSuppressionParam);

 bool NMSShape(const NodeAttrs& attrs,
               std::vector<TShape> *in_attrs,
               std::vector<TShape> *out_attrs) {
-  const NMSParam& param = nnvm::get<NMSParam>(attrs.parsed);
+  const NonMaximumSuppressionParam& param =
+    nnvm::get<NonMaximumSuppressionParam>(attrs.parsed);
   CHECK_EQ(in_attrs->size(), 2U) << "Inputs: [data, valid_count]";
   TShape dshape = in_attrs->at(0);
   TShape vshape = in_attrs->at(1);
@@ -69,10 +70,10 @@ NNVM_REGISTER_OP(non_max_suppression)
 )doc" NNVM_ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
-.set_attr_parser(ParamParser<NMSParam>)
+.set_attr_parser(ParamParser<NonMaximumSuppressionParam>)
 .set_attr<FGetAttrDict>("FGetAttrDict",
-                        ParamGetAttrDict<NMSParam>)
-.add_arguments(NMSParam::__FIELDS__())
+                        ParamGetAttrDict<NonMaximumSuppressionParam>)
+.add_arguments(NonMaximumSuppressionParam::__FIELDS__())
 .add_argument("data", "Tensor", "Input data.")
 .add_argument("valid_count", "Tensor", "Number of valid anchor boxes.")
 .set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc
index 6ebc2ca49b4d..6a94da032196 100644
--- a/src/relay/op/vision/nms.cc
+++ b/src/relay/op/vision/nms.cc
@@ -56,7 +56,7 @@ input data.
 .add_type_rel("GetValidCount", GetValidCountRel);

-TVM_REGISTER_NODE_TYPE(NMSAttrs);
+TVM_REGISTER_NODE_TYPE(NonMaximumSuppressionAttrs);

 bool NMSRel(const Array<Type>& types,
             int num_inputs,
@@ -65,7 +65,8 @@ bool NMSRel(const Array<Type>& types,
   CHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* valid_count = types[1].as<TensorTypeNode>();
-  const NMSAttrs* param = attrs.as<NMSAttrs>();
+  const NonMaximumSuppressionAttrs* param =
+    attrs.as<NonMaximumSuppressionAttrs>();
   const auto& dshape = data->shape;
   const auto& vshape = valid_count->shape;
   CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
@@ -91,7 +92,7 @@ Expr MakeNMS(Expr data,
              int id_index,
              bool return_indices,
              bool invalid_to_bottom) {
-  auto attrs = make_node<NMSAttrs>();
+  auto attrs = make_node<NonMaximumSuppressionAttrs>();
   attrs->max_output_size = max_output_size;
   attrs->iou_threshold = iou_threshold;
   attrs->force_suppress = force_suppress;
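The renamed NonMaximumSuppressionAttrs fields above map one-to-one onto the keyword arguments of the Relay operator. A hedged sketch of a call site follows; the keyword names come from the attrs declarations in this patch, while the default values shown here are illustrative assumptions, not quoted defaults.

    from tvm import relay

    data = relay.var("data", shape=(1, 5, 6))  # [class_id, score, l, t, r, b] per anchor
    valid_count = relay.var("valid_count", shape=(1,), dtype="int32")
    out = relay.vision.non_max_suppression(
        data, valid_count,
        max_output_size=-1,    # keep every surviving box
        iou_threshold=0.5,     # overlap threshold for suppression
        force_suppress=False,  # suppress only within the same class_id
        top_k=-1,              # no pre-NMS truncation
        id_index=0,
        return_indices=False,
        invalid_to_bottom=True)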
From c2e02e415e7ce9c7cdfc57023ce3e733150284ba Mon Sep 17 00:00:00 2001
From: Wang
Date: Thu, 7 Mar 2019 14:56:58 -0800
Subject: [PATCH 42/43] Move tutorial

---
 tutorials/{relay => frontend}/deploy_ssd_gluoncv.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tutorials/{relay => frontend}/deploy_ssd_gluoncv.py (100%)

diff --git a/tutorials/relay/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py
similarity index 100%
rename from tutorials/relay/deploy_ssd_gluoncv.py
rename to tutorials/frontend/deploy_ssd_gluoncv.py

From d20024c2a35802c5f0ab1717a8d32851123857ad Mon Sep 17 00:00:00 2001
From: Wang
Date: Thu, 7 Mar 2019 16:54:21 -0800
Subject: [PATCH 43/43] Resolve conflict

---
 python/tvm/relay/frontend/mxnet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 58de937fdbb8..cdfa75e50419 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -434,7 +434,6 @@ def _mx_l2_normalize(inputs, attrs):
     "exp",
     "negative",
     "reshape_like",
-    "slice_like",
     "zeros_like",
     "ones_like",
     "where",
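Patch 42 only relocates the GluonCV SSD tutorial, which is the main end-to-end user of the NMS work in this series. A rough sketch of the flow that tutorial covers is included below for orientation; the model name and the exact return values of from_mxnet are assumptions here and have varied across TVM releases.

    from tvm import relay
    from gluoncv import model_zoo

    # Load a pretrained SSD model and convert it through the MXNet frontend.
    block = model_zoo.get_model("ssd_512_resnet50_v1_voc", pretrained=True)
    dshape = (1, 3, 512, 512)
    net, params = relay.frontend.from_mxnet(block, shape={"data": dshape})

    # Compile for CPU; non_max_suppression ends up as an op in the compiled graph.
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, "llvm", params=params)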