diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index e10fafa7856bd..82d9399a170a5 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -179,8 +179,12 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res,
                    check_type_only=False):
         x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
         x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int"))
-        z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k, return_indices=False)
-        z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k)
+        z = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \
+            iou_threshold = iou_threshold, force_suppress = force_suppress, \
+            top_k = top_k, return_indices=False)
+        z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \
+                    iou_threshold = iou_threshold, force_suppress = force_suppress, \
+                    top_k = top_k)
         assert "iou_threshold" in z.astext()
         assert "iou_threshold" in z_indices.astext()
         zz = relay.ir_pass.infer_type(z)
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 33d186959324d..6118ff11f9ce6 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -64,6 +64,10 @@ def get_valid_counts_pre(data, flag, idx, score_threshold):
             flag[tid] = 0
             idx[tid] = 0
 
+    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
+                          tvm.convert(['shared']),
+                          tvm.expr.Call.Intrinsic, None, 0))
+
     with ib.if_scope(tid < batch_size):
         with ib.for_range(0, num_anchors) as k:
             with ib.if_scope(k > 0):
diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py
index 68b275066c622..cb92e5ff44c72 100644
--- a/topi/python/topi/cuda/ssd/multibox.py
+++ b/topi/python/topi/cuda/ssd/multibox.py
@@ -438,7 +438,7 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01
     """
     inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor,
                                        clip, threshold, variances)
-    out = non_max_suppression(
-        inter_out[0], inter_out[1], -1, nms_threshold, force_suppress, \
-        nms_topk, return_indices=False)
+    out = non_max_suppression(inter_out[0], inter_out[1], max_output_size = -1,
+                              iou_threshold = nms_threshold, force_suppress = force_suppress,
+                              top_k = nms_topk, return_indices=False)
     return out
diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py
index 4234bedf06fd8..f32e21183f037 100644
--- a/topi/python/topi/cuda/vision.py
+++ b/topi/python/topi/cuda/vision.py
@@ -17,11 +17,7 @@ def _default_schedule(outs):
     def traverse(op):
         """inline all one-to-one-mapping operators except the last stage (output)"""
         if op.tag in ["nms", "invalid_to_bottom"]:
-            if op.name in ['nms']:
-                sort = op.input_tensors[1]
-            else:
-                out = op.input_tensors[0]
-                sort = s[out].op.input_tensors[1]
+            sort = op.input_tensors[1]
             score = s[sort].op.input_tensors[0]
             fused = s[score].fuse(*s[score].op.axis)
             num_thread = tvm.target.current_target(allow_none=False).max_num_threads
diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py
index 2de1723dbd7b7..81ce808432cba 100644
--- a/topi/python/topi/vision/ssd/multibox.py
+++ b/topi/python/topi/vision/ssd/multibox.py
@@ -292,7 +292,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm
     """
     inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor,
                                        clip, threshold, variances)
-    out = non_max_suppression(inter_out[0], inter_out[1], -1,
-                              nms_threshold, force_suppress, nms_topk,
-                              return_indices=False)
+    out = non_max_suppression(inter_out[0], inter_out[1], max_output_size = -1,
+                              iou_threshold = nms_threshold, force_suppress = force_suppress,
+                              top_k = nms_topk, return_indices=False)
     return out