diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index d37c0136b21da..ae050f0891c44 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -113,7 +113,7 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out):
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
 
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
+    max_threads = int(math_sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
     nthread_tx = max_threads
     nthread_bx = batch_size * num_anchors * elem_length // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py
index c0307947cb184..04fd30d34a4d3 100644
--- a/topi/python/topi/cuda/ssd/multibox.py
+++ b/topi/python/topi/cuda/ssd/multibox.py
@@ -76,7 +76,6 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
         with ib.if_scope((j < in_width)):
             center_h = (i + offset_h) * steps_h
             center_w = (j + offset_w) * steps_w
-
             for k in range(num_sizes + num_ratios - 1):
                 w = if_then_else(k < num_sizes,
                                  size_ratio_concat[k] * in_height / in_width / 2.0,