[TOPI] Improve get_valid_count and nms performance for CUDA (#5339)

* get_valid_count updated to have correct results
* speedup nms
* update nms
* revert back nms
* recover one test for get_valid_count
apache · Apr 15, 2020 · d81b006 · d81b006
1 parent 1265983
commit d81b006
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 295 deletions.
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
@@ -853,7 +853,6 @@ def _mx_smooth_l1(inputs, attrs):
 
 def _mx_deformable_convolution(inputs, attrs):
     new_attrs = {}
-    assert attrs.get_bool("no_bias")
     new_attrs["kernel_size"] = attrs.get_int_tuple("kernel")
     new_attrs["strides"] = attrs.get_int_tuple("stride")
     new_attrs["padding"] = attrs.get_int_tuple("pad")

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
@@ -225,6 +225,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
+            # get_valid_count for cuda doesn't do data rearrangement
+            if target == 'cuda':
+                return
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
 
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)