diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f3b03570db33..55eb5c0ac9c4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -723,6 +723,7 @@ build_ubuntu_gpu_mkldnn() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CPP_PACKAGE=ON \
@@ -736,6 +737,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=OFF \
@@ -750,6 +752,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=ON \
@@ -789,6 +792,7 @@ build_ubuntu_gpu_cuda101_cudnn7_make() {
         USE_CUDA=1 \
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1 \
+        USE_TVM_OP=1 \
         USE_CPP_PACKAGE=1 \
         USE_DIST_KVSTORE=1 \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -808,6 +812,7 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
         USE_CUDA=1 \
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1 \
+        USE_TVM_OP=0 \
         USE_CPP_PACKAGE=1 \
         USE_DIST_KVSTORE=1 \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -817,6 +822,23 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
     make cython PYTHON=python3
 }
 
+build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
+    set -ex
+    cd /work/build
+    CC=gcc-7 CXX=g++-7 cmake \
+        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=OFF \
+        -DUSE_CUDA=ON \
+        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
+        -DUSE_CUDNN=ON \
+        -DUSE_MKLDNN=OFF \
+        -DBUILD_CYTHON_MODULES=ON \
+        -DUSE_DIST_KVSTORE=ON \
+        -G Ninja /work/mxnet
+    ninja
+}
+
 build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
@@ -847,6 +869,7 @@ build_ubuntu_gpu_cmake() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=OFF \
@@ -867,6 +890,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=ON \
@@ -881,6 +905,27 @@ build_ubuntu_gpu_cmake_no_rtc() {
     ninja
 }
 
+build_ubuntu_gpu_cmake_no_tvm_op() {
+    set -ex
+    cd /work/build
+    CC=gcc-7 CXX=g++-7 cmake \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DUSE_CUDA=ON \
+        -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=OFF \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_MKLML_MKL=OFF \
+        -DUSE_MKLDNN=OFF \
+        -DUSE_DIST_KVSTORE=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
+        -DBUILD_CYTHON_MODULES=1 \
+        -G Ninja \
+        /work/mxnet
+
+    ninja
+}
+
 build_ubuntu_cpu_large_tensor() {
     set -ex
     cd /work/build
@@ -903,6 +948,7 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=OFF \
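Every GPU build variant above now compiles the TVM-generated operators explicitly (`USE_TVM_OP=ON` / `USE_TVM_OP=1`), while the new `*_no_tvm_op` functions build the same configurations with the feature disabled. A quick way to confirm which path a given binary actually took is to query the runtime feature list; the snippet below is a hypothetical sanity check, not part of this diff, and assumes the `TVM_OP` flag is exposed through `mxnet.runtime` as in recent 1.x builds:

```python
# Hedged sanity check (not part of this diff): report whether the MXNet
# library under test was compiled with USE_TVM_OP. Assumes 'TVM_OP' appears
# in the libinfo feature list, as in recent MXNet 1.x releases.
from mxnet.runtime import Features

features = Features()
print('TVM_OP enabled:', features.is_enabled('TVM_OP'))
```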
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 8ac63ef5a9d6..61640182b3ca 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -310,6 +310,20 @@ def compile_unix_full_gpu_mkldnn_cpp_test(lib_name) {
   }]
 }
 
+def compile_unix_full_gpu_no_tvm_op() {
+  return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/build-gpu-no-tvm-op') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false)
+          utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_cmake_gpu(lib_name) {
   return ['GPU: CMake': {
     node(NODE_LINUX_CPU) {
@@ -324,6 +338,19 @@ def compile_unix_cmake_gpu(lib_name) {
   }]
 }
 
+def compile_unix_cmake_gpu_no_tvm_op() {
+  return ['GPU: CMake TVM_OP OFF': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/build-cmake-gpu-no-tvm-op') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_tvm_op', false)
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_cmake_gpu_no_rtc(lib_name) {
   return ['GPU: CMake CUDA RTC OFF': {
     node(NODE_LINUX_CPU) {
@@ -338,6 +365,22 @@ def compile_unix_cmake_gpu_no_rtc(lib_name) {
   }]
 }
 
+def test_unix_python3_gpu_no_tvm_op() {
+  return ['Python3: GPU TVM_OP OFF': {
+    node(NODE_LINUX_GPU) {
+      ws('workspace/ut-python3-gpu-no-tvm-op') {
+        try {
+          utils.unpack_and_init('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
+          python3_gpu_ut_cython('ubuntu_gpu_cu101')
+          utils.publish_test_coverage()
+        } finally {
+          utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_gpu.xml')
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_tensorrt_gpu(lib_name) {
   return ['TensorRT': {
     node(NODE_LINUX_CPU) {
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index 84ac2bcb4623..c35af02598f2 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -42,6 +42,8 @@ core_logic: {
     custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
     custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
     custom_steps.compile_unix_int64_gpu('gpu_int64'),
+    custom_steps.compile_unix_full_gpu_no_tvm_op(),
+    custom_steps.compile_unix_cmake_gpu_no_tvm_op(),
     custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
     custom_steps.compile_unix_full_gpu_mkldnn_cpp_test('gpu_mkldnn_cpp_test_make')
   ])
@@ -61,6 +63,7 @@ core_logic: {
     // TODO(szha): fix and reenable the hanging issue. tracked in #18098
     // custom_steps.test_unix_distributed_kvstore_gpu('gpu'),
     custom_steps.test_unix_byteps_gpu('gpu'),
+    custom_steps.test_unix_python3_gpu_no_tvm_op(),
     custom_steps.test_unix_capi_cpp_package('gpu_mkldnn_cpp_test_make'),
   ])
 }
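The new `test_unix_python3_gpu_no_tvm_op` stage runs the Python GPU unit tests against the `gpu_no_tvm_op` artifact, so any operator that would normally dispatch to a TVM-generated kernel has to take its native fallback. A minimal smoke test of that fallback path could look like the sketch below; `rad2deg` is only assumed here to be one of the TVM-backed operators, since the exact operator set is not stated in this diff:

```python
# Hedged smoke test (illustrative only): run an operator that is assumed to
# have a TVM-generated GPU kernel when USE_TVM_OP=ON and a native fallback
# otherwise. Both builds should produce the same numerical result.
import math
import mxnet as mx
from mxnet import np, npx

npx.set_np()  # enable NumPy-compatible semantics
x = np.array([0.0, math.pi / 2, math.pi], ctx=mx.gpu(0))
print(np.rad2deg(x))  # expected: [0. 90. 180.] with or without TVM ops
```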
diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py
index 6341e70fe766..a104f0861f2d 100644
--- a/contrib/tvmop/compile.py
+++ b/contrib/tvmop/compile.py
@@ -152,6 +152,12 @@ def get_cuda_arch(arch):
     # we create libtvmop.o first, which gives us chance to link tvm_runtime together with the libtvmop
     # to allow mxnet find external helper functions in libtvm_runtime
     func_binary.save(arguments.target_path + "/libtvmop.o")
+    try:
+        func_binary.imported_modules
+    except AttributeError:
+        func_binary.imported_modules = []
+    if len(func_binary.imported_modules):
+        func_binary.imported_modules[0].save(arguments.target_path + "/libtvmop.cubin")
     ld_path = arguments.target_path if arguments.ld_path is None else arguments.ld_path
     create_shared(arguments.target_path + "/libtvmop.so",
                   arguments.target_path + "/libtvmop.o",
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index ea39d9ac6e5b..a1e5027ce5f1 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1363,7 +1363,15 @@ int MXGetVersion(int *out) {
 
 #if MXNET_USE_TVM_OP
 int MXLoadTVMOp(const char *libpath) {
   API_BEGIN();
-  tvm::runtime::TVMOpModule::Get()->Load(libpath);
+  tvm::runtime::TVMOpModule *libpath_module = tvm::runtime::TVMOpModule::Get();
+  libpath_module->Load(libpath);
+#if MXNET_USE_CUDA
+  std::string libpathstr(libpath);
+  std::string cubinpath = libpathstr.substr(0, libpathstr.size() - 11) + "libtvmop.cubin";
+  tvm::runtime::TVMOpModule cubin_module;
+  cubin_module.Load(cubinpath);
+  libpath_module->Import(cubin_module);
+#endif
   API_END();
 }
diff --git a/src/operator/tvmop/op_module.cc b/src/operator/tvmop/op_module.cc
index c75e5a990086..d833ae0585c2 100644
--- a/src/operator/tvmop/op_module.cc
+++ b/src/operator/tvmop/op_module.cc
@@ -46,6 +46,12 @@ void TVMOpModule::Load(const std::string &filepath) {
   *module_ptr_ = module;
 }
 
+void TVMOpModule::Import(const TVMOpModule& module) {
+  CHECK(module_ptr_ != nullptr) << "module_ptr_ is not initialized.";
+  std::lock_guard<std::mutex> lock(mutex_);
+  module_ptr_->Import(*(module.module_ptr_));
+}
+
 PackedFunc GetFunction(const std::shared_ptr<Module> &module,
                        const std::string &op_name,
                        const std::vector<mxnet::TBlob> &args) {
diff --git a/src/operator/tvmop/op_module.h b/src/operator/tvmop/op_module.h
index 269a0aa50c11..8a25b3b9951e 100644
--- a/src/operator/tvmop/op_module.h
+++ b/src/operator/tvmop/op_module.h
@@ -44,6 +44,8 @@ class TVMOpModule {
   // Load TVM operators binary
   void Load(const std::string& filepath);
 
+  void Import(const TVMOpModule& module);
+
   void Call(const std::string& func_name,
             const mxnet::OpContext& ctx,
             const std::vector<mxnet::TBlob>& args) const;
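For background on the `libtvmop.cubin` handling: when TVM builds CUDA kernels, the module returned by `tvm.build` contains only host-side code, and the generated device code lives in `imported_modules`, so saving the top-level module drops the cubin unless it is written out separately. On the MXNet side, `MXLoadTVMOp` strips the trailing 11 characters (the length of `"libtvmop.so"`) from `libpath` to locate `libtvmop.cubin` in the same directory, loads it as a second `TVMOpModule`, and re-attaches it via `Import` so host functions can find their device kernels. A minimal sketch of the host/device split, assuming a TVM version with the `te` schedule API and a local CUDA toolchain:

```python
# Minimal sketch (assumes TVM with CUDA support and the te API): tvm.build
# returns a host module; the generated CUDA code is a separate imported
# module that must be saved on its own, as compile.py does above.
import tvm
from tvm import te

n = te.var('n')
A = te.placeholder((n,), name='A')
B = te.compute((n,), lambda i: A[i] + 1.0, name='B')
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis('blockIdx.x'))
s[B].bind(tx, te.thread_axis('threadIdx.x'))

mod = tvm.build(s, [A, B], target='cuda')
mod.save('/tmp/libtvmop.o')                  # host-side object code only
if mod.imported_modules:                     # device code lives here
    mod.imported_modules[0].save('/tmp/libtvmop.cubin')
```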