diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f79a65464c7..16ec66b93a1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -374,7 +374,6 @@ tvm_file_glob(GLOB RUNTIME_SRCS
   src/runtime/disco/*.cc
   src/runtime/minrpc/*.cc
   src/runtime/relax_vm/*.cc
-  src/runtime/relax_vm/*.cu
 )
 set(TVM_RUNTIME_EXT_OBJS "")
 
@@ -481,7 +480,7 @@ if (USE_CUDA AND USE_NVSHMEM)
   endif()
   set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-  tvm_file_glob(GLOB RUNTIME_NVSHMEM_SRCS src/runtime/contrib/nvshmem/*.cc)
+  tvm_file_glob(GLOB RUNTIME_NVSHMEM_SRCS src/runtime/contrib/nvshmem/*.cc src/runtime/relax_vm/*.cu)
   list(APPEND RUNTIME_SRCS ${RUNTIME_NVSHMEM_SRCS})
 endif()
 
diff --git a/src/runtime/contrib/nvshmem/init.cc b/src/runtime/contrib/nvshmem/init.cc
index bd1601869569..2733c595720a 100644
--- a/src/runtime/contrib/nvshmem/init.cc
+++ b/src/runtime/contrib/nvshmem/init.cc
@@ -57,21 +57,22 @@ void InitNVSHMEM(ShapeTuple uid_64, int num_workers, int worker_id_start) {
   for (int i = 0; i < UNIQUEID_PADDING; ++i) {
     uid.internal[i] = static_cast<char>(uid_64[i + 1]);
   }
-  //FIXME: this is a hack to avoid the issue of NVSHMEM using Multi-process-per-GPU to initialize
+  // FIXME: this is a hack to avoid the issue of NVSHMEM using Multi-process-per-GPU to initialize
   cudaSetDevice(worker_id);
   nvshmemx_set_attr_uniqueid_args(worker_id, num_workers, &uid, &attr);
   nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
   int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
   CUDA_CALL(cudaSetDevice(mype_node));
-  if(worker!=nullptr){
-    if(worker->default_device.device_type == DLDeviceType::kDLCPU){
+  if (worker != nullptr) {
+    if (worker->default_device.device_type == DLDeviceType::kDLCPU) {
       worker->default_device = Device{DLDeviceType::kDLCUDA, mype_node};
     } else {
       ICHECK(worker->default_device.device_type == DLDeviceType::kDLCUDA &&
              worker->default_device.device_id == mype_node)
           << "The default device of the worker is inconsistent with the device used for NVSHMEM. "
-          << "The default device is " << worker->default_device << ", but the device used for NVSHMEM is "
-          << Device{DLDeviceType::kDLCUDA, mype_node} << ".";
+          << "The default device is " << worker->default_device
+          << ", but the device used for NVSHMEM is " << Device{DLDeviceType::kDLCUDA, mype_node}
+          << ".";
     }
   }
   LOG_INFO << "NVSHMEM init finished: mype=" << nvshmem_my_pe() << " "
diff --git a/tests/python/relax/test_runtime_builtin_kv_cache_transfer.py b/tests/python/relax/test_runtime_builtin_kv_cache_transfer.py
index 849360c7c552..7496ac45ef9c 100644
--- a/tests/python/relax/test_runtime_builtin_kv_cache_transfer.py
+++ b/tests/python/relax/test_runtime_builtin_kv_cache_transfer.py
@@ -149,5 +149,6 @@ def test_kv_transfer_with_disco():
         sess._sync_worker(i)
             
 if __name__ == "__main__":
+    # FIXME: only one test can be run at a time
     test_kv_transfer_without_disco()
     # test_kv_transfer_with_disco()