diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index b867162abc9b..07fe04782bd0 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -419,7 +419,14 @@ inline void PushFCompute(const FCompute& fn,
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
 #if MXNET_USE_MKLDNN == 1
-      InvalidateOutputs(outputs, req);
+      if (exec_type != ExecType::kCrossDeviceCopy) {
+        // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+        // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+        // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+        // invalidate the outputs of the second copy here: the copy from A to B may not have
+        // run yet, and invalidating A would corrupt its memory.
+        InvalidateOutputs(outputs, req);
+      }
 #endif
       std::vector<OpReqType> tmp_req = req;
       // setup blobs
@@ -461,7 +468,14 @@ inline void PushFComputeEx(const FComputeEx& fn,
   const auto& run = [=](RunContext rctx) {
       OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested};
 #if MXNET_USE_MKLDNN == 1
-      InvalidateOutputs(outputs, req);
+      if (exec_type != ExecType::kCrossDeviceCopy) {
+        // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+        // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+        // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+        // invalidate the outputs of the second copy here: the copy from A to B may not have
+        // run yet, and invalidating A would corrupt its memory.
+        InvalidateOutputs(outputs, req);
+      }
 #endif
       fn(attrs, opctx, inputs, req, outputs);
       if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) {
@@ -508,7 +522,14 @@ inline void PushOperator(const OpStatePtr& state,
                           engine::CallbackOnComplete on_complete) {
         OpContext opctx{need_grad, is_train, rctx, on_complete, requested};
 #if MXNET_USE_MKLDNN == 1
-        InvalidateOutputs(outputs, req);
+        if (exec_type != ExecType::kCrossDeviceCopy) {
+          // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+          // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+          // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+          // invalidate the outputs of the second copy here: the copy from A to B may not have
+          // run yet, and invalidating A would corrupt its memory.
+          InvalidateOutputs(outputs, req);
+        }
 #endif
         fcompute_ex(state, opctx, inputs, req, outputs);
         if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync
@@ -547,7 +568,14 @@ inline void PushOperator(const OpStatePtr& state,
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
 #if MXNET_USE_MKLDNN == 1
-        InvalidateOutputs(outputs, req);
+        if (exec_type != ExecType::kCrossDeviceCopy) {
+          // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+          // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+          // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+          // invalidate the outputs of the second copy here: the copy from A to B may not have
+          // run yet, and invalidating A would corrupt its memory.
+          InvalidateOutputs(outputs, req);
+        }
 #endif
         std::vector<OpReqType> tmp_req = req;
         // populate input blobs and output blobs
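
For context, here is a minimal standalone sketch of the hazard this patch guards against. It is not MXNet code: `AsyncPush`/`RunAll`, `CopyTo`, and `Buffer` are invented stand-ins for the real engine, `_copy_to`, and NDArray storage. The point it demonstrates is that eagerly clearing the destination at push time (as `InvalidateOutputs` would) destroys data that an earlier, still-queued copy has not yet read.

```cpp
// Toy model of the reordering hazard (assumed names, not MXNet APIs):
// an engine that runs pushed copies later, and a CopyTo that eagerly
// invalidates its destination at push time, as InvalidateOutputs would.
#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

using Buffer = std::vector<int>;
std::queue<std::function<void()>> engine;  // pushed work, executed later

void AsyncPush(std::function<void()> op) { engine.push(std::move(op)); }
void RunAll() {
  while (!engine.empty()) { engine.front()(); engine.pop(); }
}

void CopyTo(const Buffer& src, Buffer& dst, bool eager_invalidate) {
  if (eager_invalidate)
    dst.assign(dst.size(), 0);               // models InvalidateOutputs at push time
  AsyncPush([&src, &dst]() { dst = src; });  // the actual copy runs later
}

int main() {
  Buffer A = {1, 2, 3}, B(3);
  // Copy A -> B, then B -> A. Pushing the second copy invalidates A
  // *before* the first copy has read it, so A's contents are lost.
  CopyTo(A, B, /*eager_invalidate=*/true);
  CopyTo(B, A, /*eager_invalidate=*/true);
  RunAll();
  std::printf("A = {%d, %d, %d}\n", A[0], A[1], A[2]);  // prints {0, 0, 0}
  // With eager_invalidate = false (the kCrossDeviceCopy guard), A survives.
}
```

In the real code the invalidation happens when the operator's lambda runs, but per the comment `_copy_to`'s FComputeEx pushes the actual copy still later, so the relative ordering problem is the same; skipping invalidation for kCrossDeviceCopy sidesteps it.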