diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index b867162abc9b..07fe04782bd0 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -419,7 +419,14 @@ inline void PushFCompute(const FCompute& fn,
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
 #if MXNET_USE_MKLDNN == 1
-      InvalidateOutputs(outputs, req);
+      if (exec_type != ExecType::kCrossDeviceCopy) {
+        // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+        // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+        // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+        // invalidate the outputs of the second copy here: the copy from A to B may not have
+        // run yet, and invalidating A would corrupt its memory.
+        InvalidateOutputs(outputs, req);
+      }
 #endif
       std::vector<OpReqType> tmp_req = req;
       // setup blobs
@@ -461,7 +468,14 @@ inline void PushFComputeEx(const FComputeEx& fn,
   const auto& run = [=](RunContext rctx) {
       OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested};
 #if MXNET_USE_MKLDNN == 1
-      InvalidateOutputs(outputs, req);
+      if (exec_type != ExecType::kCrossDeviceCopy) {
+        // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+        // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+        // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+        // invalidate the outputs of the second copy here: the copy from A to B may not have
+        // run yet, and invalidating A would corrupt its memory.
+        InvalidateOutputs(outputs, req);
+      }
 #endif
       fn(attrs, opctx, inputs, req, outputs);
       if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) {
@@ -508,7 +522,14 @@ inline void PushOperator(const OpStatePtr& state,
                           engine::CallbackOnComplete on_complete) {
         OpContext opctx{need_grad, is_train, rctx, on_complete, requested};
 #if MXNET_USE_MKLDNN == 1
-        InvalidateOutputs(outputs, req);
+        if (exec_type != ExecType::kCrossDeviceCopy) {
+          // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+          // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+          // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+          // invalidate the outputs of the second copy here: the copy from A to B may not have
+          // run yet, and invalidating A would corrupt its memory.
+          InvalidateOutputs(outputs, req);
+        }
 #endif
         fcompute_ex(state, opctx, inputs, req, outputs);
         if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync
@@ -547,7 +568,14 @@ inline void PushOperator(const OpStatePtr& state,
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
 #if MXNET_USE_MKLDNN == 1
-        InvalidateOutputs(outputs, req);
+        if (exec_type != ExecType::kCrossDeviceCopy) {
+          // kCrossDeviceCopy is used for the `_copy_to` operator, which doesn't perform the copy
+          // immediately in its FComputeEx, but AsyncPushes it to the engine instead.
+          // So if A holds mkldnn memory and we copy A to B, then copy B back to A, we must not
+          // invalidate the outputs of the second copy here: the copy from A to B may not have
+          // run yet, and invalidating A would corrupt its memory.
+          InvalidateOutputs(outputs, req);
+        }
 #endif
         std::vector<OpReqType> tmp_req = req;
         // populate input blobs and output blobs
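
For context, here is a minimal standalone sketch of the hazard this patch guards against. It is not MXNet code: `AsyncPush`/`RunAll`, `CopyTo`, and `Buffer` are invented stand-ins for the real engine, `_copy_to`, and NDArray storage. The point it demonstrates is that eagerly clearing the destination at push time (as `InvalidateOutputs` would) destroys data that an earlier, still-queued copy has not yet read.

```cpp
// Toy model of the reordering hazard (assumed names, not MXNet APIs):
// an engine that runs pushed copies later, and a CopyTo that eagerly
// invalidates its destination at push time, as InvalidateOutputs would.
#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

using Buffer = std::vector<int>;
std::queue<std::function<void()>> engine;  // pushed work, executed later

void AsyncPush(std::function<void()> op) { engine.push(std::move(op)); }
void RunAll() {
  while (!engine.empty()) { engine.front()(); engine.pop(); }
}

void CopyTo(const Buffer& src, Buffer& dst, bool eager_invalidate) {
  if (eager_invalidate)
    dst.assign(dst.size(), 0);               // models InvalidateOutputs at push time
  AsyncPush([&src, &dst]() { dst = src; });  // the actual copy runs later
}

int main() {
  Buffer A = {1, 2, 3}, B(3);
  // Copy A -> B, then B -> A. Pushing the second copy invalidates A
  // *before* the first copy has read it, so A's contents are lost.
  CopyTo(A, B, /*eager_invalidate=*/true);
  CopyTo(B, A, /*eager_invalidate=*/true);
  RunAll();
  std::printf("A = {%d, %d, %d}\n", A[0], A[1], A[2]);  // prints {0, 0, 0}
  // With eager_invalidate = false (the kCrossDeviceCopy guard), A survives.
}
```

In the real code the invalidation happens when the operator's lambda runs, but per the comment `_copy_to`'s FComputeEx pushes the actual copy still later, so the relative ordering problem is the same; skipping invalidation for kCrossDeviceCopy sidesteps it.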