From 5d285b09b77c58dce24f200f543e58a2ed2fb7ff Mon Sep 17 00:00:00 2001
From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com>
Date: Fri, 21 Jul 2023 21:48:36 -0400
Subject: [PATCH] [MIGraphX EP] Fix CopyTensorAsync and add guards for stream
 sync CopyTensors (#16787)

Add compile guards gated on MIGRAPHX_STREAM_SYNC for the following:

- Remove the excess hipStreamSynchronize calls on the null stream in
  CopyTensor.
- Add the proper stream-synchronized CopyTensorAsync call for the
  DeviceToHost case.

Without this change, subsequent CopyTensorAsync() calls fail on cards that
don't use pinned memory, because the hipMemcpy() calls run before certain
kernel operations have completed.

![image](https://github.com/microsoft/onnxruntime/assets/107195283/4915c18a-fb2d-40c9-a50e-a7c6613c324b)

becomes

![image](https://github.com/microsoft/onnxruntime/assets/107195283/f661acf4-e2af-4c9a-b26a-30fca339cf1d)

---------

Co-authored-by: Ted Themistokleous
---
 onnxruntime/core/providers/migraphx/gpu_data_transfer.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
index 1a5c3c6d912fd..72193ef6268c1 100644
--- a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
@@ -24,17 +24,14 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
       // Copy only if the two addresses are different.
       if (dst_data != src_data) {
         HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
-        HIP_CALL_THROW(hipStreamSynchronize(nullptr));
       }
     } else {
       // copy from other CPU memory to GPU, this is blocking
       HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
-      HIP_CALL_THROW(hipStreamSynchronize(nullptr));  // TODO: still need stream sync? since already blocking
     }
   } else if (src_device.Type() == OrtDevice::GPU) {
     // copying from GPU to CPU memory, this is blocking
     HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
-    HIP_CALL_THROW(hipStreamSynchronize(nullptr));  // TODO: still need stream sync? since already blocking
   } else {
     // copying between cpu memory
     memcpy(dst_data, src_data, bytes);
@@ -63,6 +60,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
       HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
     }
   } else if (src_device.Type() == OrtDevice::GPU) {
+#ifndef MIGRAPHX_STREAM_SYNC
     if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
       // copying from GPU to pinned memory, this is non-blocking
       HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
@@ -70,6 +68,9 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
       // copying from GPU to CPU memory, this is blocking
       HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
     }
+#else
+    HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
+#endif
   } else {
     // copying between cpu memory
     memcpy(dst_data, src_data, bytes);
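
For context, a minimal standalone HIP sketch (not part of the patch and not ONNX Runtime code; the file layout, buffer size, and `HIP_CHECK` macro are made up for illustration) of the two device-to-host copy patterns the diff touches: the plain blocking `hipMemcpy` that the non-stream-sync path keeps, and a `hipMemcpyAsync` enqueued on an explicit stream followed by `hipStreamSynchronize` on that same stream, which is the ordering the `MIGRAPHX_STREAM_SYNC` path relies on:

```cpp
// hip_copy_sketch.cc -- build with: hipcc hip_copy_sketch.cc -o hip_copy_sketch
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Simple error-checking helper (illustrative stand-in for HIP_CALL_THROW).
#define HIP_CHECK(expr)                                                   \
  do {                                                                    \
    hipError_t err_ = (expr);                                             \
    if (err_ != hipSuccess) {                                             \
      std::fprintf(stderr, "HIP error %d at %s:%d\n",                     \
                   static_cast<int>(err_), __FILE__, __LINE__);           \
      std::exit(1);                                                       \
    }                                                                     \
  } while (0)

int main() {
  constexpr size_t kCount = 1 << 20;
  constexpr size_t kBytes = kCount * sizeof(float);

  std::vector<float> host_src(kCount, 1.0f), host_dst(kCount, 0.0f);
  float* dev = nullptr;
  HIP_CHECK(hipMalloc(&dev, kBytes));

  // Host -> device: a plain hipMemcpy already blocks until the copy is done,
  // which is why the patch drops the extra hipStreamSynchronize(nullptr).
  HIP_CHECK(hipMemcpy(dev, host_src.data(), kBytes, hipMemcpyHostToDevice));

  // Device -> host on an explicit stream: the async copy is ordered after any
  // work previously enqueued on that stream, and the caller synchronizes the
  // same stream before reading the host buffer.
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipMemcpyAsync(host_dst.data(), dev, kBytes,
                           hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  std::printf("host_dst[0] = %f\n", host_dst[0]);

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(dev));
  return 0;
}
```

The relevant design point, as reflected in the diff: with `MIGRAPHX_STREAM_SYNC` defined, the device-to-host branch of `CopyTensorAsync` always enqueues `hipMemcpyAsync` on the provider's stream, so the copy is ordered after kernels already submitted on that stream even when the destination host buffer is not pinned, instead of falling back to a blocking `hipMemcpy` that can run before those kernels complete.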