From 85ce0c1d30935419bd380826b69b474734046930 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Tue, 25 Jul 2023 16:24:17 -0400 Subject: [PATCH] [MIGraphX EP] Fix CopyTensorAsync and add guards for stream sync CopyTensors (#16787) (#13) Add compile guards to gate functionality based on MIGRAPHX_STREAM_SYNC for adding the following - remove excess hipStreamSynchronize to nullstream on CopyTensor calls - Add proper call for stream synchronized CopyTensorAsync for DeviceToHost case Without this change subsequent CopyTensorAsync() calls will fail for cards that don't use pinned memory thus causing hipMemcpy() calls to occur before certain kernel operations occur. ![image](https://github.com/microsoft/onnxruntime/assets/107195283/4915c18a-fb2d-40c9-a50e-a7c6613c324b) becomes ![image](https://github.com/microsoft/onnxruntime/assets/107195283/f661acf4-e2af-4c9a-b26a-30fca339cf1d) --------- Co-authored-by: Ted Themistokleous --- onnxruntime/core/providers/migraphx/gpu_data_transfer.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc index 2a306b369b3b3..df4afd8714b4d 100644 --- a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc @@ -25,17 +25,14 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const // Copy only if the two addresses are different. if (dst_data != src_data) { HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); } } else { // copy from other CPU memory to GPU, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); // TODO: still need stream sync? 
since already blocking } } else if (src_device.Type() == OrtDevice::GPU) { // copying from GPU to CPU memory, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); // TODO: still need stream sync? since already blocking } else { // copying between cpu memory memcpy(dst_data, src_data, bytes); @@ -64,6 +61,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); } } else if (src_device.Type() == OrtDevice::GPU) { +#ifndef MIGRAPHX_STREAM_SYNC if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) { // copying from GPU to pinned memory, this is non-blocking HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle()))); @@ -71,6 +69,9 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, // copying from GPU to CPU memory, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); } +#else + HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle()))); +#endif } else { // copying between cpu memory memcpy(dst_data, src_data, bytes);