From e2e052386b2b1d18c430e23b15f19c88b1529e7e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 4 Mar 2024 19:25:21 +0000 Subject: [PATCH] waitForEvents wait for native events - Wait for native HT events if waitForEvents is called. - Only return native events if they haven't been waited on - Fix bug in get_native for event which isn't host task - Make native input types single CUevent/HIPevents, not vectors. - Update tests with using structs as well as not requiring that get_native_events returns an empty vec if SYCL RT synchronization is used. --- .../sycl/detail/backend_traits_cuda.hpp | 3 +- .../sycl/detail/backend_traits_hip.hpp | 3 +- .../backend/backend_traits_cuda.hpp | 3 +- .../ext/oneapi/experimental/backend/cuda.hpp | 3 +- sycl/source/detail/backend_impl.hpp | 5 +- sycl/source/detail/event_impl.cpp | 15 +- sycl/source/detail/event_impl.hpp | 14 +- sycl/source/detail/scheduler/commands.cpp | 9 +- .../detail/scheduler/graph_processor.cpp | 7 +- sycl/source/detail/scheduler/scheduler.cpp | 24 +- sycl/source/detail/scheduler/scheduler.hpp | 5 + sycl/source/interop_handle.cpp | 3 - .../host-task-add-native-events-cuda.cpp | 47 +--- .../host-task-get-native-events-cuda.cpp | 264 ++++++++++-------- 14 files changed, 232 insertions(+), 173 deletions(-) diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index d06947642b45f..92fb136126ae1 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -87,8 +87,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = CUevent; + using type = CUevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/detail/backend_traits_hip.hpp b/sycl/include/sycl/detail/backend_traits_hip.hpp index c454ae8147510..b432472d1caa7 100644 --- a/sycl/include/sycl/detail/backend_traits_hip.hpp +++ b/sycl/include/sycl/detail/backend_traits_hip.hpp @@ -81,8 +81,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = HIPevent; + using type = HIPevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp index a3ce9b5b24be2..610ef5a7b84c4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp @@ -89,8 +89,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = CUevent; + using type = CUevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp index c5072699ad01c..5d3833080e589 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp @@ -26,7 +26,8 @@ inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) { // Implementation of cuda::has_native_event inline __SYCL_EXPORT bool has_native_event(event sycl_event) { if (sycl_event.get_backend() == backend::ext_oneapi_cuda) - return get_native(sycl_event) != nullptr; + return get_native(sycl_event).size() && + get_native(sycl_event).front() != nullptr; return false; } diff --git 
a/sycl/source/detail/backend_impl.hpp b/sycl/source/detail/backend_impl.hpp index d7b09beb673fd..37add4e8b43e3 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -18,8 +18,9 @@ namespace detail { template backend getImplBackend(const T &Impl) { // Experimental host task allows the user to get backend for event impls if constexpr (std::is_same_v>) { - assert(Impl->backendSet() && - "interop_handle::add_native_events must be used in order for a host " + assert((!Impl->is_host() || Impl->backendSet()) && + "interop_handle::add_native_events must be " + "used in order for a host " "task event to have a native event"); } else { assert(!Impl->is_host() && "Cannot get the backend for host."); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index a3197fbabf789..fdafbe7df467c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -69,6 +69,9 @@ void event_impl::waitInternal() { "waitInternal method cannot be used for a discarded event."); } else if (MState != HES_Complete) { // Wait for the host event + // In the case that the Host Task function stores native events with + // add_native_events, waitInternal will only wait on the lambda to complete, + // not on the asynchronous events std::unique_lock lock(MMutex); cv.wait(lock, [this] { return MState == HES_Complete; }); } @@ -79,8 +82,9 @@ void event_impl::waitInternal() { } void event_impl::waitForHostTaskNativeEvents() { - // This should only be called if we wait on a queue or a SYCL user - // event. It should not be called to managed device dependencies + std::unique_lock Lock(MHostTaskNativeEventsMutex); + if (MHostTaskNativeEventsHaveBeenWaitedOn.exchange(true)) + return; for (const EventImplPtr &Event : MHostTaskNativeEvents) Event->wait(Event); } @@ -424,11 +428,16 @@ pi_native_handle event_impl::getNative() { } std::vector event_impl::getNativeVector() { + // Return empty vec if native events have already been waited on + if (isCompleted() && (!hasHostTaskNativeEvents() || + MHostTaskNativeEventsHaveBeenWaitedOn.load())) + return {}; + // If there is a native event return that. This will also initialize context if (auto nativeEvent = getNative()) return {nativeEvent}; - // Return native events sumbitted via host task interop + // Return native events submitted via host task interop auto Plugin = getPlugin(); std::vector HandleVec; for (auto &HostTaskNativeEventImpl : MHostTaskNativeEvents) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index a3f3634e43930..cc8e1c0dbcdb4 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -312,7 +312,9 @@ class event_impl { return MHostTaskNativeEvents.size() > 0; } - bool backendSet() const { return hasHostTaskNativeEvents(); } + bool backendSet() const { + return !MContext->is_host() || hasHostTaskNativeEvents(); + } const std::vector &getHostTaskNativeEvents() const { return MHostTaskNativeEvents; @@ -347,16 +349,18 @@ class event_impl { std::weak_ptr MWorkerQueue; std::weak_ptr MSubmittedQueue; - // Used to hold pi_events for native events that are stored with - // interop_handle::add_native_events - std::vector MHostTaskNativeEvents; - /// Dependency events prepared for waiting by backend. 
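The event_impl change above guards waitForHostTaskNativeEvents so that the native events attached to a host task are waited on at most once, even if several threads reach the wait concurrently. A minimal standalone sketch of that wait-at-most-once idiom follows; the type and function names are illustrative, not the runtime's own:

#include <atomic>
#include <mutex>
#include <vector>

// Stand-in for a backend event handle (CUevent/HIPevent in the real code).
using NativeEvent = void *;

struct HostTaskNativeEventSet {
  std::vector<NativeEvent> Events;
  std::atomic<bool> WaitedOn{false}; // mirrors MHostTaskNativeEventsHaveBeenWaitedOn
  std::mutex Mutex;                  // mirrors MHostTaskNativeEventsMutex

  void waitOnce() {
    std::unique_lock<std::mutex> Lock(Mutex);
    // exchange() returns the previous value, so only the first caller sees
    // false and performs the actual wait; later callers return immediately.
    if (WaitedOn.exchange(true))
      return;
    for (NativeEvent Ev : Events)
      waitOnNativeEvent(Ev); // e.g. cuEventSynchronize in the CUDA case
  }

  static void waitOnNativeEvent(NativeEvent) { /* backend wait elided */ }
};
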
std::vector MPreparedDepsEvents; std::vector MPreparedHostDepsEvents; std::vector MPostCompleteEvents; + // Used to hold pi_events for native events that are stored with + // interop_handle::add_native_events + std::vector MHostTaskNativeEvents; + std::atomic MHostTaskNativeEventsHaveBeenWaitedOn = false; + std::mutex MHostTaskNativeEventsMutex; + /// Indicates that the task associated with this event has been submitted by /// the queue to the device. std::atomic MIsFlushed = false; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index ce66d00cf0254..09d9f92486f3d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -323,7 +323,7 @@ class DispatchHostTask { // to the event impl of the sycl event returned at // CGSubmit - pi_result waitForEvents() const { + pi_result waitForNativeDepEvents() const { std::map> RequiredEventsPerPlugin; @@ -359,6 +359,8 @@ class DispatchHostTask { // Host events can't throw exceptions so don't try to catch it. for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) { Event->waitInternal(); + if (Event->hasHostTaskNativeEvents()) + Event->waitForHostTaskNativeEvents(); } return PI_SUCCESS; @@ -388,7 +390,7 @@ class DispatchHostTask { #endif if (!HostTask.MHostTask->isManualInteropSync()) { - pi_result WaitResult = waitForEvents(); + pi_result WaitResult = waitForNativeDepEvents(); if (WaitResult != PI_SUCCESS) { std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( std::string("Couldn't wait for host-task's dependencies"), @@ -2868,8 +2870,7 @@ pi_int32 ExecCGCommand::enqueueImp() { pi_int32 ExecCGCommand::enqueueImpQueue() { if (getCG().getType() != CG::CGTYPE::CodeplayHostTask) - waitForPreparedHostEvents(); // Why is this not called if the current - // command group is a HT? + waitForPreparedHostEvents(); std::vector EventImpls = getAllPreparedDepsEvents(); auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 5ff184be64948..d464a65353ee0 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -57,7 +57,12 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, std::lock_guard Guard(Cmd->MBlockedUsersMutex); if (Cmd->isBlocking()) { const EventImplPtr &RootCmdEvent = RootCommand->getEvent(); - Cmd->addBlockedUserUnique(RootCmdEvent); + // Host tasks don't need to be added to wait list. 
When host tasks are + // enqueued, a new thread is created which waits on dep events via + // condition variables, so they don't need to be enqueued by other + // additional means + if (!RootCommand->isHostTask()) + Cmd->addBlockedUserUnique(RootCmdEvent); EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); // Blocked command will be enqueued asynchronously from submission so we diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 6d06be32c6250..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -377,6 +377,22 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, EnqueueLeaves(Record->MWriteLeaves); } +void Scheduler::enqueueUnblockedCommands( + const std::vector &ToEnqueue, ReadLockT &GraphReadLock, + std::vector &ToCleanUp) { + for (auto &Event : ToEnqueue) { + Command *Cmd = static_cast(Event->getCommand()); + if (!Cmd) + continue; + EnqueueResultT Res; + bool Enqueued = + GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } +} + Scheduler::Scheduler() { sycl::device HostDevice = createSyclObjFromImpl(device_impl::getHostDeviceImpl()); @@ -470,7 +486,13 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { ToCleanUp.push_back(Cmd); Cmd->MMarkedForCleanup = true; } - Cmd->getEvent()->setComplete(); + + { + std::lock_guard Guard(Cmd->MBlockedUsersMutex); + // update self-event status + Cmd->getEvent()->setComplete(); + } + Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp); } cleanupCommands(ToCleanUp); } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d4ce5abd75e75..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -527,6 +527,11 @@ class Scheduler { ReadLockT &GraphReadLock, std::vector &ToCleanUp); + static void + enqueueUnblockedCommands(const std::vector &CmdsToEnqueue, + ReadLockT &GraphReadLock, + std::vector &ToCleanUp); + // May lock graph with read and write modes during execution. void cleanupDeferredMemObjects(BlockingT Blocking); diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index 448bf680b6d53..d9034abfd7819 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -77,9 +77,6 @@ void interop_handle::addNativeEvents( } std::vector interop_handle::getNativeEvents() const { - if (!MEvent->backendSet()) { - MEvent->setContextImpl(MContext); - } // What if the events here have not yet been enqueued? I will need to wait on // them. That is probably already done? // diff --git a/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp index d3931978a4db3..ce9777c4f1524 100644 --- a/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp +++ b/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp @@ -3,6 +3,12 @@ // RUN: %{build} -o %t.out -lcuda // RUN: %{run} %t.out +// These tests use the add_native_events API to ensure that the SYCL RT can +// handle the events submitted to add_native_events within its runtime DAG. 
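To make the intent of these tests easier to follow, here is a condensed sketch of the producer pattern they exercise: asynchronous work is issued on a native stream inside the host task, a CUDA event is recorded for it, and that event is handed to the SYCL runtime through the experimental interop_handle::add_native_events API this patch builds on. cudaSetCtxAndGetStreamAndEvent is the helper from host-task-native-events-cuda.hpp used throughout these tests; error checking (CUDA_CHECK in the tests) is elided here.

#include "host-task-native-events-cuda.hpp"
#include <cuda.h>
#include <sycl/sycl.hpp>

sycl::event producer(sycl::queue &q, CUdeviceptr devPtr, unsigned pattern,
                     size_t n) {
  return q.submit([&](sycl::handler &cgh) {
    cgh.host_task([=](sycl::interop_handle ih) {
      auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih);
      // Issue async work on the native stream; the host-task lambda returns
      // before this work has completed.
      cuMemsetD32Async(devPtr, pattern, n, stream);
      // Record an event covering that work and hand it to the SYCL RT so the
      // asynchronous work becomes part of the runtime DAG.
      cuEventRecord(ev, stream);
      ih.add_native_events({ev});
    });
  });
}
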
+// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. + #include "host-task-native-events-cuda.hpp" #include #include @@ -12,37 +18,9 @@ using T = unsigned; // We don't need to test lots of types, we just want a race constexpr size_t bufSize = 1e6; constexpr T pattern = 42; -// Check that the SYCL event that we submit with add_native_events can be -// retrieved later through get_native(syclEvent) -template void test1() { - printf("Running test 1\n"); - sycl::queue q; - - std::atomic - atomicEvent; // To share the event from the host task with the main thread - - auto syclEvent = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - cuEventRecord(ev, 0); - atomicEvent.store(ev); - ih.add_native_events({ev}); - }); - }); - - waitHelper(syclEvent, q); - - auto nativeEvents = - sycl::get_native(syclEvent); - // Check that the vec of native events contains the event we stored in the - // atomic var - assert(std::find(nativeEvents.begin(), nativeEvents.end(), - atomicEvent.load()) != nativeEvents.end()); -} - // Tries to check for a race condition if the backend events are not added to // the SYCL dag. -template void test2() { +template void test1() { printf("Running test 2\n"); sycl::queue q; std::vector out(bufSize, 0); @@ -70,7 +48,7 @@ template void test2() { } // Using host task event as a cgh.depends_on with USM -template void test3() { +template void test2() { printf("Running test 3\n"); using T = unsigned; @@ -107,7 +85,7 @@ template void test3() { } // Using host task event with implicit DAG from buffer accessor model -template void test4() { +template void test3() { printf("Running test 4\n"); using T = unsigned; @@ -125,6 +103,7 @@ template void test4() { sycl::accessor acc{buf, sycl::write_only}; cgh.host_task([&](sycl::interop_handle ih) { + // FIXME: this call fails auto accPtr = ih.get_native_mem(acc); auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); @@ -152,9 +131,7 @@ int main() { test1(); test2(); test2(); - test3(); - test3(); - // test4(); Fails with `SyclObject.impl && "every constructor + // test3(); Fails with `SyclObject.impl && "every constructor // should create an impl"' failed. - // test4(); + // test3(); } diff --git a/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp index fa7964c0932d3..7e82e9d667ed2 100644 --- a/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp +++ b/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp @@ -3,152 +3,192 @@ // RUN: %{build} -o %t.out -lcuda // RUN: %{run} %t.out +// These tests use the get_native_events API together with manual_interop_sync +// property. If manual interop sync is not used then get_native_events is not +// necessary, since all events have been synchronized with already on host, +// before the HT lambda is launched. +// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. 
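As a counterpart to the add_native_events producer pattern in host-task-add-native-events-cuda.cpp, here is a condensed sketch of the consumer pattern these tests exercise when manual_interop_sync is set: the dependent host task pulls the still-outstanding native dependencies out of interop_handle::get_native_events and makes its own stream wait on them before touching the data. The explicit backend template argument and the host_task overload taking a property_list follow the usage in the tests below; treat the exact spellings as assumptions, since this is the experimental API being developed in this patch.

#include "host-task-native-events-cuda.hpp"
#include <cuda.h>
#include <sycl/sycl.hpp>

void consumer(sycl::queue &q, sycl::event dep, unsigned *hostDst,
              CUdeviceptr devSrc, size_t n) {
  namespace cp = sycl::ext::codeplay::experimental;
  auto done = q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(dep);
    cgh.host_task(
        [=](sycl::interop_handle ih) {
          auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih);
          (void)ev; // no event needs to be recorded on the consumer side
          // With manual_interop_sync the SYCL RT has *not* waited on the
          // producer's native events, so chain them onto our stream here.
          for (CUevent e :
               ih.get_native_events<sycl::backend::ext_oneapi_cuda>())
            cuStreamWaitEvent(stream, e, 0);
          cuMemcpyDtoHAsync(hostDst, devSrc, n * sizeof(unsigned), stream);
          cuStreamSynchronize(stream);
        },
        sycl::property_list{cp::property::host_task::manual_interop_sync{}});
  });
  done.wait();
}
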
+// + #include "host-task-native-events-cuda.hpp" #include #include using T = unsigned; // We don't need to test lots of types, we just want a race // condition -constexpr size_t bufSize = 1e6; +constexpr size_t bufSize = 1e7; constexpr T pattern = 42; +sycl::queue q; + +using manual_interop_sync = + sycl::ext::codeplay::experimental::property::host_task::manual_interop_sync; + +constexpr auto PropList = [](bool UseManualInteropSync) -> sycl::property_list { + if (UseManualInteropSync) + return {manual_interop_sync{}}; + return {}; +}; + // Check that the SYCL event that we submit with add_native_events can be // retrieved later through get_native_events in a dependent host task -template void test1() { - printf("Running test 1\n"); - sycl::queue q; - - std::atomic - atomicEvent; // To share the event from the host task with the main thread - - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - cuEventRecord(ev, 0); - atomicEvent.store(ev); - ih.add_native_events({ev}); +template struct test1 { + void operator()() { + printf("Running test 1\n"); + std::atomic atomicEvent; // To share the event from the host task + // with the main thread + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + cuEventRecord(ev, 0); + atomicEvent.store(ev); + ih.add_native_events({ev}); + }); }); - }); - - // This task must wait on the other lambda to complete - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - auto nativeEvents = - ih.get_native_events(); - assert(std::find(nativeEvents.begin(), nativeEvents.end(), - atomicEvent.load()) != nativeEvents.end()); + + // This task must wait on the other lambda to complete + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(std::find(nativeEvents.begin(), nativeEvents.end(), + atomicEvent.load()) != nativeEvents.end()); + }, + PropList(UseManualInteropSync)); }); - }); - waitHelper(syclEvent2, q); -} + waitHelper(syclEvent2, q); + } +}; // Tries to check for a race condition if the backend events are not added to // the SYCL dag. 
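Each reworked case in this file is a struct templated on UseManualInteropSync, with the PropList helper above turning that flag into the property_list handed to host_task. A hypothetical driver (runBothModes is illustrative and not part of this patch) would then exercise both modes of every case:

template <template <bool> class TestCase> void runBothModes() {
  TestCase<true>{}();  // manual_interop_sync: the HT waits via get_native_events
  TestCase<false>{}(); // default: the SYCL RT waits on the native events itself
}

// e.g. runBothModes<test1>(); runBothModes<test2>(); runBothModes<test3>();
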
-template void test2() { - printf("Running test 2\n"); - sycl::queue q; - T *ptrHost = sycl::malloc_host( - bufSize, - q); // malloc_host is necessary to make the memcpy as async as possible - - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - CUdeviceptr cuPtr; - CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); - CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); - CUDA_CHECK( - cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); - - CUDA_CHECK(cuEventRecord(ev, stream)); - - ih.add_native_events({ev}); +template struct test2 { + void operator()() { + printf("Running test 2\n"); + T *ptrHost = sycl::malloc_host( + bufSize, + q); // malloc_host is necessary to make the memcpy as async as possible + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + CUdeviceptr cuPtr; + CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); + CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); + CUDA_CHECK( + cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); + + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); }); - }); - - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - cudaSetCtxAndGetStreamAndEvent(ih); - auto nativeEvents = - ih.get_native_events(); - assert(nativeEvents.size()); - for (auto &cudaEv : nativeEvents) { - CUDA_CHECK(cuEventSynchronize(cudaEv)); - } + + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuEventSynchronize(cudaEv)); + } + }, + PropList(UseManualInteropSync)); }); - }); - - waitHelper(syclEvent2, q); - for (auto i = 0; i < bufSize; ++i) { - if (ptrHost[i] != pattern) { - printf("Wrong result at index: %d, have %d vs %d\n", i, ptrHost[i], - pattern); - throw; + + waitHelper(syclEvent2, q); + for (auto i = 0; i < bufSize; ++i) { + if (ptrHost[i] != pattern) { + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHost[i], pattern); + throw; + } } } - printf("Tests passed\n"); -} +}; // Using host task event as a cgh.depends_on with USM -template void test3() { - printf("Running test 3\n"); - using T = unsigned; - - sycl::queue q; - - T *ptrHostA = sycl::malloc_host(bufSize, q); - T *ptrHostB = sycl::malloc_host(bufSize, q); +template struct test3 { + void operator()() { + printf("Running test 3\n"); + using T = unsigned; - T *ptrDevice = sycl::malloc_device(bufSize, q); + T *ptrHostA = sycl::malloc_host(bufSize, q); + T *ptrHostB = sycl::malloc_host(bufSize, q); - for (auto i = 0; i < bufSize; ++i) - ptrHostA[i] = pattern; + T *ptrDevice = sycl::malloc_device(bufSize, q); - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.memcpy(ptrDevice, ptrHostA, bufSize * sizeof(T)); - }); + for (auto i = 0; i < bufSize; ++i) + ptrHostA[i] = pattern; - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - auto 
[stream, _] = cudaSetCtxAndGetStreamAndEvent(ih); - auto nativeEvents = - ih.get_native_events(); - assert(nativeEvents.size()); - for (auto &cudaEv : nativeEvents) { - CUDA_CHECK(cuStreamWaitEvent(stream, cudaEv, 0)); - } + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.memcpy(ptrDevice, ptrHostA, bufSize * sizeof(T)); + }); - CUDA_CHECK(cuMemcpyDtoHAsync(ptrHostB, - reinterpret_cast(ptrDevice), - bufSize * sizeof(T), stream)); - CUDA_CHECK(cuStreamSynchronize(stream)); + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto [stream, _] = cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (UseManualInteropSync) { + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuStreamWaitEvent(stream, cudaEv, 0)); + } + } + + CUDA_CHECK(cuMemcpyDtoHAsync( + ptrHostB, reinterpret_cast(ptrDevice), + bufSize * sizeof(T), stream)); + CUDA_CHECK(cuStreamSynchronize(stream)); + }, + PropList(UseManualInteropSync)); }); - }); - waitHelper(syclEvent2, q); + waitHelper(syclEvent2, q); - for (auto i = 0; i < bufSize; --i) { - if (ptrHostB[i] != pattern) { - printf("Wrong result at index: %d, have %d vs %d\n", i, ptrHostB[i], - pattern); - throw; + for (auto i = 0; i < bufSize; --i) { + if (ptrHostB[i] != pattern) { + cuCtxSynchronize(); + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHostB[i], pattern); + throw; + } } } +}; - printf("Tests passed\n"); +template