From e2e052386b2b1d18c430e23b15f19c88b1529e7e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 4 Mar 2024 19:25:21 +0000 Subject: [PATCH] waitForEvents wait for native events - Wait for native HT events if waitForEvents is called. - Only return native events if they haven't been waited on - Fix bug in get_native for event which isn't host task - Make native input types single CUevent/HIPevents, not vectors. - Update tests with using structs as well as not requiring that get_native_events returns an empty vec if SYCL RT synchronization is used. --- .../sycl/detail/backend_traits_cuda.hpp | 3 +- .../sycl/detail/backend_traits_hip.hpp | 3 +- .../backend/backend_traits_cuda.hpp | 3 +- .../ext/oneapi/experimental/backend/cuda.hpp | 3 +- sycl/source/detail/backend_impl.hpp | 5 +- sycl/source/detail/event_impl.cpp | 15 +- sycl/source/detail/event_impl.hpp | 14 +- sycl/source/detail/scheduler/commands.cpp | 9 +- .../detail/scheduler/graph_processor.cpp | 7 +- sycl/source/detail/scheduler/scheduler.cpp | 24 +- sycl/source/detail/scheduler/scheduler.hpp | 5 + sycl/source/interop_handle.cpp | 3 - .../host-task-add-native-events-cuda.cpp | 47 +--- .../host-task-get-native-events-cuda.cpp | 264 ++++++++++-------- 14 files changed, 232 insertions(+), 173 deletions(-) diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index d06947642b45f..92fb136126ae1 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -87,8 +87,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = CUevent; + using type = CUevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/detail/backend_traits_hip.hpp b/sycl/include/sycl/detail/backend_traits_hip.hpp index c454ae8147510..b432472d1caa7 100644 --- a/sycl/include/sycl/detail/backend_traits_hip.hpp +++ b/sycl/include/sycl/detail/backend_traits_hip.hpp @@ -81,8 +81,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = HIPevent; + using type = HIPevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp index a3ce9b5b24be2..610ef5a7b84c4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp @@ -89,8 +89,7 @@ template <> struct BackendReturn { }; template <> struct BackendInput { - using type = std::vector; - using value_type = CUevent; + using type = CUevent; }; template <> struct BackendReturn { diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp index c5072699ad01c..5d3833080e589 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp @@ -26,7 +26,8 @@ inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) { // Implementation of cuda::has_native_event inline __SYCL_EXPORT bool has_native_event(event sycl_event) { if (sycl_event.get_backend() == backend::ext_oneapi_cuda) - return get_native(sycl_event) != nullptr; + return get_native(sycl_event).size() && + get_native(sycl_event).front() != nullptr; return false; } diff --git 
a/sycl/source/detail/backend_impl.hpp b/sycl/source/detail/backend_impl.hpp index d7b09beb673fd..37add4e8b43e3 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -18,8 +18,9 @@ namespace detail { template backend getImplBackend(const T &Impl) { // Experimental host task allows the user to get backend for event impls if constexpr (std::is_same_v>) { - assert(Impl->backendSet() && - "interop_handle::add_native_events must be used in order for a host " + assert((!Impl->is_host() || Impl->backendSet()) && + "interop_handle::add_native_events must be " + "used in order for a host " "task event to have a native event"); } else { assert(!Impl->is_host() && "Cannot get the backend for host."); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index a3197fbabf789..fdafbe7df467c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -69,6 +69,9 @@ void event_impl::waitInternal() { "waitInternal method cannot be used for a discarded event."); } else if (MState != HES_Complete) { // Wait for the host event + // In the case that the Host Task function stores native events with + // add_native_events, waitInternal will only wait on the lambda to complete, + // not on the asynchronous events std::unique_lock lock(MMutex); cv.wait(lock, [this] { return MState == HES_Complete; }); } @@ -79,8 +82,9 @@ void event_impl::waitInternal() { } void event_impl::waitForHostTaskNativeEvents() { - // This should only be called if we wait on a queue or a SYCL user - // event. It should not be called to managed device dependencies + std::unique_lock Lock(MHostTaskNativeEventsMutex); + if (MHostTaskNativeEventsHaveBeenWaitedOn.exchange(true)) + return; for (const EventImplPtr &Event : MHostTaskNativeEvents) Event->wait(Event); } @@ -424,11 +428,16 @@ pi_native_handle event_impl::getNative() { } std::vector event_impl::getNativeVector() { + // Return empty vec if native events have already been waited on + if (isCompleted() && (!hasHostTaskNativeEvents() || + MHostTaskNativeEventsHaveBeenWaitedOn.load())) + return {}; + // If there is a native event return that. This will also initialize context if (auto nativeEvent = getNative()) return {nativeEvent}; - // Return native events sumbitted via host task interop + // Return native events submitted via host task interop auto Plugin = getPlugin(); std::vector HandleVec; for (auto &HostTaskNativeEventImpl : MHostTaskNativeEvents) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index a3f3634e43930..cc8e1c0dbcdb4 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -312,7 +312,9 @@ class event_impl { return MHostTaskNativeEvents.size() > 0; } - bool backendSet() const { return hasHostTaskNativeEvents(); } + bool backendSet() const { + return !MContext->is_host() || hasHostTaskNativeEvents(); + } const std::vector &getHostTaskNativeEvents() const { return MHostTaskNativeEvents; @@ -347,16 +349,18 @@ class event_impl { std::weak_ptr MWorkerQueue; std::weak_ptr MSubmittedQueue; - // Used to hold pi_events for native events that are stored with - // interop_handle::add_native_events - std::vector MHostTaskNativeEvents; - /// Dependency events prepared for waiting by backend. 
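The event_impl change above guards waitForHostTaskNativeEvents so that the native events attached to a host task are waited on at most once, even if several threads reach the wait concurrently. A minimal standalone sketch of that wait-at-most-once idiom follows; the type and function names are illustrative, not the runtime's own:

#include <atomic>
#include <mutex>
#include <vector>

// Stand-in for a backend event handle (CUevent/HIPevent in the real code).
using NativeEvent = void *;

struct HostTaskNativeEventSet {
  std::vector<NativeEvent> Events;
  std::atomic<bool> WaitedOn{false}; // mirrors MHostTaskNativeEventsHaveBeenWaitedOn
  std::mutex Mutex;                  // mirrors MHostTaskNativeEventsMutex

  void waitOnce() {
    std::unique_lock<std::mutex> Lock(Mutex);
    // exchange() returns the previous value, so only the first caller sees
    // false and performs the actual wait; later callers return immediately.
    if (WaitedOn.exchange(true))
      return;
    for (NativeEvent Ev : Events)
      waitOnNativeEvent(Ev); // e.g. cuEventSynchronize in the CUDA case
  }

  static void waitOnNativeEvent(NativeEvent) { /* backend wait elided */ }
};
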
std::vector MPreparedDepsEvents; std::vector MPreparedHostDepsEvents; std::vector MPostCompleteEvents; + // Used to hold pi_events for native events that are stored with + // interop_handle::add_native_events + std::vector MHostTaskNativeEvents; + std::atomic MHostTaskNativeEventsHaveBeenWaitedOn = false; + std::mutex MHostTaskNativeEventsMutex; + /// Indicates that the task associated with this event has been submitted by /// the queue to the device. std::atomic MIsFlushed = false; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index ce66d00cf0254..09d9f92486f3d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -323,7 +323,7 @@ class DispatchHostTask { // to the event impl of the sycl event returned at // CGSubmit - pi_result waitForEvents() const { + pi_result waitForNativeDepEvents() const { std::map> RequiredEventsPerPlugin; @@ -359,6 +359,8 @@ class DispatchHostTask { // Host events can't throw exceptions so don't try to catch it. for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) { Event->waitInternal(); + if (Event->hasHostTaskNativeEvents()) + Event->waitForHostTaskNativeEvents(); } return PI_SUCCESS; @@ -388,7 +390,7 @@ class DispatchHostTask { #endif if (!HostTask.MHostTask->isManualInteropSync()) { - pi_result WaitResult = waitForEvents(); + pi_result WaitResult = waitForNativeDepEvents(); if (WaitResult != PI_SUCCESS) { std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( std::string("Couldn't wait for host-task's dependencies"), @@ -2868,8 +2870,7 @@ pi_int32 ExecCGCommand::enqueueImp() { pi_int32 ExecCGCommand::enqueueImpQueue() { if (getCG().getType() != CG::CGTYPE::CodeplayHostTask) - waitForPreparedHostEvents(); // Why is this not called if the current - // command group is a HT? + waitForPreparedHostEvents(); std::vector EventImpls = getAllPreparedDepsEvents(); auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 5ff184be64948..d464a65353ee0 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -57,7 +57,12 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, std::lock_guard Guard(Cmd->MBlockedUsersMutex); if (Cmd->isBlocking()) { const EventImplPtr &RootCmdEvent = RootCommand->getEvent(); - Cmd->addBlockedUserUnique(RootCmdEvent); + // Host tasks don't need to be added to wait list. 
When host tasks are + // enqueued, a new thread is created which waits on dep events via + // condition variables, so they don't need to be enqueued by other + // additional means + if (!RootCommand->isHostTask()) + Cmd->addBlockedUserUnique(RootCmdEvent); EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); // Blocked command will be enqueued asynchronously from submission so we diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 6d06be32c6250..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -377,6 +377,22 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, EnqueueLeaves(Record->MWriteLeaves); } +void Scheduler::enqueueUnblockedCommands( + const std::vector &ToEnqueue, ReadLockT &GraphReadLock, + std::vector &ToCleanUp) { + for (auto &Event : ToEnqueue) { + Command *Cmd = static_cast(Event->getCommand()); + if (!Cmd) + continue; + EnqueueResultT Res; + bool Enqueued = + GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } +} + Scheduler::Scheduler() { sycl::device HostDevice = createSyclObjFromImpl(device_impl::getHostDeviceImpl()); @@ -470,7 +486,13 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { ToCleanUp.push_back(Cmd); Cmd->MMarkedForCleanup = true; } - Cmd->getEvent()->setComplete(); + + { + std::lock_guard Guard(Cmd->MBlockedUsersMutex); + // update self-event status + Cmd->getEvent()->setComplete(); + } + Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp); } cleanupCommands(ToCleanUp); } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d4ce5abd75e75..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -527,6 +527,11 @@ class Scheduler { ReadLockT &GraphReadLock, std::vector &ToCleanUp); + static void + enqueueUnblockedCommands(const std::vector &CmdsToEnqueue, + ReadLockT &GraphReadLock, + std::vector &ToCleanUp); + // May lock graph with read and write modes during execution. void cleanupDeferredMemObjects(BlockingT Blocking); diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index 448bf680b6d53..d9034abfd7819 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -77,9 +77,6 @@ void interop_handle::addNativeEvents( } std::vector interop_handle::getNativeEvents() const { - if (!MEvent->backendSet()) { - MEvent->setContextImpl(MContext); - } // What if the events here have not yet been enqueued? I will need to wait on // them. That is probably already done? // diff --git a/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp index d3931978a4db3..ce9777c4f1524 100644 --- a/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp +++ b/sycl/test-e2e/HostInteropTask/native-events/host-task-add-native-events-cuda.cpp @@ -3,6 +3,12 @@ // RUN: %{build} -o %t.out -lcuda // RUN: %{run} %t.out +// These tests use the add_native_events API to ensure that the SYCL RT can +// handle the events submitted to add_native_events within its runtime DAG. 
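To make the intent of these tests easier to follow, here is a condensed sketch of the producer pattern they exercise: asynchronous work is issued on a native stream inside the host task, a CUDA event is recorded for it, and that event is handed to the SYCL runtime through the experimental interop_handle::add_native_events API this patch builds on. cudaSetCtxAndGetStreamAndEvent is the helper from host-task-native-events-cuda.hpp used throughout these tests; error checking (CUDA_CHECK in the tests) is elided here.

#include "host-task-native-events-cuda.hpp"
#include <cuda.h>
#include <sycl/sycl.hpp>

sycl::event producer(sycl::queue &q, CUdeviceptr devPtr, unsigned pattern,
                     size_t n) {
  return q.submit([&](sycl::handler &cgh) {
    cgh.host_task([=](sycl::interop_handle ih) {
      auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih);
      // Issue async work on the native stream; the host-task lambda returns
      // before this work has completed.
      cuMemsetD32Async(devPtr, pattern, n, stream);
      // Record an event covering that work and hand it to the SYCL RT so the
      // asynchronous work becomes part of the runtime DAG.
      cuEventRecord(ev, stream);
      ih.add_native_events({ev});
    });
  });
}
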
+// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. + #include "host-task-native-events-cuda.hpp" #include #include @@ -12,37 +18,9 @@ using T = unsigned; // We don't need to test lots of types, we just want a race constexpr size_t bufSize = 1e6; constexpr T pattern = 42; -// Check that the SYCL event that we submit with add_native_events can be -// retrieved later through get_native(syclEvent) -template void test1() { - printf("Running test 1\n"); - sycl::queue q; - - std::atomic - atomicEvent; // To share the event from the host task with the main thread - - auto syclEvent = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - cuEventRecord(ev, 0); - atomicEvent.store(ev); - ih.add_native_events({ev}); - }); - }); - - waitHelper(syclEvent, q); - - auto nativeEvents = - sycl::get_native(syclEvent); - // Check that the vec of native events contains the event we stored in the - // atomic var - assert(std::find(nativeEvents.begin(), nativeEvents.end(), - atomicEvent.load()) != nativeEvents.end()); -} - // Tries to check for a race condition if the backend events are not added to // the SYCL dag. -template void test2() { +template void test1() { printf("Running test 2\n"); sycl::queue q; std::vector out(bufSize, 0); @@ -70,7 +48,7 @@ template void test2() { } // Using host task event as a cgh.depends_on with USM -template void test3() { +template void test2() { printf("Running test 3\n"); using T = unsigned; @@ -107,7 +85,7 @@ template void test3() { } // Using host task event with implicit DAG from buffer accessor model -template void test4() { +template void test3() { printf("Running test 4\n"); using T = unsigned; @@ -125,6 +103,7 @@ template void test4() { sycl::accessor acc{buf, sycl::write_only}; cgh.host_task([&](sycl::interop_handle ih) { + // FIXME: this call fails auto accPtr = ih.get_native_mem(acc); auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); @@ -152,9 +131,7 @@ int main() { test1(); test2(); test2(); - test3(); - test3(); - // test4(); Fails with `SyclObject.impl && "every constructor + // test3(); Fails with `SyclObject.impl && "every constructor // should create an impl"' failed. - // test4(); + // test3(); } diff --git a/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp index fa7964c0932d3..7e82e9d667ed2 100644 --- a/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp +++ b/sycl/test-e2e/HostInteropTask/native-events/host-task-get-native-events-cuda.cpp @@ -3,152 +3,192 @@ // RUN: %{build} -o %t.out -lcuda // RUN: %{run} %t.out +// These tests use the get_native_events API together with manual_interop_sync +// property. If manual interop sync is not used then get_native_events is not +// necessary, since all events have been synchronized with already on host, +// before the HT lambda is launched. +// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. 
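As a counterpart to the add_native_events producer pattern in host-task-add-native-events-cuda.cpp, here is a condensed sketch of the consumer pattern these tests exercise when manual_interop_sync is set: the dependent host task pulls the still-outstanding native dependencies out of interop_handle::get_native_events and makes its own stream wait on them before touching the data. The explicit backend template argument and the host_task overload taking a property_list follow the usage in the tests below; treat the exact spellings as assumptions, since this is the experimental API being developed in this patch.

#include "host-task-native-events-cuda.hpp"
#include <cuda.h>
#include <sycl/sycl.hpp>

void consumer(sycl::queue &q, sycl::event dep, unsigned *hostDst,
              CUdeviceptr devSrc, size_t n) {
  namespace cp = sycl::ext::codeplay::experimental;
  auto done = q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(dep);
    cgh.host_task(
        [=](sycl::interop_handle ih) {
          auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih);
          (void)ev; // no event needs to be recorded on the consumer side
          // With manual_interop_sync the SYCL RT has *not* waited on the
          // producer's native events, so chain them onto our stream here.
          for (CUevent e :
               ih.get_native_events<sycl::backend::ext_oneapi_cuda>())
            cuStreamWaitEvent(stream, e, 0);
          cuMemcpyDtoHAsync(hostDst, devSrc, n * sizeof(unsigned), stream);
          cuStreamSynchronize(stream);
        },
        sycl::property_list{cp::property::host_task::manual_interop_sync{}});
  });
  done.wait();
}
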
+// + #include "host-task-native-events-cuda.hpp" #include #include using T = unsigned; // We don't need to test lots of types, we just want a race // condition -constexpr size_t bufSize = 1e6; +constexpr size_t bufSize = 1e7; constexpr T pattern = 42; +sycl::queue q; + +using manual_interop_sync = + sycl::ext::codeplay::experimental::property::host_task::manual_interop_sync; + +constexpr auto PropList = [](bool UseManualInteropSync) -> sycl::property_list { + if (UseManualInteropSync) + return {manual_interop_sync{}}; + return {}; +}; + // Check that the SYCL event that we submit with add_native_events can be // retrieved later through get_native_events in a dependent host task -template void test1() { - printf("Running test 1\n"); - sycl::queue q; - - std::atomic - atomicEvent; // To share the event from the host task with the main thread - - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - cuEventRecord(ev, 0); - atomicEvent.store(ev); - ih.add_native_events({ev}); +template struct test1 { + void operator()() { + printf("Running test 1\n"); + std::atomic atomicEvent; // To share the event from the host task + // with the main thread + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + cuEventRecord(ev, 0); + atomicEvent.store(ev); + ih.add_native_events({ev}); + }); }); - }); - - // This task must wait on the other lambda to complete - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - auto nativeEvents = - ih.get_native_events(); - assert(std::find(nativeEvents.begin(), nativeEvents.end(), - atomicEvent.load()) != nativeEvents.end()); + + // This task must wait on the other lambda to complete + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(std::find(nativeEvents.begin(), nativeEvents.end(), + atomicEvent.load()) != nativeEvents.end()); + }, + PropList(UseManualInteropSync)); }); - }); - waitHelper(syclEvent2, q); -} + waitHelper(syclEvent2, q); + } +}; // Tries to check for a race condition if the backend events are not added to // the SYCL dag. 
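Each reworked case in this file is a struct templated on UseManualInteropSync, with the PropList helper above turning that flag into the property_list handed to host_task. A hypothetical driver (runBothModes is illustrative and not part of this patch) would then exercise both modes of every case:

template <template <bool> class TestCase> void runBothModes() {
  TestCase<true>{}();  // manual_interop_sync: the HT waits via get_native_events
  TestCase<false>{}(); // default: the SYCL RT waits on the native events itself
}

// e.g. runBothModes<test1>(); runBothModes<test2>(); runBothModes<test3>();
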
-template void test2() { - printf("Running test 2\n"); - sycl::queue q; - T *ptrHost = sycl::malloc_host( - bufSize, - q); // malloc_host is necessary to make the memcpy as async as possible - - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.host_task([&](sycl::interop_handle ih) { - auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); - CUdeviceptr cuPtr; - CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); - CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); - CUDA_CHECK( - cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); - - CUDA_CHECK(cuEventRecord(ev, stream)); - - ih.add_native_events({ev}); +template struct test2 { + void operator()() { + printf("Running test 2\n"); + T *ptrHost = sycl::malloc_host( + bufSize, + q); // malloc_host is necessary to make the memcpy as async as possible + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + CUdeviceptr cuPtr; + CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); + CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); + CUDA_CHECK( + cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); + + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); }); - }); - - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - cudaSetCtxAndGetStreamAndEvent(ih); - auto nativeEvents = - ih.get_native_events(); - assert(nativeEvents.size()); - for (auto &cudaEv : nativeEvents) { - CUDA_CHECK(cuEventSynchronize(cudaEv)); - } + + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuEventSynchronize(cudaEv)); + } + }, + PropList(UseManualInteropSync)); }); - }); - - waitHelper(syclEvent2, q); - for (auto i = 0; i < bufSize; ++i) { - if (ptrHost[i] != pattern) { - printf("Wrong result at index: %d, have %d vs %d\n", i, ptrHost[i], - pattern); - throw; + + waitHelper(syclEvent2, q); + for (auto i = 0; i < bufSize; ++i) { + if (ptrHost[i] != pattern) { + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHost[i], pattern); + throw; + } } } - printf("Tests passed\n"); -} +}; // Using host task event as a cgh.depends_on with USM -template void test3() { - printf("Running test 3\n"); - using T = unsigned; - - sycl::queue q; - - T *ptrHostA = sycl::malloc_host(bufSize, q); - T *ptrHostB = sycl::malloc_host(bufSize, q); +template struct test3 { + void operator()() { + printf("Running test 3\n"); + using T = unsigned; - T *ptrDevice = sycl::malloc_device(bufSize, q); + T *ptrHostA = sycl::malloc_host(bufSize, q); + T *ptrHostB = sycl::malloc_host(bufSize, q); - for (auto i = 0; i < bufSize; ++i) - ptrHostA[i] = pattern; + T *ptrDevice = sycl::malloc_device(bufSize, q); - auto syclEvent1 = q.submit([&](sycl::handler &cgh) { - cgh.memcpy(ptrDevice, ptrHostA, bufSize * sizeof(T)); - }); + for (auto i = 0; i < bufSize; ++i) + ptrHostA[i] = pattern; - auto syclEvent2 = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(syclEvent1); - cgh.host_task([&](sycl::interop_handle ih) { - auto 
[stream, _] = cudaSetCtxAndGetStreamAndEvent(ih); - auto nativeEvents = - ih.get_native_events(); - assert(nativeEvents.size()); - for (auto &cudaEv : nativeEvents) { - CUDA_CHECK(cuStreamWaitEvent(stream, cudaEv, 0)); - } + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.memcpy(ptrDevice, ptrHostA, bufSize * sizeof(T)); + }); - CUDA_CHECK(cuMemcpyDtoHAsync(ptrHostB, - reinterpret_cast(ptrDevice), - bufSize * sizeof(T), stream)); - CUDA_CHECK(cuStreamSynchronize(stream)); + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto [stream, _] = cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (UseManualInteropSync) { + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuStreamWaitEvent(stream, cudaEv, 0)); + } + } + + CUDA_CHECK(cuMemcpyDtoHAsync( + ptrHostB, reinterpret_cast(ptrDevice), + bufSize * sizeof(T), stream)); + CUDA_CHECK(cuStreamSynchronize(stream)); + }, + PropList(UseManualInteropSync)); }); - }); - waitHelper(syclEvent2, q); + waitHelper(syclEvent2, q); - for (auto i = 0; i < bufSize; --i) { - if (ptrHostB[i] != pattern) { - printf("Wrong result at index: %d, have %d vs %d\n", i, ptrHostB[i], - pattern); - throw; + for (auto i = 0; i < bufSize; --i) { + if (ptrHostB[i] != pattern) { + cuCtxSynchronize(); + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHostB[i], pattern); + throw; + } } } +}; - printf("Tests passed\n"); +template