Skip to content

Commit

Permalink
waitForEvents wait for native events
Browse files Browse the repository at this point in the history
- Wait for native HT events if waitForEvents is called.
- Only return native events if they haven't been waited on
- Fix bug in get_native for event which isn't host task
- Make native input types single CUevent/HIPevents, not vectors.
- Update tests with using structs as well as not requiring that
  get_native_events returns an empty vec if SYCL RT synchronization is
  used.
  • Loading branch information
hdelan committed Mar 6, 2024
1 parent 3928199 commit e2e0523
Show file tree
Hide file tree
Showing 14 changed files with 232 additions and 173 deletions.
3 changes: 1 addition & 2 deletions sycl/include/sycl/detail/backend_traits_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, device> {
};

template <> struct BackendInput<backend::ext_oneapi_cuda, event> {
using type = std::vector<CUevent>;
using value_type = CUevent;
using type = CUevent;
};

template <> struct BackendReturn<backend::ext_oneapi_cuda, event> {
Expand Down
3 changes: 1 addition & 2 deletions sycl/include/sycl/detail/backend_traits_hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ template <> struct BackendReturn<backend::ext_oneapi_hip, device> {
};

template <> struct BackendInput<backend::ext_oneapi_hip, event> {
using type = std::vector<HIPevent>;
using value_type = HIPevent;
using type = HIPevent;
};

template <> struct BackendReturn<backend::ext_oneapi_hip, event> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, device> {
};

template <> struct BackendInput<backend::ext_oneapi_cuda, event> {
using type = std::vector<CUevent>;
using value_type = CUevent;
using type = CUevent;
};

template <> struct BackendReturn<backend::ext_oneapi_cuda, event> {
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) {
// Implementation of cuda::has_native_event
inline __SYCL_EXPORT bool has_native_event(event sycl_event) {
if (sycl_event.get_backend() == backend::ext_oneapi_cuda)
return get_native<backend::ext_oneapi_cuda>(sycl_event) != nullptr;
return get_native<backend::ext_oneapi_cuda>(sycl_event).size() &&
get_native<backend::ext_oneapi_cuda>(sycl_event).front() != nullptr;

return false;
}
Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/backend_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ namespace detail {
template <class T> backend getImplBackend(const T &Impl) {
// Experimental host task allows the user to get backend for event impls
if constexpr (std::is_same_v<T, std::shared_ptr<event_impl>>) {
assert(Impl->backendSet() &&
"interop_handle::add_native_events must be used in order for a host "
assert((!Impl->is_host() || Impl->backendSet()) &&
"interop_handle::add_native_events must be "
"used in order for a host "
"task event to have a native event");
} else {
assert(!Impl->is_host() && "Cannot get the backend for host.");
Expand Down
15 changes: 12 additions & 3 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ void event_impl::waitInternal() {
"waitInternal method cannot be used for a discarded event.");
} else if (MState != HES_Complete) {
// Wait for the host event
// In the case that the Host Task function stores native events with
// add_native_events, waitInternal will only wait on the lambda to complete,
// not on the asynchronous events
std::unique_lock<std::mutex> lock(MMutex);
cv.wait(lock, [this] { return MState == HES_Complete; });
}
Expand All @@ -79,8 +82,9 @@ void event_impl::waitInternal() {
}

void event_impl::waitForHostTaskNativeEvents() {
// This should only be called if we wait on a queue or a SYCL user
// event. It should not be called to managed device dependencies
std::unique_lock<std::mutex> Lock(MHostTaskNativeEventsMutex);
if (MHostTaskNativeEventsHaveBeenWaitedOn.exchange(true))
return;
for (const EventImplPtr &Event : MHostTaskNativeEvents)
Event->wait(Event);
}
Expand Down Expand Up @@ -424,11 +428,16 @@ pi_native_handle event_impl::getNative() {
}

std::vector<pi_native_handle> event_impl::getNativeVector() {
// Return empty vec if native events have already been waited on
if (isCompleted() && (!hasHostTaskNativeEvents() ||
MHostTaskNativeEventsHaveBeenWaitedOn.load()))
return {};

// If there is a native event return that. This will also initialize context
if (auto nativeEvent = getNative())
return {nativeEvent};

// Return native events sumbitted via host task interop
// Return native events submitted via host task interop
auto Plugin = getPlugin();
std::vector<pi_native_handle> HandleVec;
for (auto &HostTaskNativeEventImpl : MHostTaskNativeEvents) {
Expand Down
14 changes: 9 additions & 5 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,9 @@ class event_impl {
return MHostTaskNativeEvents.size() > 0;
}

bool backendSet() const { return hasHostTaskNativeEvents(); }
bool backendSet() const {
return !MContext->is_host() || hasHostTaskNativeEvents();
}

const std::vector<EventImplPtr> &getHostTaskNativeEvents() const {
return MHostTaskNativeEvents;
Expand Down Expand Up @@ -347,16 +349,18 @@ class event_impl {
std::weak_ptr<queue_impl> MWorkerQueue;
std::weak_ptr<queue_impl> MSubmittedQueue;

// Used to hold pi_events for native events that are stored with
// interop_handle::add_native_events
std::vector<EventImplPtr> MHostTaskNativeEvents;

/// Dependency events prepared for waiting by backend.
std::vector<EventImplPtr> MPreparedDepsEvents;
std::vector<EventImplPtr> MPreparedHostDepsEvents;

std::vector<EventImplPtr> MPostCompleteEvents;

// Used to hold pi_events for native events that are stored with
// interop_handle::add_native_events
std::vector<EventImplPtr> MHostTaskNativeEvents;
std::atomic<bool> MHostTaskNativeEventsHaveBeenWaitedOn = false;
std::mutex MHostTaskNativeEventsMutex;

/// Indicates that the task associated with this event has been submitted by
/// the queue to the device.
std::atomic<bool> MIsFlushed = false;
Expand Down
9 changes: 5 additions & 4 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ class DispatchHostTask {
// to the event impl of the sycl event returned at
// CGSubmit

pi_result waitForEvents() const {
pi_result waitForNativeDepEvents() const {
std::map<const PluginPtr, std::vector<EventImplPtr>>
RequiredEventsPerPlugin;

Expand Down Expand Up @@ -359,6 +359,8 @@ class DispatchHostTask {
// Host events can't throw exceptions so don't try to catch it.
for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
Event->waitInternal();
if (Event->hasHostTaskNativeEvents())
Event->waitForHostTaskNativeEvents();
}

return PI_SUCCESS;
Expand Down Expand Up @@ -388,7 +390,7 @@ class DispatchHostTask {
#endif

if (!HostTask.MHostTask->isManualInteropSync()) {
pi_result WaitResult = waitForEvents();
pi_result WaitResult = waitForNativeDepEvents();
if (WaitResult != PI_SUCCESS) {
std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error(
std::string("Couldn't wait for host-task's dependencies"),
Expand Down Expand Up @@ -2868,8 +2870,7 @@ pi_int32 ExecCGCommand::enqueueImp() {

pi_int32 ExecCGCommand::enqueueImpQueue() {
if (getCG().getType() != CG::CGTYPE::CodeplayHostTask)
waitForPreparedHostEvents(); // Why is this not called if the current
// command group is a HT?
waitForPreparedHostEvents();
std::vector<EventImplPtr> EventImpls = getAllPreparedDepsEvents();
auto RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());
Expand Down
7 changes: 6 additions & 1 deletion sycl/source/detail/scheduler/graph_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd,
std::lock_guard<std::mutex> Guard(Cmd->MBlockedUsersMutex);
if (Cmd->isBlocking()) {
const EventImplPtr &RootCmdEvent = RootCommand->getEvent();
Cmd->addBlockedUserUnique(RootCmdEvent);
// Host tasks don't need to be added to wait list. When host tasks are
// enqueued, a new thread is created which waits on dep events via
// condition variables, so they don't need to be enqueued by other
// additional means
if (!RootCommand->isHostTask())
Cmd->addBlockedUserUnique(RootCmdEvent);
EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd);

// Blocked command will be enqueued asynchronously from submission so we
Expand Down
24 changes: 23 additions & 1 deletion sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,22 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req,
EnqueueLeaves(Record->MWriteLeaves);
}

void Scheduler::enqueueUnblockedCommands(
const std::vector<EventImplPtr> &ToEnqueue, ReadLockT &GraphReadLock,
std::vector<Command *> &ToCleanUp) {
for (auto &Event : ToEnqueue) {
Command *Cmd = static_cast<Command *>(Event->getCommand());
if (!Cmd)
continue;
EnqueueResultT Res;
bool Enqueued =
GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd);
if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
throw runtime_error("Enqueue process failed.",
PI_ERROR_INVALID_OPERATION);
}
}

Scheduler::Scheduler() {
sycl::device HostDevice =
createSyclObjFromImpl<device>(device_impl::getHostDeviceImpl());
Expand Down Expand Up @@ -470,7 +486,13 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) {
ToCleanUp.push_back(Cmd);
Cmd->MMarkedForCleanup = true;
}
Cmd->getEvent()->setComplete();

{
std::lock_guard<std::mutex> Guard(Cmd->MBlockedUsersMutex);
// update self-event status
Cmd->getEvent()->setComplete();
}
Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp);
}
cleanupCommands(ToCleanUp);
}
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,11 @@ class Scheduler {
ReadLockT &GraphReadLock,
std::vector<Command *> &ToCleanUp);

static void
enqueueUnblockedCommands(const std::vector<EventImplPtr> &CmdsToEnqueue,
ReadLockT &GraphReadLock,
std::vector<Command *> &ToCleanUp);

// May lock graph with read and write modes during execution.
void cleanupDeferredMemObjects(BlockingT Blocking);

Expand Down
3 changes: 0 additions & 3 deletions sycl/source/interop_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ void interop_handle::addNativeEvents(
}

std::vector<pi_native_handle> interop_handle::getNativeEvents() const {
if (!MEvent->backendSet()) {
MEvent->setContextImpl(MContext);
}
// What if the events here have not yet been enqueued? I will need to wait on
// them. That is probably already done?
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
// RUN: %{build} -o %t.out -lcuda
// RUN: %{run} %t.out

// These tests use the add_native_events API to ensure that the SYCL RT can
// handle the events submitted to add_native_events within its runtime DAG.
//
// If manual_interop_sync is used then the user deals with async dependencies
// manually in the HT lambda through the get_native_events interface.

#include "host-task-native-events-cuda.hpp"
#include <cuda.h>
#include <sycl/sycl.hpp>
Expand All @@ -12,37 +18,9 @@ using T = unsigned; // We don't need to test lots of types, we just want a race
constexpr size_t bufSize = 1e6;
constexpr T pattern = 42;

// Check that the SYCL event that we submit with add_native_events can be
// retrieved later through get_native(syclEvent)
template <typename WaitOnType> void test1() {
printf("Running test 1\n");
sycl::queue q;

std::atomic<CUevent>
atomicEvent; // To share the event from the host task with the main thread

auto syclEvent = q.submit([&](sycl::handler &cgh) {
cgh.host_task([&](sycl::interop_handle ih) {
auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih);
cuEventRecord(ev, 0);
atomicEvent.store(ev);
ih.add_native_events<sycl::backend::ext_oneapi_cuda>({ev});
});
});

waitHelper<WaitOnType>(syclEvent, q);

auto nativeEvents =
sycl::get_native<sycl::backend::ext_oneapi_cuda>(syclEvent);
// Check that the vec of native events contains the event we stored in the
// atomic var
assert(std::find(nativeEvents.begin(), nativeEvents.end(),
atomicEvent.load()) != nativeEvents.end());
}

// Tries to check for a race condition if the backend events are not added to
// the SYCL dag.
template <typename WaitOnType> void test2() {
template <typename WaitOnType> void test1() {
printf("Running test 2\n");
sycl::queue q;
std::vector<T> out(bufSize, 0);
Expand Down Expand Up @@ -70,7 +48,7 @@ template <typename WaitOnType> void test2() {
}

// Using host task event as a cgh.depends_on with USM
template <typename WaitOnType> void test3() {
template <typename WaitOnType> void test2() {
printf("Running test 3\n");
using T = unsigned;

Expand Down Expand Up @@ -107,7 +85,7 @@ template <typename WaitOnType> void test3() {
}

// Using host task event with implicit DAG from buffer accessor model
template <typename WaitOnType> void test4() {
template <typename WaitOnType> void test3() {
printf("Running test 4\n");
using T = unsigned;

Expand All @@ -125,6 +103,7 @@ template <typename WaitOnType> void test4() {
sycl::accessor acc{buf, sycl::write_only};

cgh.host_task([&](sycl::interop_handle ih) {
// FIXME: this call fails
auto accPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih);

Expand Down Expand Up @@ -152,9 +131,7 @@ int main() {
test1<sycl::event>();
test2<sycl::queue>();
test2<sycl::event>();
test3<sycl::queue>();
test3<sycl::event>();
// test4<sycl::queue>(); Fails with `SyclObject.impl && "every constructor
// test3<sycl::queue>(); Fails with `SyclObject.impl && "every constructor
// should create an impl"' failed.
// test4<sycl::event>();
// test3<sycl::event>();
}
Loading

0 comments on commit e2e0523

Please sign in to comment.