Skip to content

Commit

Permalink
Fixes #327, #334.
Browse files Browse the repository at this point in the history
* Switched many methods which require a context, and take a `device_t` by value, to taking it by const-reference - to make use of device_t's mutable primary-context cache.
* Host memory allocations no longer increase primary context refcounts, nor do their freeing decrease the refcount
* The following proxy class instances can now keep primary contexts alive with a reference count unit (depending on how they were constructed): `device_t`, `primary_context_t`, `kernel_t`, `stream_t`, `event_t`, `module_t`.
* Some doxygen comment corrections and additions
* Added bool aliases regarding whether or not a proxy object should hold a primary context refcount unit.
* Dropped: The `scoped_` and `unscoped_` bool-aliases which were used with `device_t::primary_context()`; we now indicate clearly whether the proxy object should or should not hold a refcount unit
* Spaces -> tabs fixes in `context.hpp`
* Added an alternative  method for setting the current device (`device::current::detail_::set_with_aux_info()`) that requires performing less API calls.
* The combined behavior of `device_t`, `primary_context_t` and `current_device::detail_::scoped_context_override` was such, that `make_unique()` with no device passed, and with the primary context inactive, would cause a primary context to be activated, then destroyed/deactivated immediately after allocation. This no longer happens.
* No longer using `current_device::detail_::scoped_context_override`; now preferring `device_t`'s primary context caching mechanism + delegating to methods which take a (not necessarily primary) context.
* The scoped context override gadget now has two flags: One for popping the context, another for reducing the primary context refcount; the logic for using them has been updated.
* `device_t` now uses an explicit flag for when it's holding a primary context refcount unit; this allows in-context copies of a `device_t` to know the primary context without holding a refcount unit.
* Spacing tweaks
* Corrected an exception message in `stream.hpp`.
* CAVEAT: Some DRY in the code handling refcount units in the various proxy class.
  • Loading branch information
eyalroz committed Jun 5, 2022
1 parent 23a78ba commit 16c287d
Show file tree
Hide file tree
Showing 22 changed files with 651 additions and 340 deletions.
44 changes: 30 additions & 14 deletions src/cuda/api/apriori_compiled_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,10 @@ inline handle_t get_handle(const void *kernel_function_ptr, const char* name = n

apriori_compiled_kernel_t wrap(
device::id_t device_id,
context::handle_t context_id,
context::handle_t primary_context_handle,
kernel::handle_t f,
const void* ptr);
const void* ptr,
bool hold_primary_context_refcount_unit = false);


} // namespace detail_
Expand Down Expand Up @@ -394,23 +395,33 @@ class apriori_compiled_kernel_t final : public kernel_t {
#endif // CAN_GET_APRIORI_KERNEL_HANDLE

protected: // ctors & dtor
apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle,
kernel::handle_t handle, const void *f)
: kernel_t(device_id, context_handle, handle), ptr_(f) {
apriori_compiled_kernel_t(device::id_t device_id, context::handle_t primary_context_handle,
kernel::handle_t handle, const void *f, bool hold_pc_refcount_unit)
: kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f) {
// TODO: Consider checking whether this actually is a device function, at all and in this context
#ifndef NDEBUG
assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer");
#endif
}
apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, const void *f)
: apriori_compiled_kernel_t(device_id, context_handle, kernel::detail_::get_handle(f), f) { }
apriori_compiled_kernel_t(
device::id_t device_id,
context::handle_t primary_context_handle,
const void *f,
bool hold_primary_context_refcount_unit)
: apriori_compiled_kernel_t(
device_id,
primary_context_handle,
kernel::detail_::get_handle(f),
f,
hold_primary_context_refcount_unit)
{ }

public: // ctors & dtor
apriori_compiled_kernel_t(const apriori_compiled_kernel_t&) = default;
apriori_compiled_kernel_t(apriori_compiled_kernel_t&&) = default;

public: // friends
friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*);
friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool);

protected: // data members
const void *const ptr_;
Expand All @@ -420,12 +431,13 @@ namespace kernel {
namespace detail_ {

inline apriori_compiled_kernel_t wrap(
device::id_t device_id,
context::handle_t context_id,
kernel::handle_t f,
const void *ptr)
device::id_t device_id,
context::handle_t primary_context_handle,
kernel::handle_t f,
const void * ptr,
bool hold_primary_context_refcount_unit)
{
return {device_id, context_id, f, ptr};
return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
}

#if ! CAN_GET_APRIORI_KERNEL_HANDLE
Expand All @@ -438,8 +450,12 @@ inline ::std::string identify(const apriori_compiled_kernel_t& kernel)

} // namespace detail

/**
* @note The returned kernel proxy object will keep the device's primary
* context active while the kernel exists.
*/
template<typename KernelFunctionPtr>
apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr);
apriori_compiled_kernel_t get(const device_t& device, KernelFunctionPtr function_ptr);

template<typename KernelFunctionPtr>
apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr);
Expand Down
5 changes: 4 additions & 1 deletion src/cuda/api/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ handle_t get_descriptor(context::handle_t context_handle, handle_t handle)
* value, which is interpolated between the nearest corresponding array elements.
*
* @note CUDA only supports arrays of 2 or 3 dimensions.
*
* @note Instances of this class do _not_ keep devices' primary contexts
* alive/active - just like memory allocations (but unlike events and streams).
*/
template <typename T, dimensionality_t NumDimensions>
class array_t {
Expand Down Expand Up @@ -209,7 +212,7 @@ array_t<T,NumDimensions> create(

template <typename T, dimensionality_t NumDimensions>
array_t<T,NumDimensions> create(
device_t device,
const device_t& device,
dimensions_t<NumDimensions> dimensions);


Expand Down
5 changes: 5 additions & 0 deletions src/cuda/api/constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@ enum : bool {
do_not_take_ownership = false,
};

enum : bool {
do_hold_primary_context_refcount_unit = true,
do_not_hold_primary_context_refcount_unit = false,
};

namespace context {

namespace detail_ {
Expand Down
12 changes: 6 additions & 6 deletions src/cuda/api/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,8 @@ class context_t {
: device_id_(device_id), context_handle_(context_handle) { }
///@endcond

device_t associated_device() const;
context_t associated_context() const;
device_t associated_device() const;
context_t associated_context() const;

/**
* Allocate a region of memory on the device
Expand Down Expand Up @@ -385,7 +385,7 @@ class context_t {
/**
* @return the maximum grid depth at which a thread can issue the device
* runtime call `cudaDeviceSynchronize()` / `cuda::device::synchronize()`
* to wait on child grid launches to complete.
* to wait on child grid launches to complete.
*
* @todo Is this really a feature of the context? Not of the device?
*/
Expand Down Expand Up @@ -661,7 +661,7 @@ class context_t {

// Deleted since the handle_t and handle_t are constant
context_t& operator=(context_t&& other) noexcept
{
{
::std::swap(device_id_, other.device_id_);
::std::swap(handle_, other.handle_);
::std::swap(owning_, other.owning_);
Expand Down Expand Up @@ -741,12 +741,12 @@ inline handle_t create_and_push(
context_t create(
device_t device,
host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic,
bool keep_larger_local_mem_after_resize = false);
bool keep_larger_local_mem_after_resize = false);

context_t create_and_push(
device_t device,
host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic,
bool keep_larger_local_mem_after_resize = false);
bool keep_larger_local_mem_after_resize = false);

namespace current {

Expand Down
168 changes: 66 additions & 102 deletions src/cuda/api/current_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ inline id_t get_id()
// Should we activate and push the default device's context? probably not.
return default_device_id;
}
throw_if_error(status, "Failed obtaining the current context for determining which "
"device is active");
throw_if_error(status,
"Failed obtaining the current context for determining which device is active");

if (current_context_handle == context::detail_::none) {
// Should we activate and push the default device's context? probably not.
Expand All @@ -75,41 +75,69 @@ inline id_t get_id()
* @note This replaces the current CUDA context (rather than pushing a context
* onto the stack), so use with care.
*
* @note This causes a primary context for the device to be created, if it
* doesn't already exist. I'm not entirely sure regarding the conditions under
* which it will be destroyed, however.
*
* @param[in] device Numeric ID of the device to make current
*/
inline void set(id_t device_id)
///@{

/**
* @note: The primary context reference count will be increased by calling this
* function, except if the following conditions are met:
*
* 1. The primary context handle was specified as a parameter (i.e.
* we got something other than @ref detail_::none was for nit).
* 2. The current context is the desired device's primary context.
*
* USE WITH EXTRA CARE!
*/
inline context::handle_t set_with_aux_info(
id_t device_id,
bool driver_is_initialized,
context::handle_t current_context_handle = context::detail_::none,
context::handle_t device_pc_handle = context::detail_::none)
{
context::handle_t current_context_handle;
bool have_current_context;
auto status = cuCtxGetCurrent(&current_context_handle);
if (status == CUDA_ERROR_NOT_INITIALIZED) {
if (not driver_is_initialized) {
initialize_driver();
// Should we activate and PUSH the default device's context? probably not.
have_current_context = false;
}
else {
have_current_context = (current_context_handle != context::detail_::none);
device_pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);
context::current::detail_::set(device_pc_handle);
return device_pc_handle;
}
if (have_current_context) {
auto current_context_device_id = context::detail_::get_device_id(current_context_handle);
if (current_context_device_id == device_id) {
return;
if (current_context_handle != context::detail_::none) {
if (current_context_handle == device_pc_handle) {
return device_pc_handle;
}
}
auto device_pc_is_active = device::primary_context::detail_::is_active(device_id);
bool need_refcount_increase = not device_pc_is_active;
auto dev_pc_handle = device::primary_context::detail_::get_handle(device_id, need_refcount_increase);
context::current::detail_::set(dev_pc_handle);

device_pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);
if (current_context_handle == device_pc_handle) {
return device_pc_handle;
}
context::current::detail_::set(device_pc_handle); // Remember: This _replaces_ the current context
return device_pc_handle;
}

// ... which is the equivalent of doing:
// auto status = cudaSetDevice(device_id);
// throw_if_error(status, "Failure setting current device to " + ::std::to_string(device_id));
/**
* @brief Ensures activation of a device's primary context and makes that
* context current, placing it at the top of the context stack - and
* replacing the previous top stack element if one existed.
*
* @note This causes a primary context for the device to be created
* ("activated"), if it doesn't already exist - in which case it also "leaks"
* a reference count unit, setting the refcount at 1. On the other hand,
* if the primary context was already active, the reference count is _not_
* increased - regardless of whether the primary context was the current
* context or not.
*
* @note This should be equivalent to `cudaSetDevice(device_id)` + error
* checking.
*/
inline void set(id_t device_id)
{
context::handle_t current_context_handle;
auto status = cuCtxGetCurrent(&current_context_handle);
bool driver_initialized = (status == CUDA_ERROR_NOT_INITIALIZED);
set_with_aux_info(device_id, driver_initialized, current_context_handle);
// Note: We can safely assume the refcount was increased.
}
///@}

/**
* Set the first possible of several devices to be the current one for the CUDA Runtime API.
Expand All @@ -120,96 +148,32 @@ inline void set(id_t device_id)
* @note this replaces the current CUDA context (rather than pushing a context
* onto the stack), so use with care.
*/
inline void set(const id_t* device_ids, size_t num_devices)
inline void set(const id_t *device_ids, size_t num_devices)
{
if (num_devices > static_cast<size_t>(cuda::device::count())) {
throw cuda::runtime_error(status::invalid_device, "More devices listed than exist on the system");
}
auto result = cudaSetValidDevices(const_cast<int*>(device_ids), (int) num_devices);
throw_if_error(result, "Failure setting the current device to any of the list of "
auto result = cudaSetValidDevices(const_cast<int *>(device_ids), (int) num_devices);
throw_if_error(result,
"Failure setting the current device to any of the list of "
+ ::std::to_string(num_devices) + " devices specified");
}

} // namespace detail

/**
* @note See the out-of-`detail_::` version of this class.
* Tells the CUDA runtime API to consider the specified device as the current one.
*
* @note Perhaps it would be better to keep a copy of the current context ID in a
* member of this class, instead of on the stack?
*
* @note we have no guarantee that the context stack is not altered during
* the lifetime of this object; but - we assume it wasn't, and it's up to the users
* of this class to assure that's the case or face the consequences.
*
* @note We don't want to use the cuda::context::detail_scoped_override_t
* as the implementation, since we're not simply pushing and popping
* @note this will replace the top of the context stack, if the stack isn't empty;
* and will create/activate the device's primary context if it isn't already active.
*/

class scoped_context_override_t {
public:
explicit scoped_context_override_t(id_t device_id) :
device_id_(device_id),
refcount_was_nonzero(device::primary_context::detail_::is_active(device_id))
{
auto top_of_context_stack = context::current::detail_::get_handle();
if (top_of_context_stack != context::detail_::none) {
context::current::detail_::push(top_of_context_stack); // Yes, we're pushing a copy of the same context
}
device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack
primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);

// auto top_of_context_stack = context::current::detail_::get_handle();
// device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack
// primary_context = device::primary_context::detail_::get_handle(device_id);
// context::current::detail_::push(primary_context);
}
~scoped_context_override_t() {
context::current::detail_::pop();
//#else
// auto popped_context_handle = context::current::detail_::pop();
// if (popped_context_handle != primary_context_handle) {
// throw ::std::logic_error("Expected the top of the context stack to hold the primary context of "
// + device::detail_::identify(device_id_));
// }
//#endif
if (refcount_was_nonzero) {
device::primary_context::detail_::decrease_refcount(device_id_);
// We intentionally "leak" a refcount, as otherwise, the primary context
// gets destroyed after we have created it - and we don't want that happening.
}

}
device::id_t device_id_;
primary_context::handle_t primary_context_handle;
bool refcount_was_nonzero;
};


} // namespace detail_
void set(const device_t& device);

/**
* Reset the CUDA Runtime API's current device to its default value - the default device
*/
inline void set_to_default() { return detail_::set(device::default_device_id); }

void set(const device_t& device);

/**
* A RAII-like mechanism for setting the CUDA Runtime API's current device for
* what remains of the current scope, and changing it back to its previous value
* when exiting the scope.
*
* @note The description says "RAII-like" because the reality is more complex. The
* runtime API sets a device by overwriting the current
*/
class scoped_override_t : private detail_::scoped_context_override_t {
protected:
using parent = detail_::scoped_context_override_t;
public:
scoped_override_t(const device_t& device);
scoped_override_t(device_t&& device);
~scoped_override_t() = default;
};

/**
* This macro will set the current device for the remainder of the scope in which it is
* invoked, and will change it back to the previous value when exiting the scope. Use
Expand Down
Loading

0 comments on commit 16c287d

Please sign in to comment.