Fixes #327, #334.

* Switched many methods which require a context, and take a `device_t` by value, to taking it by const-reference - to make use of device_t's mutable primary-context cache. * Host memory allocations no longer increase primary context refcounts, nor do their freeing decrease the refcount * The following proxy class instances can now keep primary contexts alive with a reference count unit (depending on how they were constructed): `device_t`, `primary_context_t`, `kernel_t`, `stream_t`, `event_t`, `module_t`. * Some doxygen comment corrections and additions * Added bool aliases regarding whether or not a proxy object should hold a primary context refcount unit. * Dropped: The `scoped_` and `unscoped_` bool-aliases which were used with `device_t::primary_context()`; we now indicate clearly whether the proxy object should or should not hold a refcount unit * Spaces -> tabs fixes in `context.hpp` * Added an alternative method for setting the current device (`device::current::detail_::set_with_aux_info()`) that requires performing less API calls. * The combined behavior of `device_t`, `primary_context_t` and `current_device::detail_::scoped_context_override` was such, that `make_unique()` with no device passed, and with the primary context inactive, would cause a primary context to be activated, then destroyed/deactivated immediately after allocation. This no longer happens. * No longer using `current_device::detail_::scoped_context_override`; now preferring `device_t`'s primary context caching mechanism + delegating to methods which take a (not necessarily primary) context. * The scoped context override gadget now has two flags: One for popping the context, another for reducing the primary context refcount; the logic for using them has been updated. * `device_t` now uses an explicit flag for when it's holding a primary context refcount unit; this allows in-context copies of a `device_t` to know the primary context without holding a refcount unit. * Spacing tweaks * Corrected an exception message in `stream.hpp`. * CAVEAT: Some DRY in the code handling refcount units in the various proxy class.
eyalroz · Jun 5, 2022 · 16c287d · 16c287d
1 parent 23a78ba
commit 16c287d
Show file tree

Hide file tree

Showing 22 changed files with 651 additions and 340 deletions.
diff --git a/src/cuda/api/apriori_compiled_kernel.hpp b/src/cuda/api/apriori_compiled_kernel.hpp
@@ -45,9 +45,10 @@ inline handle_t get_handle(const void *kernel_function_ptr, const char* name = n
 
 apriori_compiled_kernel_t wrap(
 	device::id_t device_id,
-	context::handle_t context_id,
+	context::handle_t primary_context_handle,
 	kernel::handle_t f,
-	const void* ptr);
+	const void* ptr,
+	bool hold_primary_context_refcount_unit = false);
 
 
 } // namespace detail_
@@ -394,23 +395,33 @@ class apriori_compiled_kernel_t final : public kernel_t {
 #endif // CAN_GET_APRIORI_KERNEL_HANDLE
 
 protected: // ctors & dtor
-	apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle,
-		kernel::handle_t handle, const void *f)
-		: kernel_t(device_id, context_handle, handle), ptr_(f) {
+	apriori_compiled_kernel_t(device::id_t device_id, context::handle_t primary_context_handle,
+		kernel::handle_t handle, const void *f, bool hold_pc_refcount_unit)
+	: kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f) {
 		// TODO: Consider checking whether this actually is a device function, at all and in this context
 #ifndef NDEBUG
 		assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer");
 #endif
 	}
-	apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, const void *f)
-		: apriori_compiled_kernel_t(device_id, context_handle, kernel::detail_::get_handle(f), f) { }
+	apriori_compiled_kernel_t(
+		device::id_t device_id,
+		context::handle_t primary_context_handle,
+		const void *f,
+		bool hold_primary_context_refcount_unit)
+	: apriori_compiled_kernel_t(
+		device_id,
+		primary_context_handle,
+		kernel::detail_::get_handle(f),
+		f,
+		hold_primary_context_refcount_unit)
+	{ }
 
 public: // ctors & dtor
 	apriori_compiled_kernel_t(const apriori_compiled_kernel_t&) = default;
 	apriori_compiled_kernel_t(apriori_compiled_kernel_t&&) = default;
 
 public: // friends
-	friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*);
+	friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool);
 
 protected: // data members
 	const void *const ptr_;
@@ -420,12 +431,13 @@ namespace kernel {
 namespace detail_ {
 
 inline apriori_compiled_kernel_t wrap(
-	device::id_t device_id,
-	context::handle_t context_id,
-	kernel::handle_t f,
-	const void *ptr)
+	device::id_t       device_id,
+	context::handle_t  primary_context_handle,
+	kernel::handle_t   f,
+	const void *       ptr,
+	bool               hold_primary_context_refcount_unit)
 {
-	return {device_id, context_id, f, ptr};
+	return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
 }
 
 #if ! CAN_GET_APRIORI_KERNEL_HANDLE
@@ -438,8 +450,12 @@ inline ::std::string identify(const apriori_compiled_kernel_t& kernel)
 
 } // namespace detail
 
+/**
+ * @note The returned kernel proxy object will keep the device's primary
+ * context active while the kernel exists.
+ */
 template<typename KernelFunctionPtr>
-apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr);
+apriori_compiled_kernel_t get(const device_t& device, KernelFunctionPtr function_ptr);
 
 template<typename KernelFunctionPtr>
 apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr);

diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp
@@ -134,6 +134,9 @@ handle_t get_descriptor(context::handle_t context_handle, handle_t handle)
  * value, which is interpolated between the nearest corresponding array elements.
  *
  * @note CUDA only supports arrays of 2 or 3 dimensions.
+ *
+ * @note Instances of this class do _not_ keep devices' primary contexts
+ * alive/active - just like memory allocations (but unlike events and streams).
  */
 template <typename T, dimensionality_t NumDimensions>
 class array_t {
@@ -209,7 +212,7 @@ array_t<T,NumDimensions> create(
 
 template <typename T, dimensionality_t NumDimensions>
 array_t<T,NumDimensions> create(
-	device_t                     device,
+	const device_t&              device,
 	dimensions_t<NumDimensions>  dimensions);
 
 

diff --git a/src/cuda/api/constants.hpp b/src/cuda/api/constants.hpp
@@ -116,6 +116,11 @@ enum : bool {
 	do_not_take_ownership = false,
 };
 
+enum : bool {
+	do_hold_primary_context_refcount_unit = true,
+	do_not_hold_primary_context_refcount_unit = false,
+};
+
 namespace context {
 
 namespace detail_ {

diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp
@@ -244,8 +244,8 @@ class context_t {
 			: device_id_(device_id), context_handle_(context_handle) { }
 		///@endcond
 
-		device_t associated_device() const;
-		context_t associated_context() const;
+        device_t associated_device() const;
+        context_t associated_context() const;
 
 		/**
 		 * Allocate a region of memory on the device
@@ -385,7 +385,7 @@ class context_t {
 	/**
 	 * @return the maximum grid depth at which a thread can issue the device
 	 * runtime call `cudaDeviceSynchronize()` / `cuda::device::synchronize()`
-	 * to wait on child grid launches to complete.
+     * to wait on child grid launches to complete.
 	 *
 	 * @todo Is this really a feature of the context? Not of the device?
 	 */
@@ -661,7 +661,7 @@ class context_t {
 
 	// Deleted since the handle_t and handle_t are constant
 	context_t& operator=(context_t&& other) noexcept
-  	{
+	{
 		::std::swap(device_id_, other.device_id_);
 		::std::swap(handle_, other.handle_);
 		::std::swap(owning_, other.owning_);
@@ -741,12 +741,12 @@ inline handle_t create_and_push(
 context_t create(
 	device_t                               device,
 	host_thread_synch_scheduling_policy_t  synch_scheduling_policy = heuristic,
-	bool                                   keep_larger_local_mem_after_resize = false);
+    bool                                   keep_larger_local_mem_after_resize = false);
 
 context_t create_and_push(
 	device_t                               device,
 	host_thread_synch_scheduling_policy_t  synch_scheduling_policy = heuristic,
-	bool                                   keep_larger_local_mem_after_resize = false);
+    bool                                   keep_larger_local_mem_after_resize = false);
 
 namespace current {
 

diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp
@@ -52,8 +52,8 @@ inline id_t get_id()
 		// Should we activate and push the default device's context? probably not.
 		return default_device_id;
 	}
-	throw_if_error(status, "Failed obtaining the current context for determining which "
-		"device is active");
+	throw_if_error(status,
+		"Failed obtaining the current context for determining which device is active");
 
 	if (current_context_handle == context::detail_::none) {
 		// Should we activate and push the default device's context? probably not.
@@ -75,41 +75,69 @@ inline id_t get_id()
  * @note This replaces the current CUDA context (rather than pushing a context
  * onto the stack), so use with care.
  *
- * @note This causes a primary context for the device to be created, if it
- * doesn't already exist. I'm not entirely sure regarding the conditions under
- * which it will be destroyed, however.
- *
  * @param[in] device Numeric ID of the device to make current
  */
-inline void set(id_t device_id)
+///@{
+
+/**
+ * @note: The primary context reference count will be increased by calling this
+ * function, except if the following conditions are met:
+ *
+ * 1. The primary context handle was specified as a parameter (i.e.
+ *    we got something other than @ref detail_::none was for nit).
+ * 2. The current context is the desired device's primary context.
+ *
+ * USE WITH EXTRA CARE!
+ */
+inline context::handle_t set_with_aux_info(
+	id_t device_id,
+	bool driver_is_initialized,
+	context::handle_t current_context_handle = context::detail_::none,
+	context::handle_t device_pc_handle = context::detail_::none)
 {
-	context::handle_t current_context_handle;
-	bool have_current_context;
-	auto status = cuCtxGetCurrent(&current_context_handle);
-	if (status == CUDA_ERROR_NOT_INITIALIZED) {
+	if (not driver_is_initialized) {
 		initialize_driver();
-		// Should we activate and PUSH the default device's context? probably not.
-		have_current_context = false;
-	}
-	else {
-		have_current_context = (current_context_handle != context::detail_::none);
+		device_pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);
+		context::current::detail_::set(device_pc_handle);
+		return device_pc_handle;
 	}
-	if (have_current_context) {
-		auto current_context_device_id = context::detail_::get_device_id(current_context_handle);
-		if (current_context_device_id == device_id) {
-			return;
+	if (current_context_handle != context::detail_::none) {
+		if (current_context_handle == device_pc_handle) {
+			return device_pc_handle;
 		}
 	}
-	auto device_pc_is_active = device::primary_context::detail_::is_active(device_id);
-	bool need_refcount_increase = not device_pc_is_active;
-	auto dev_pc_handle = device::primary_context::detail_::get_handle(device_id, need_refcount_increase);
-	context::current::detail_::set(dev_pc_handle);
-
+	device_pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);
+	if (current_context_handle == device_pc_handle) {
+		return device_pc_handle;
+	}
+	context::current::detail_::set(device_pc_handle); // Remember: This _replaces_ the current context
+	return device_pc_handle;
+}
 
-	// ... which is the equivalent of doing:
-	// auto status = cudaSetDevice(device_id);
-	// throw_if_error(status, "Failure setting current device to " + ::std::to_string(device_id));
+/**
+ * @brief Ensures activation of a device's primary context and makes that
+ * context current, placing it at the top of the context stack - and
+ * replacing the previous top stack element if one existed.
+ *
+ * @note This causes a primary context for the device to be created
+ * ("activated"), if it doesn't already exist - in which case it also "leaks"
+ * a reference count unit, setting the refcount at 1. On the other hand,
+ * if the primary context was already active, the reference count is _not_
+ * increased - regardless of whether the primary context was the current
+ * context or not.
+ *
+ * @note This should be equivalent to `cudaSetDevice(device_id)` + error
+ * checking.
+ */
+inline void set(id_t device_id)
+{
+	context::handle_t current_context_handle;
+	auto status = cuCtxGetCurrent(&current_context_handle);
+	bool driver_initialized = (status == CUDA_ERROR_NOT_INITIALIZED);
+	set_with_aux_info(device_id, driver_initialized, current_context_handle);
+	// Note: We can safely assume the refcount was increased.
 }
+///@}
 
 /**
  * Set the first possible of several devices to be the current one for the CUDA Runtime API.
@@ -120,96 +148,32 @@ inline void set(id_t device_id)
  * @note this replaces the current CUDA context (rather than pushing a context
  * onto the stack), so use with care.
  */
-inline void set(const id_t* device_ids, size_t num_devices)
+inline void set(const id_t *device_ids, size_t num_devices)
 {
 	if (num_devices > static_cast<size_t>(cuda::device::count())) {
 		throw cuda::runtime_error(status::invalid_device, "More devices listed than exist on the system");
 	}
-	auto result = cudaSetValidDevices(const_cast<int*>(device_ids), (int) num_devices);
-	throw_if_error(result, "Failure setting the current device to any of the list of "
+	auto result = cudaSetValidDevices(const_cast<int *>(device_ids), (int) num_devices);
+	throw_if_error(result,
+		"Failure setting the current device to any of the list of "
 		+ ::std::to_string(num_devices) + " devices specified");
 }
 
+} // namespace detail
+
 /**
- * @note See the out-of-`detail_::` version of this class.
+ * Tells the CUDA runtime API to consider the specified device as the current one.
  *
- * @note Perhaps it would be better to keep a copy of the current context ID in a
- * member of this class, instead of on the stack?
- *
- * @note we have no guarantee that the context stack is not altered during
- * the lifetime of this object; but - we assume it wasn't, and it's up to the users
- * of this class to assure that's the case or face the consequences.
- *
- * @note We don't want to use the cuda::context::detail_scoped_override_t
- * as the implementation, since we're not simply pushing and popping
+ * @note this will replace the top of the context stack, if the stack isn't empty;
+ * and will create/activate the device's primary context if it isn't already active.
  */
-
-class scoped_context_override_t {
-public:
-	explicit scoped_context_override_t(id_t device_id) :
-		device_id_(device_id),
-		refcount_was_nonzero(device::primary_context::detail_::is_active(device_id))
-	{
-		auto top_of_context_stack = context::current::detail_::get_handle();
-		if (top_of_context_stack != context::detail_::none) {
-			context::current::detail_::push(top_of_context_stack); // Yes, we're pushing a copy of the same context
-		}
-		device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack
-		primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id);
-
-//		auto top_of_context_stack = context::current::detail_::get_handle();
-//		device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack
-//		primary_context = device::primary_context::detail_::get_handle(device_id);
-//		context::current::detail_::push(primary_context);
-	}
-	~scoped_context_override_t() {
-		context::current::detail_::pop();
-//#else
-//		auto popped_context_handle = context::current::detail_::pop();
-//		if (popped_context_handle != primary_context_handle) {
-//			throw ::std::logic_error("Expected the top of the context stack to hold the primary context of "
-//				+ device::detail_::identify(device_id_));
-//		}
-//#endif
-		if (refcount_was_nonzero) {
-			device::primary_context::detail_::decrease_refcount(device_id_);
-			// We intentionally "leak" a refcount, as otherwise, the primary context
-			// gets destroyed after we have created it - and we don't want that happening.
-		}
-
-	}
-	device::id_t device_id_;
-	primary_context::handle_t primary_context_handle;
-	bool refcount_was_nonzero;
-};
-
-
-} // namespace detail_
+void set(const device_t& device);
 
 /**
  * Reset the CUDA Runtime API's current device to its default value - the default device
  */
 inline void set_to_default() { return detail_::set(device::default_device_id); }
 
-void set(const device_t& device);
-
-/**
- * A RAII-like mechanism for setting the CUDA Runtime API's current device for
- * what remains of the current scope, and changing it back to its previous value
- * when exiting the scope.
- *
- * @note The description says "RAII-like" because the reality is more complex. The
- * runtime API sets a device by overwriting the current
- */
-class scoped_override_t : private detail_::scoped_context_override_t {
-protected:
-	using parent = detail_::scoped_context_override_t;
-public:
-	scoped_override_t(const device_t& device);
-	scoped_override_t(device_t&& device);
-	~scoped_override_t() = default;
-};
-
 /**
  * This macro will set the current device for the remainder of the scope in which it is
  * invoked, and will change it back to the previous value when exiting the scope. Use