Fixes #473, regards #304: Host-function launching-related changes:
* No longer allocating heap memory on enqueue and releasing it during launch - only passing pointers the user has provided. Part of the motivation for this is enabling stream capture and re-execution of the launch.
* Separated a method for enqueuing no-argument callables and enqueuing functions which take a single (pointer) argument.
* Enqueued callables no longer receive a stream argument (CUDA has moved away from this convention, and we cannot support it without the heap-allocation scheme we had before).
* `#ifdef`'ed out the parts of `launch_config_builder.hpp` which require CUDA 10.0 or later (essentially, obtaining minimum dimensions for maximum occupancy).
* Dropped some redundant comments in `stream.hpp` about the choice of API functions
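To illustrate the first bullet (a sketch, not code from this commit; it uses the new `host_invokable` entry point shown in the `stream.hpp` diff below): since enqueuing no longer copies anything to the heap, the invokable is used by address and must stay alive until the stream reaches the call:

```cpp
int foo = 42;
auto greet = [&foo] { ::std::cout << "Hello " << foo << " world!\n"; };
my_stream.enqueue.host_invokable(greet); // only &greet is passed - no heap copy is made
my_stream.synchronize(); // `greet` (and `foo`) must outlive the enqueued call
```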
eyalroz authored and Eyal Rozenberg committed Mar 9, 2023
1 parent 30fa9af commit 86f6d45
Showing 9 changed files with 110 additions and 117 deletions.
7 changes: 2 additions & 5 deletions README.md
@@ -117,11 +117,8 @@ The [Milestones](https://github.com/eyalroz/cuda-api-wrappers/milestones) indicate

We've all dreamed of being able to type in:

-my_stream.enqueue.callback(
-	[&foo](cuda::stream_t stream, cuda::status_t status) {
-		::std::cout << "Hello " << foo << " world!\n";
-	}
-);
+auto callback = [&foo] { ::std::cout << "Hello " << foo << " world!\n"; };
+my_stream.enqueue.host_invokable(callback);

... and have that just work, right? Well, now it does!

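The README shows the no-argument entry point; its companion for plain functions taking a single pointer argument (per the `host_function_call` signature added to `stream.hpp` in this commit) would be used along these lines - a sketch, with names of our choosing:

```cpp
void report(int* counter) { ::std::cout << "count: " << *counter << '\n'; }

// ... later, given some cuda::stream_t my_stream:
int counter = 0;
my_stream.enqueue.host_function_call(report, &counter);
my_stream.synchronize(); // `counter` must outlive the enqueued call
```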
19 changes: 9 additions & 10 deletions examples/by_runtime_api_module/event_management.cu
@@ -106,17 +106,16 @@ int main(int argc, char **argv)

stream.enqueue.kernel_launch(print_message<N,1>, { 1, 1 }, message<N>("I am launched before the first event"));
stream.enqueue.event(event_1);
-	stream.enqueue.host_function_call(
-		[&event_1, &event_2](const cuda::stream_t&) {
-			report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
-		}
-	);
+	auto first_callback = [&] {
+		report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
+	};
+	stream.enqueue.host_invokable(first_callback);
	stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
-	stream.enqueue.host_function_call(
-		[&event_1, &event_2](const cuda::stream_t& ) {
-			report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
-		}
-	);
+	auto second_callback = [&] {
+		report_occurrence("In second callback (enqueued after the first kernel but before the second event)",
+			event_1, event_2);
+	};
+	stream.enqueue.host_invokable(second_callback);
stream.enqueue.event(event_2);
stream.enqueue.kernel_launch(print_message<N,3>, { 1, 1 }, message<N>("I am launched after the second event"));
stream.enqueue.event(event_3);
6 changes: 3 additions & 3 deletions examples/by_runtime_api_module/execution_control.cu
@@ -115,9 +115,9 @@ int main(int argc, char **argv)
launch_config_4 = launch_config_2;
launch_config_4 = ::std::move(launch_config_3);
[[maybe_unused]] cuda::launch_configuration_t launch_config_5{::std::move(launch_config_2)};
-	// In case the `[[maybe_unused]]` attribute is ignored, let's try to trick the compiler
-	// into thinking we're actually using launch_config_4.
-	launch_config_4.dimensions == launch_config.dimensions;
+		// In case the `[[maybe_unused]]` attribute is ignored, let's try to trick the compiler
+		// into thinking we're actually using launch_config_4.
+		launch_config_4.dimensions == launch_config.dimensions;
}

cuda::launch(kernel_function, launch_config, bar);
11 changes: 5 additions & 6 deletions examples/by_runtime_api_module/stream_management.cu
@@ -166,12 +166,11 @@ int main(int argc, char **argv)
auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking);
stream_1.enqueue.kernel_launch(print_message<N,3>, single_thread_config, message<N>("I'm on stream 1"));
stream_1.enqueue.memset(buffer.get(), 'b', buffer_size);
-	stream_1.enqueue.host_function_call(
-		[&buffer](cuda::stream_t) {
-			::std::cout << "Callback from stream 1!... \n";
-			print_first_char(buffer.get());
-		}
-	);
+	auto callback = [&]() {
+		::std::cout << "Callback from stream 1!... \n";
+		print_first_char(buffer.get());
+	};
+	stream_1.enqueue.host_invokable(callback);
auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto num_blocks = div_rounding_up(buffer_size, threads_per_block);
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
10 changes: 4 additions & 6 deletions examples/other/io_compute_overlap_with_streams.cu
@@ -135,12 +135,10 @@ int main(int, char **)
buffer_set.device_result.get(),
num_elements);
stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size);
-		stream.enqueue.host_function_call(
-			[=](cuda::stream_t) {
-				::std::cout
-					<< "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << ::std::endl;
-			}
-		);
+		auto callback = [=] {
+			::std::cout << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << ::std::endl;
+		};
+		stream.enqueue.host_invokable(callback);
}
::std::this_thread::sleep_for(::std::chrono::microseconds(50000));
for(auto& stream : streams) { stream.synchronize(); }
10 changes: 10 additions & 0 deletions src/cuda/api/launch_config_builder.hpp
@@ -73,11 +73,13 @@ class launch_config_builder_t {
{
grid::composite_dimensions_t result;
if (saturate_with_active_blocks_) {
+#if CUDA_VERSION >= 10000
if (use_min_params_for_max_occupancy_) {
throw ::std::logic_error(
"Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
"the grid with fixed-size cubs.");
}
+#endif
if (not (kernel_)) {
throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
}
@@ -94,6 +96,7 @@
result.grid = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
return result;
}
+#if CUDA_VERSION >= 10000
if (use_min_params_for_max_occupancy_) {
if (not (kernel_)) {
throw ::std::logic_error("A kernel must be set to determine the minimum grid parameter sfor m");
@@ -108,6 +111,7 @@
result.grid = composite_dims.grid;
return result;
}
+#endif
if (dimensions_.block and dimensions_.overall) {
result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
result.block = dimensions_.block.value();
@@ -175,7 +179,9 @@
const kernel_t* kernel_ { nullptr };
optional<device::id_t> device_;
bool saturate_with_active_blocks_ { false };
+#if CUDA_VERSION >= 10000
bool use_min_params_for_max_occupancy_ { false };
+#endif

static cuda::device_t device(optional<device::id_t> maybe_id)
{
@@ -528,7 +534,9 @@
}
dimensions_.grid = nullopt;
dimensions_.overall = nullopt;
+#if CUDA_VERSION >= 10000
use_min_params_for_max_occupancy_ = false;
+#endif
saturate_with_active_blocks_ = true;
return *this;
}
@@ -541,7 +549,9 @@
dimensions_.block = nullopt;
dimensions_.grid = nullopt;
dimensions_.overall = nullopt;
+#if CUDA_VERSION >= 10000
use_min_params_for_max_occupancy_ = true;
+#endif
saturate_with_active_blocks_ = false;
return *this;
}
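For context, the feature being version-guarded above would be used along these lines. This is a sketch only: the builder method names are assumptions inferred from the flag members in this diff (`use_min_params_for_max_occupancy_`, `saturate_with_active_blocks_`), so check the header for the actual API:

```cpp
// Hypothetical usage of the CUDA-10.0-and-later feature being #ifdef'ed:
auto config = cuda::launch_config_builder()
	.kernel(&my_kernel)              // a cuda::kernel_t, required for occupancy queries
	.min_params_for_max_occupancy()  // assumed name; sets use_min_params_for_max_occupancy_
	.build();
my_stream.enqueue.kernel_launch(my_kernel, config, my_first_arg);
```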
4 changes: 2 additions & 2 deletions src/cuda/api/multi_wrapper_impls/pointer.hpp
@@ -59,8 +59,8 @@ inline context_t context_of(const void* ptr)
void* value_ptrs[] = {&device_id, &context_handle};
pointer::detail_::get_attributes(2, attributes, value_ptrs, ptr);
#else
-auto context_handle = pointer::detail_::context_handle_of(ptr_);
-auto device_id = context::detail_::get_device_id(ptr_);
+auto context_handle = pointer::detail_::context_handle_of(ptr);
+auto device_id = context::detail_::get_device_id(context_handle);
#endif
return context::wrap(device_id, context_handle);
}
147 changes: 64 additions & 83 deletions src/cuda/api/stream.hpp
Expand Up @@ -116,8 +116,6 @@ inline handle_t create_raw_in_current_context(
CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
handle_t new_stream_handle;
auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
-	// We could instead have used an equivalent Driver API call:
-	// cuStreamCreateWithPriority(cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
throw_if_error_lazy(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
return new_stream_handle;
}
@@ -150,6 +148,9 @@ inline void record_event_in_current_context(
stream::handle_t stream_handle,
event::handle_t event_handle);

+template <typename Function>
+void enqueue_function_call(const stream_t& stream, Function function, void * argument);

} // namespace detail_

/**
@@ -304,59 +305,21 @@

protected: // static methods

-	/**
-	 * A function used internally by this class as the host function to call directly; see
-	 * @ref enqueue_t::host_function_call - but only with CUDA version 10.0 and later.
-	 *
-	 * @param stream_handle the ID of the stream for which a host function call was triggered - this
-	 * will be passed by the CUDA runtime
-	 * @param stream_wrapper_members_and_callable a tuple, containing the information necessary to
-	 * recreate the wrapper with which the callback is associated, without any additional CUDA API calls -
-	 * plus the callable which was passed to @ref enqueue_t::host_function_call, and which the programmer
-	 * actually wants to be called.
-	 *
-	 * @note instances of this template are of type {@ref callback_t}.
-	 */
-	template <typename Callable>
-	static void CUDA_CB stream_launched_host_function_adapter(void * stream_wrapper_members_and_callable)
-	{
-		using tuple_type = ::std::tuple<device::id_t, context::handle_t , stream::handle_t, Callable>;
-		auto* tuple_ptr = reinterpret_cast<tuple_type *>(stream_wrapper_members_and_callable);
-		auto unique_ptr_to_tuple = ::std::unique_ptr<tuple_type>{tuple_ptr}; // Ensures deletion when we leave this function.
-		auto device_id = ::std::get<0>(*unique_ptr_to_tuple.get());
-		auto context_handle = ::std::get<1>(*unique_ptr_to_tuple.get());
-		auto stream_handle = ::std::get<2>(*unique_ptr_to_tuple.get());
-		const auto& callable = ::std::get<3>(*unique_ptr_to_tuple.get());
-		callable( stream_t{device_id, context_handle, stream_handle, do_not_take_ownership} );
-	}

/**
	 * @brief A counterpart of @ref `stream_launched_host_function_adapter`, for use with the old-style CUDA Runtime API call,
-	 * which passes more arguments to the callable - and calls the host function even on device failures.
+	 * which passes more arguments to the invokable - and calls the host function even on device failures.
*
* @param stream_handle the ID of the stream for which a host function call was triggered - this
* will be passed by the CUDA runtime
	 * @note status indicates the CUDA status when the host function call is triggered; anything
* other than @ref `cuda::status::success` means there's been a device error previously - but
-	 * in that case, we won't invoke the callable, as such execution is deprecated; see:
+	 * in that case, we won't invoke the invokable, as such execution is deprecated; see:
* https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM
-	 * @param device_id_and_callable a pair-value, containing the ID of the device to which the stream launching
-	 * the host function call is associated, as well as the callable callback which was passed to
+	 * @param device_id_and_invokable a pair-value, containing the ID of the device to which the stream launching
+	 * the host function call is associated, as well as the invokable callback which was passed to
* @ref enqueue_t::host_function_call, and which the programmer actually wants to be called.
*/
-	template <typename Callable>
-	static void callback_launch_adapter(
-		stream::handle_t,
-		status_t status,
-		void * stream_wrapper_members_and_callable)
-	{
-		if (status != cuda::status::success) {
-			using tuple_type = ::std::tuple<device::id_t, context::handle_t , stream::handle_t, Callable>;
-			delete reinterpret_cast<tuple_type*>(stream_wrapper_members_and_callable);
-			return;
-		}
-		stream_launched_host_function_adapter<Callable>(stream_wrapper_members_and_callable);
-	}


public: // mutators

@@ -518,53 +481,37 @@
bool records_timing = event::do_record_timings,
bool interprocess = event::not_interprocess) const;

+#if CUDA_VERSION >= 10000
/**
-	 * Execute the specified function on the calling host thread once all
+	 * Execute the specified function on the calling host thread, after all
* hereto-scheduled work on this stream has been completed.
*
-	 * @param callable_ a function to execute on the host. It must be callable
-	 * with two parameters: `cuda::stream::handle_t stream_handle, cuda::event::handle_t event_handle`
+	 * @param function a plain function to execute on the host; it must be invokable
+	 * with a single pointer argument
+	 * @param argument the pointer which will be passed to @p function when it is invoked
*/
-	template <typename Callable>
-	void host_function_call(Callable callable_) const
+	template <typename Argument>
+	void host_function_call(void (*function)(Argument*), Argument* argument) const
{
-		context::current::detail_::scoped_override_t set_context_for_this_scope(associated_stream.context_handle_);
-
-		// Since callable_ will be going out of scope after the enqueueing,
-		// and we don't know anything about the scope of the original argument with
-		// which we were called, we must make a copy of `callable_` on the heap
-		// and pass that as the user-defined data. We also add information about
-		// the enqueueing stream.
-		auto raw_callable_extra_argument = new
-			::std::tuple<device::id_t, context::handle_t, stream::handle_t, Callable>(
-				associated_stream.device_id_,
-				associated_stream.context_handle_,
-				associated_stream.handle(),
-				Callable(::std::move(callable_))
-			);
-
-		// While we always register the same static function, `callback_adapter` as the
-		// callback - what it will actually _do_ is invoke the callback we were passed.
-
-#if CUDA_VERSION >= 10000
-		auto status = cuLaunchHostFunc(
-			associated_stream.handle_, &stream_launched_host_function_adapter<Callable>, raw_callable_extra_argument);
-		// Could have used the equivalent Driver API call: cuLaunchHostFunc()
-#else
-		// The nVIDIA runtime API (at least up to v10.2) requires passing 0 as the flags
-		// variable, see:
-		// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
-		static constexpr const unsigned fixed_flags { 0u };
-		auto status = cuStreamAddCallback(
-			associated_stream.handle_, &callback_launch_adapter<Callable>, raw_callable_extra_argument, fixed_flags);
-		// Could have used the equivalent Driver API call: cuAddStreamCallback()
+		// I hope you like function declaration punning :-)
+		stream::detail_::enqueue_function_call(
+			associated_stream, reinterpret_cast<stream::detail_::callback_t>(function), argument);
	}
+#endif

-		throw_if_error_lazy(status, "Failed scheduling a callback to be launched on "
-			+ stream::detail_::identify(associated_stream.handle_,
-				associated_stream.context_handle_, associated_stream.device_id_));
+private:
+	template <typename Invokable>
+	static void CUDA_CB stream_launched_invoker(void* type_erased_invokable) {
+		auto invokable = reinterpret_cast<Invokable*>(type_erased_invokable);
+		(*invokable)();
+	}
+
+public:
+	template <typename Invokable>
+	void host_invokable(Invokable& invokable) const
+	{
+		auto type_erased_invoker = reinterpret_cast<stream::detail_::callback_t>(stream_launched_invoker<Invokable>);
+		stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
+	}

#if CUDA_VERSION >= 11020
/**
@@ -999,6 +946,40 @@ inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr address,
return cuStreamWriteValue64(stream_handle, address, value, flags);
}

+/**
+ * Enqueues a call of a host-side function on a stream, to be executed once all
+ * previously-scheduled work on that stream has concluded; used to implement both
+ * @ref enqueue_t::host_function_call and @ref enqueue_t::host_invokable.
+ *
+ * @param stream the stream on whose work queue the function call is placed
+ * @param function the type-erased host function to call; see @ref callback_t
+ * @param argument a pointer which will be passed to @p function when it is invoked
+ */
+template <typename Function>
+void enqueue_function_call(const stream_t& stream, Function function, void* argument)
+{
+	context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
+
+#if CUDA_VERSION >= 10000
+	auto status = cuLaunchHostFunc(stream.handle(), function, argument);
+#else
+	// The nVIDIA runtime API (at least up to v10.2) requires passing 0 as the flags
+	// variable; see:
+	// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
+	static constexpr const unsigned fixed_flags { 0u };
+	auto status = cuStreamAddCallback(stream.handle(), function, argument, fixed_flags);
+#endif
+	throw_if_error_lazy(status, "Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
+}

} // namespace detail_

/**
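A note on the "function declaration punning" the new `host_function_call` relies on: a `void (*)(Argument*)` is `reinterpret_cast` to the `void (*)(void*)`-shaped `callback_t` and called through that type - which is presumably why the new `host_function_call` is itself guarded for CUDA 10.0 and later, where `callback_t` is the single-argument `CUhostFn`. Here is a standalone sketch of the pattern (ours, not from the library); calling through the cast pointer is formally undefined behavior in C++, but it works on mainstream ABIs, where all object-pointer parameters share one representation:

```cpp
#include <cstdio>

using erased_fn = void (*)(void*); // stands in for stream::detail_::callback_t

struct payload { int x; };

void typed_handler(payload* p) { std::printf("x = %d\n", p->x); }

// Stands in for cuLaunchHostFunc: accepts only the type-erased signature.
void launch_host_func(erased_fn fn, void* arg) { fn(arg); }

int main() {
	payload p{42};
	// The cast "puns" the function's declared parameter type:
	launch_host_func(reinterpret_cast<erased_fn>(&typed_handler), &p); // prints "x = 42"
	return 0;
}
```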
13 changes: 11 additions & 2 deletions src/cuda/api/types.hpp
@@ -348,6 +348,17 @@ enum : priority_t {
default_priority = 0
};

+namespace detail_ {
+
+#if CUDA_VERSION >= 10000
+using callback_t = CUhostFn;
+#else
+using callback_t = CUstreamCallback;
+#endif
+
+} // namespace detail_


} // namespace stream

namespace grid {
@@ -913,8 +924,6 @@ using handle_t = CUfunction;

} // namespace kernel

-using callback_t = CUhostFn;

// The C++ standard library doesn't offer ::std::dynarray (although it almost did),
// and we won't introduce our own here. So...
template <typename T>
