Fixes #284, #285; helper_cuda.h changes, other minor changes:
* For #284: Introduced a grid-and-block-dimensions structure, `grid::complete_dimensions_t`, which is now used instead of an `std::pair` when returning both grid and block dimensions; it has equality comparison operators.
* For #285: Changed the construction pattern for `kernel_t` (see the usage sketch after this list):
  * Dropped the templated, wrapping, direct constructor.
  * Added `kernel::detail_::wrap()` taking a device ID and an arbitrary (function) pointer, and a `kernel::wrap()` taking a device ID and type-erased `const void*` pointer.
  * Made the lower-level `wrap()` a friend of the `kernel_t` class.
* Now using the default destructor for `kernel_t` (this is unrelated to the construction changes).
* Spacing tweaks.
* Comment typo fixes.
* Added not-equal operators for launch configurations
* Added comments to some `#endif`'s, reminding the reader of the condition used in the corresponding `#if` or `#ifdef`.
* Made some narrowing casts explicit, to make their intentionality clear to static analysis tools.
* Added two aliases to the sync/async boolean enum in `cuda::stream`
* A bit of comment rephrasing
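
For illustration, a minimal sketch of the revised construction pattern and the new return type (not part of the commit; the `increment` kernel, the surrounding function, and the entry-point header name are assumptions made for the example):

#include <cuda/api.hpp>  // assumed entry-point header; adjust to the header layout of the library version in use

// A hypothetical kernel, just so there is something to wrap:
__global__ void increment(char* data, size_t length)
{
	size_t global_index = threadIdx.x + blockIdx.x * blockDim.x;
	if (global_index < length) { data[global_index]++; }
}

void launch_sketch(const cuda::device_t& device, cuda::stream_t& stream, char* buffer, size_t buffer_size)
{
	// New pattern: obtain a kernel_t through the wrap() factory function
	// rather than through the (now-dropped) templated direct constructor
	auto kernel = cuda::kernel::wrap(device, increment);

	// min_grid_params_for_max_occupancy() now returns grid::complete_dimensions_t,
	// carrying both the grid and the block dimensions, instead of an std::pair
	auto dims = kernel.min_grid_params_for_max_occupancy();
	auto launch_config = cuda::make_launch_config(dims);

	stream.enqueue.kernel_launch(kernel, launch_config, buffer, buffer_size);
}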

Example program changes:

* Adapted examples for the use of `grid::complete_dimensions_t`.
* Now creating wrapped kernels using `cuda::kernel::wrap()` rather than by direct construction.
* Spacing tweaks.
* Changes to the `chooseCudaDevice()` function in `helper_cuda.h` (see the sketch after this list); mainly:
  * Now returning a `cuda::device_t`
  * No longer making the returned device current. In particular, this means that `simpleStreams.cu` may now be using a device that is not the current one.
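
For illustration, a sketch of the adapted call pattern (not part of the commit; the allocation size and the stream creation are arbitrary, and the example assumes the library headers plus `helper_cuda.h` are included):

int main(int argc, char **argv)
{
	// chooseCudaDevice() now returns a cuda::device_t instead of making a device current
	auto device = chooseCudaDevice(argc, (const char **) argv);

	// Work against the returned device explicitly, rather than relying on the current device
	auto d_data = cuda::memory::device::make_unique<int[]>(device, 1024);
	auto stream = device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);

	// Only if some code path genuinely requires the chosen device to be current:
	// device.make_current();
}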
eyalroz committed Jan 14, 2022
1 parent bc5371d commit 1f38bf9
Showing 12 changed files with 197 additions and 88 deletions.
4 changes: 2 additions & 2 deletions examples/by_runtime_api_module/event_management.cu
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
constexpr size_t buffer_size = 12345678;
auto buffer = cuda::memory::managed::make_unique<char[]>(
buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
cuda::grid::block_dimension_t threads_per_block = cuda::kernel_t(device, increment).attributes().maxThreadsPerBlock;
cuda::grid::block_dimension_t threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock;
cuda::grid::dimension_t num_blocks = (buffer_size + threads_per_block - 1) / threads_per_block;
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);

@@ -113,7 +113,7 @@ int main(int argc, char **argv)
stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
stream.enqueue.host_function_call(
[&event_1, &event_2](cuda::stream_t) {
report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
}
);
stream.enqueue.event(event_2);
8 changes: 4 additions & 4 deletions examples/by_runtime_api_module/execution_control.cu
Expand Up @@ -63,7 +63,7 @@ int main(int argc, char **argv)

auto device = cuda::device::get(device_id).make_current();
std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n";
cuda::kernel_t kernel(device, kernel_function);
auto kernel = cuda::kernel::wrap(device, kernel_function);

// ------------------------------------------
// Attributes without a specific API call
@@ -179,12 +179,12 @@ int main(int argc, char **argv)
cuda::outstanding_error::clear();
}
#endif
cuda::kernel_t non_cooperative_kernel(device, kernel_function);
auto non_cooperative_kernel = cuda::kernel::wrap(device, kernel_function);
auto non_cooperative_config = launch_config;
non_cooperative_config.block_cooperation = true;
std::cout
<< "Launching kernel " << kernel_name
<< " with " << num_blocks << " blocks, un-cooperatively, using stream.launch()\n" << std::flush;
<< "Launching kernel " << kernel_name << " with "
<< num_blocks << " blocks, un-cooperatively, using stream.launch()\n" << std::flush;
stream.enqueue.kernel_launch(non_cooperative_kernel, non_cooperative_config, bar);
stream.synchronize();

2 changes: 1 addition & 1 deletion examples/by_runtime_api_module/stream_management.cu
@@ -172,7 +172,7 @@ int main(int argc, char **argv)
print_first_char(buffer.get());
}
);
auto threads_per_block = cuda::kernel_t(device, increment).attributes().maxThreadsPerBlock;
auto threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock;
auto num_blocks = (buffer_size + threads_per_block - 1) / threads_per_block;
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
// TODO: The following doesn't have much of a meaningful effect; we should modify this example
35 changes: 25 additions & 10 deletions examples/modified_cuda_samples/helper_cuda.h
@@ -61,7 +61,7 @@ inline std::ostream& operator<< (std::ostream& os, const cuda::device::compute_c

#ifdef __CUDA_RUNTIME_H__
// General GPU Device CUDA Initialization
inline void gpuDeviceInit(int device_id)
/*inline void gpuDeviceInit(int device_id)
{
auto device_count = cuda::device::count();
@@ -91,8 +91,22 @@ inline void gpuDeviceInit(int device_id)
}
device.make_current();
}*/

static void ensure_device_is_usable(const cuda::device_t device)
{
auto properties = device.properties();

if (not properties.usable_for_compute()) {
die_("Error: device " + std::to_string(device.id()) + "is running with <Compute Mode Prohibited>.");
}

	if (properties.compute_capability().major() < 1) {
die_("CUDA device " + std::to_string(device.id()) + " does not support CUDA.\n");
}
}


// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId()
{
@@ -145,19 +159,20 @@ inline int gpuGetMaxGflopsDeviceId()
}

// Initialization code to find the best CUDA Device
inline void chooseCudaDevice(int argc, const char **argv)
// Unlike in NVIDIA's original helper_cuda.h, this does _not_
// make the chosen device current.
inline cuda::device_t chooseCudaDevice(int argc, const char **argv)
{
cuda::device::id_t device_id;
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, argv, "device"))
{
device_id = getCmdLineArgumentInt(argc, argv, "device=");

if (device_id < 0) { die_("Invalid command line parameter"); }
else
{
gpuDeviceInit(device_id);
auto device_id = getCmdLineArgumentInt(argc, argv, "device=");
if (device_id < 0) {
die_("Invalid command line parameter");
}
auto device = cuda::device::get(device_id);
ensure_device_is_usable(device);
return device;
}
else
{
@@ -166,7 +181,7 @@ inline void chooseCudaDevice(int argc, const char **argv)
std::cout << "GPU Device " << best_device.id() << ": ";
std::cout << "\"" << best_device.name() << "\" ";
std::cout << "with compute capability " << best_device.properties().compute_capability() << "\n";
best_device.make_current();
return best_device;
}
}

@@ -137,9 +137,9 @@ void enqueue_p2p_copy(
P2PEngine p2p_mechanism,
cuda::stream_t& stream)
{
auto copy_kernel = cuda::kernel_t(stream.device(), copyp2p);
auto params = copy_kernel.min_grid_params_for_max_occupancy();
auto launch_config = cuda::make_launch_config(params.first, params.second);
auto copy_kernel = cuda::kernel::wrap(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
auto launch_config = cuda::make_launch_config(grid_and_block_dims);


if (p2p_mechanism == SM && p2paccess)
@@ -423,7 +423,7 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer
	// relatively low. Higher repetitions will cause the delay kernel
// to timeout and lead to unstable results.
*flag = 0;
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::dimensions_t::point());
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
streams[i].enqueue.kernel_launch(delay, single_thread, flag, default_timeout_clocks);
streams[i].enqueue.event(start[i]);

19 changes: 9 additions & 10 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -166,11 +166,10 @@ int main(int argc, char **argv)
}

std::cout << "\n> ";
chooseCudaDevice(argc, (const char **)argv);
auto current_device = cuda::device::current::get();
auto device = chooseCudaDevice(argc, (const char **)argv);

// Checking for compute capabilities
auto properties = current_device.properties();
auto properties = device.properties();
auto compute_capability = properties.compute_capability();

if (compute_capability < cuda::device::compute_capability_t({1, 1}) ) {
@@ -224,7 +223,7 @@ int main(int argc, char **argv)
default: // should not be able to get here
exit(EXIT_FAILURE);
}
current_device.set_synch_scheduling_policy(policy);
device.set_synch_scheduling_policy(policy);
// Not necessary: Since CUDA 3.2 (which is below the minimum supported
// version for the API wrappers), all contexts allow such mapping.
// current_device.enable_mapping_host_memory();
@@ -237,8 +236,8 @@

// allocate device memory
// pointers to data and init value in the device memory
auto d_a = cuda::memory::device::make_unique<int[]>(current_device, n);
auto d_c = cuda::memory::device::make_unique<int>(current_device);
auto d_a = cuda::memory::device::make_unique<int[]>(device, n);
auto d_c = cuda::memory::device::make_unique<int>(device);
cuda::memory::copy_single(d_c.get(), &c);

std::cout << "\nStarting Test\n";
@@ -247,11 +246,11 @@ int main(int argc, char **argv)
std::vector<cuda::stream_t> streams;
std::generate_n(
std::back_inserter(streams), nstreams,
[&current_device]() {
[&device]() {
// Note: we could omit the specific requirement of synchronization
// with the default stream, since that's the CUDA default - but I
// think it's important to state that's the case
return current_device.create_stream(
return device.create_stream(
cuda::stream::implicitly_synchronizes_with_default_stream);
}
);
@@ -260,8 +259,8 @@
// use blocking sync
auto use_blocking_sync = (device_sync_method == cudaDeviceBlockingSync);

auto start_event = cuda::event::create(current_device, use_blocking_sync);
auto stop_event = cuda::event::create(current_device, use_blocking_sync);
auto start_event = cuda::event::create(device, use_blocking_sync);
auto stop_event = cuda::event::create(device, use_blocking_sync);

// time memcopy from device
start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
2 changes: 1 addition & 1 deletion src/cuda/api/ipc.hpp
@@ -16,7 +16,7 @@
* here. In addition to the free-standing functions, the class
* @ref cuda::memory::ipc::imported_t is defined, usable by receiving
* processes as an 'adapter' to incoming handles which may be passed
* as-is to code requiring a propoer pointer.
* as-is to code requiring a proper pointer.
*
*/
#pragma once
32 changes: 25 additions & 7 deletions src/cuda/api/kernel.hpp
@@ -22,10 +22,17 @@ namespace cuda {
///@cond
class device_t;
class stream_t;
class kernel_t;
///@endcond

namespace kernel {

namespace detail_ {

inline kernel_t wrap(device::id_t device_id, const void* ptr);

} // namespace detail_

/**
* @brief a wrapper around `cudaFuncAttributes`, offering
* a few convenience member functions.
@@ -129,15 +136,13 @@ class kernel_t {
* (1-dimensional), and the first element being the minimum number of such blocks necessary
* for keeping the GPU "busy" (again, in a 1-dimensional grid).
*/
::std::pair<grid::dimension_t, grid::block_dimension_t>
min_grid_params_for_max_occupancy(
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false);

template <typename UnaryFunction>
::std::pair<grid::dimension_t, grid::block_dimension_t>
min_grid_params_for_max_occupancy(
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
UnaryFunction block_size_to_dynamic_shared_mem_size,
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false);
@@ -198,9 +203,9 @@ class kernel_t {
}

public: // ctors & dtor
template <typename DeviceFunction>
kernel_t(const device_t& device, DeviceFunction f);
~kernel_t() { };
~kernel_t() = default;

friend kernel_t kernel::detail_::wrap(device::id_t, const void* ptr);

protected: // data members
const device::id_t device_id_;
@@ -262,8 +267,21 @@ auto unwrap(Kernel f) -> typename ::std::conditional<
return detail_::unwrap_inner<Kernel, KernelParameters...>(got_a_kernel_t{}, f);
}

namespace detail_ {

inline kernel_t wrap(device::id_t device_id, const void* function_ptr)
{
return { device_id, reinterpret_cast<const void*>(function_ptr) };
}

} // namespace detail_

template<typename KernelFunctionPtr>
kernel_t wrap(const device_t &device, KernelFunctionPtr function_ptr);

} // namespace kernel


} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNEL_HPP_
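
For illustration, a guess at how the two `wrap()` levels fit together; only the declarations appear in this file's hunks, so the forwarding body below is an assumption, not the library's actual implementation:

namespace cuda {
namespace kernel {

template <typename KernelFunctionPtr>
kernel_t wrap(const device_t& device, KernelFunctionPtr function_ptr)
{
	// The public, templated wrap() presumably type-erases the function pointer and
	// delegates to detail_::wrap(), which - being a friend of kernel_t - may invoke
	// its non-public constructor
	return detail_::wrap(device.id(), reinterpret_cast<const void*>(function_ptr));
}

} // namespace kernel
} // namespace cuda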
10 changes: 5 additions & 5 deletions src/cuda/api/kernel_launch.hpp
@@ -89,7 +89,7 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar
template<typename RawKernel, typename... KernelParameters>
inline void enqueue_launch(
RawKernel kernel_function,
stream::handle_t stream_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters)
#ifndef __CUDACC__
@@ -106,8 +106,8 @@ inline void enqueue_launch(
if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) {
// regular plain vanilla launch
kernel_function <<<
launch_configuration.grid_dimensions,
launch_configuration.block_dimensions,
launch_configuration.dimensions.grid,
launch_configuration.dimensions.block,
launch_configuration.dynamic_shared_memory_size,
stream_handle
>>>(::std::forward<KernelParameters>(parameters)...);
@@ -133,8 +133,8 @@ inline void enqueue_launch(
detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
auto status = cudaLaunchCooperativeKernel(
(const void*) kernel_function,
launch_configuration.grid_dimensions,
launch_configuration.block_dimensions,
launch_configuration.dimensions.grid,
launch_configuration.dimensions.block,
argument_ptrs,
launch_configuration.dynamic_shared_memory_size,
stream_handle);
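
For illustration, a sketch of how the reshaped configuration looks from the caller's side (not part of the commit; the `dimensions` member and `make_launch_config()` appear in the hunks above, while the `operator!=` usage relies on the "not-equal operators" bullet in the commit message):

void configuration_sketch(cuda::kernel_t& kernel, cuda::grid::dimension_t num_blocks, cuda::grid::block_dimension_t threads_per_block)
{
	auto config_a = cuda::make_launch_config(num_blocks, threads_per_block);
	auto config_b = cuda::make_launch_config(kernel.min_grid_params_for_max_occupancy());

	// Grid and block geometry is now grouped in a grid::complete_dimensions_t member
	auto grid_dims  = config_a.dimensions.grid;
	auto block_dims = config_a.dimensions.block;
	(void) grid_dims; (void) block_dims;

	// The newly-added inequality comparison for launch configurations
	if (config_a != config_b) {
		// e.g. prefer the occupancy-derived configuration for the actual launch
	}
}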