Fixes #582, fixes #564:

* launch config <-> device validation now checks for block cooperation support when that's requested
* Refactored and re-located some of the launch config validation code
* Added: `device_t` method for checking block cooperation support
* Now properly validating grid dimensions to ensure we don't exceed the maxima
* Made sure the code paths inwards from the non-detail_ launching functions to the actual CUDA API calls all have appropriate validation calls
* Comment and spacing tweaks
eyalroz · Feb 9, 2024 · db7a03e · db7a03e
1 parent d0ef615
commit db7a03e
Show file tree

Hide file tree

Showing 7 changed files with 222 additions and 85 deletions.
diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp
@@ -357,6 +357,17 @@ class device_t {
 		return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
 	}
 
+#if CUDA_VERSION >= 12000
+	/**
+	 * Queries the device's CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH attribute.
+	 *
+	 * @return true if this device supports launching kernels whose grid
+	 * blocks are grouped into "clusters", which can pool their shared
+	 * memory together
+	 */
+	bool supports_block_clustering() const
+	{
+		const auto cluster_launch_attribute = get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
+		return cluster_launch_attribute;
+	}
+#endif // CUDA_VERSION >= 12000
+
 #if CUDA_VERSION >= 11020
 	/**
 	 * True if this device supports executing kernels in which blocks can

diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp
@@ -178,9 +178,6 @@ void enqueue_raw_kernel_launch_in_current_context(
 	using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
 	static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
 		"Only a bona fide function can be launched as a CUDA kernel");
-#ifndef NDEBUG
-	validate(launch_configuration);
-#endif
 	if (not launch_configuration.has_nondefault_attributes()) {
 		// regular plain vanilla launch
 		kernel_function <<<
@@ -323,26 +320,7 @@ void enqueue_launch(
 	Kernel&&                kernel,
 	const stream_t&         stream,
 	launch_configuration_t  launch_configuration,
-	KernelParameters&&...   parameters)
-{
-	static_assert(
-		detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
-		"All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
-	static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
-#if CUDA_VERSION >= 12000
-	static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
-#else
-	static constexpr const bool library_kernel = false;
-#endif // CUDA_VERSION >= 12000
-	// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
-	// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
-	// function for each of them will basically be just a one-liner :-(
-	detail_::enqueue_launch<Kernel, KernelParameters...>(
-		detail_::bool_constant<wrapped_contextual_kernel>{},
-		detail_::bool_constant<library_kernel>{},
-		::std::forward<Kernel>(kernel), stream, launch_configuration,
-		::std::forward<KernelParameters>(parameters)...);
-}
+	KernelParameters&&...   parameters);
 
 /**
  * Variant of @ref enqueue_launch for use with the default stream in the current context.

diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp
@@ -1,9 +1,9 @@
 /**
  * @file
  *
- * @brief Contains the @ref launch
+ * @brief Contains the @ref `cuda::launch_config_builder_t` class definition
  *
- * @note Launch configurations are  used mostly in @ref kernel_launch.hpp . 
+ * @note Launch configurations are used mostly in @ref `kernel_launch.hpp`.
  */
 
 #pragma once
@@ -178,6 +178,7 @@ class launch_config_builder_t {
 
 	struct {
 		optional<grid::block_dimensions_t  > block;
+		optional<grid::dimensions_t        > block_cluster;
 		optional<grid::dimensions_t        > grid;
 		optional<grid::overall_dimensions_t> overall;
 	} dimensions_;
@@ -188,7 +189,7 @@ class launch_config_builder_t {
 	// but the semantic is that if the determiner is not null, we use it;
 	// and if you want to force a concrete apriori value, then you nullify
 	// the determiner
-	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ {nullptr };
+	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
 	memory::shared::size_t dynamic_shared_memory_size_ { 0 };
 
 	const kernel_t* kernel_ { nullptr };
@@ -224,15 +225,15 @@ class launch_config_builder_t {
 		memory::shared::size_t  shared_mem_size)
 	{
 		if (kernel_ptr == nullptr) { return; }
-		detail_::validate_compatibility(*kernel_ptr, shared_mem_size);
+		detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
 	}
 
 	static void validate_compatibility(
 		optional<device::id_t> maybe_device_id,
 		memory::shared::size_t shared_mem_size)
 	{
 		if (not maybe_device_id) { return; }
-		detail_::validate_compatibility(device(maybe_device_id), shared_mem_size);
+		detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
 	}
 
 	void validate_dynamic_shared_memory_size(memory::shared::size_t size)
@@ -269,6 +270,15 @@ class launch_config_builder_t {
 		validate_block_dimension_compatibility(device_, block_dims);
 	}
 
+
+	/**
+	 * Forwards grid dimensions to the device-level grid-dimension
+	 * compatibility check, provided a device has been specified.
+	 *
+	 * @param maybe_device_id id of the device to check against; when this
+	 *     is empty, nothing is checked
+	 * @param grid_dims the dimensions of the grid to be checked
+	 *
+	 * @note Any failure is reported by
+	 *     `detail_::validate_grid_dimension_compatibility()`
+	 */
+	static void validate_grid_dimension_compatibility(
+		optional<device::id_t>  maybe_device_id,
+		grid::dimensions_t      grid_dims)
+	{
+		if (not maybe_device_id) { return; }
+		detail_::validate_grid_dimension_compatibility(device(maybe_device_id), grid_dims);
+	}
+
 	void validate_grid_dimensions(grid::dimensions_t grid_dims) const
 	{
 		detail_::validate_grid_dimensions(grid_dims);
@@ -279,6 +289,16 @@ class launch_config_builder_t {
 		// TODO: Check divisibility
 	}
 
+#if CUDA_VERSION >= 12000
+	/**
+	 * Ensures requested block-cluster dimensions are consistent with the
+	 * grid dimensions already set in this builder, if any.
+	 *
+	 * @param cluster_dims the requested dimensions of each block cluster
+	 *
+	 * @throws ::std::runtime_error if grid dimensions have been set and the
+	 *     cluster dimensions do not divide them
+	 */
+	void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
+	{
+		// With no grid dimensions set, there is nothing to check divisibility against
+		if (not dimensions_.grid) { return; }
+		// The failure case is when the cluster dimensions do NOT divide the grid's
+		if (not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
+			throw ::std::runtime_error("The requested block cluster dimensions do not "
+				"divide the grid dimensions (in blocks)");
+		}
+	}
+#endif // CUDA_VERSION >= 12000
+
 	void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
 	{
 		if (dimensions_.block and dimensions_.grid) {
@@ -309,7 +329,8 @@ class launch_config_builder_t {
 				get_composite_dimensions().block;
 			validate_block_dimension_compatibility(device_id, block_dims);
 		}
-		validate_compatibility(device_id, dynamic_shared_memory_size_);
+		detail_::validate_compatibility(
+			device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
 	}
 
 	void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
@@ -318,7 +339,7 @@ class launch_config_builder_t {
 		validate_block_dimension_compatibility(device_, composite_dims.block);
 
 		// Is there anything to validate regarding the grid dims?
-		validate_block_dimension_compatibility(device_, composite_dims.grid);
+		validate_grid_dimension_compatibility(device_, composite_dims.grid);
 	}
 #endif // ifndef NDEBUG
 
@@ -378,6 +399,15 @@ class launch_config_builder_t {
 		return *this;
 	}
 
+	/**
+	 * Sets the block-cluster dimensions to be used in the launch
+	 * configuration being built.
+	 *
+	 * @param cluster_dims the requested dimensions of each block cluster
+	 * @return this builder, so that calls can be chained
+	 *
+	 * @note In debug builds against CUDA 12.0 or later, the dimensions are
+	 *     first checked with `validate_cluster_dimensions()`, which may throw.
+	 */
+	launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
+	{
+		// validate_cluster_dimensions() is only defined when CUDA_VERSION >= 12000,
+		// so the call must not be compiled for earlier CUDA versions
+#if !defined(NDEBUG) && CUDA_VERSION >= 12000
+		validate_cluster_dimensions(cluster_dims);
+#endif
+		dimensions_.block_cluster = cluster_dims;
+		return *this;
+	}
+
 	launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
 	{
 #ifndef NDEBUG

diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp
@@ -207,7 +207,7 @@ namespace detail_ {
 
 // Note: This will not check anything related to the device or the kernel
 // with which the launch configuration is to be used
-inline void validate(launch_configuration_t launch_config) noexcept(false)
+inline void validate(const launch_configuration_t& launch_config) noexcept(false)
 {
 	validate_block_dimensions(launch_config.dimensions.block);
 	validate_grid_dimensions(launch_config.dimensions.grid);
@@ -223,15 +223,9 @@ inline void validate_compatibility(
 	//	validate_grid_dimension_compatibility(device, launch_config.dimensions.grid);
 }
 
-inline void validate_compatibility(
+void validate_compatibility(
 	const kernel_t& kernel,
-	launch_configuration_t launch_config) noexcept(false)
-{
-	validate(launch_config);
-	validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
-	//  Uncomment if we actually get such checks
-	//	validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
-}
+	launch_configuration_t launch_config) noexcept(false);
 
 using launch_attribute_index_t = unsigned int;