Skip to content

Commit

Permalink
Fixes #582, fixes #564:
Browse files Browse the repository at this point in the history
* launch config <-> device validation now checks for block cooperation support when that's requested
* Refactored and re-located some of the launch config validation code
* Added: `device_t` method for checking block cooperation support
* Now properly validating grid dimensions to ensure we don't exceed the maxima
* Made sure the code paths inwards from the non-detail_ launching functions to the actual CUDA API calls all have appropriate validation calls
* Comment and spacing tweaks
  • Loading branch information
eyalroz committed Feb 9, 2024
1 parent d0ef615 commit db7a03e
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 85 deletions.
11 changes: 11 additions & 0 deletions src/cuda/api/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,17 @@ class device_t {
return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
}

#if CUDA_VERSION >= 12000
/**
 * Reports whether this device can launch kernels whose grid blocks are
 * grouped into "clusters", where the blocks of a cluster can pool their
 * shared memory together.
 *
 * @return the device's CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH attribute,
 * converted to a boolean
 */
bool supports_block_clustering() const
{
	const bool clustering_supported = get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
	return clustering_supported;
}
#endif

#if CUDA_VERSION >= 11020
/**
* True if this device supports executing kernels in which blocks can
Expand Down
24 changes: 1 addition & 23 deletions src/cuda/api/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,6 @@ void enqueue_raw_kernel_launch_in_current_context(
using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
"Only a bona fide function can be launched as a CUDA kernel");
#ifndef NDEBUG
validate(launch_configuration);
#endif
if (not launch_configuration.has_nondefault_attributes()) {
// regular plain vanilla launch
kernel_function <<<
Expand Down Expand Up @@ -323,26 +320,7 @@ void enqueue_launch(
Kernel&& kernel,
const stream_t& stream,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters)
{
static_assert(
detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
"All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
#if CUDA_VERSION >= 12000
static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
#else
static constexpr const bool library_kernel = false;
#endif // CUDA_VERSION >= 12000
// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
// function for each of them will basically be just a one-liner :-(
detail_::enqueue_launch<Kernel, KernelParameters...>(
detail_::bool_constant<wrapped_contextual_kernel>{},
detail_::bool_constant<library_kernel>{},
::std::forward<Kernel>(kernel), stream, launch_configuration,
::std::forward<KernelParameters>(parameters)...);
}
KernelParameters&&... parameters);

/**
* Variant of @ref enqueue_launch for use with the default stream in the current context.
Expand Down
44 changes: 37 additions & 7 deletions src/cuda/api/launch_config_builder.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
/**
* @file
*
* @brief Contains the @ref launch
* @brief Contains the @ref `cuda::launch_config_builder_t` class definition
*
* @note Launch configurations are used mostly in @ref kernel_launch.hpp .
* @note Launch configurations are used mostly in @ref `kernel_launch.hpp`.
*/

#pragma once
Expand Down Expand Up @@ -178,6 +178,7 @@ class launch_config_builder_t {

// The dimension settings accumulated by the builder so far. Every field is
// an optional, so a field stays empty until a value is assigned to it.
struct {
// Block dimensions, if set
optional<grid::block_dimensions_t > block;
// Block cluster dimensions, if set; assigned by cluster_blocks()
optional<grid::dimensions_t > block_cluster;
// Grid dimensions (in blocks), if set; validate_cluster_dimensions() checks
// requested cluster dimensions against this value
optional<grid::dimensions_t > grid;
// Overall dimensions, if set - presumably the extent of the whole grid
// in threads; TODO confirm against grid::overall_dimensions_t
optional<grid::overall_dimensions_t> overall;
} dimensions_;
Expand All @@ -188,7 +189,7 @@ class launch_config_builder_t {
// but the semantic is that if the determiner is not null, we use it;
// and if you want to force a concrete a priori value, then you nullify
// the determiner
kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ {nullptr };
kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
memory::shared::size_t dynamic_shared_memory_size_ { 0 };

const kernel_t* kernel_ { nullptr };
Expand Down Expand Up @@ -224,15 +225,15 @@ class launch_config_builder_t {
memory::shared::size_t shared_mem_size)
{
if (kernel_ptr == nullptr) { return; }
detail_::validate_compatibility(*kernel_ptr, shared_mem_size);
detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
}

// Checks a shared memory size for compatibility with a device, given the
// device's id if one is known. Does nothing when no device id is available.
//
// maybe_device_id: id of the device to check against; may be empty
// shared_mem_size: the shared memory size forwarded to the detail_ checks
static void validate_compatibility(
optional<device::id_t> maybe_device_id,
memory::shared::size_t shared_mem_size)
{
// Without a device there is nothing to check the size against
if (not maybe_device_id) { return; }
detail_::validate_compatibility(device(maybe_device_id), shared_mem_size);
detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
}

void validate_dynamic_shared_memory_size(memory::shared::size_t size)
Expand Down Expand Up @@ -269,6 +270,15 @@ class launch_config_builder_t {
validate_block_dimension_compatibility(device_, block_dims);
}


/**
 * Device-compatibility check for grid dimensions; skipped entirely when
 * there is no device id to check against.
 *
 * @param maybe_device_id id of the device to check against; when empty,
 * nothing is checked
 * @param block_dims the dimensions forwarded to
 * detail_::validate_grid_dimension_compatibility()
 *
 * @note NOTE(review): the second parameter is named and typed as block
 * dimensions, while validate_composite_dimensions() passes it the grid
 * dimensions - confirm the intended name and type.
 */
static void validate_grid_dimension_compatibility(
	optional<device::id_t> maybe_device_id,
	grid::block_dimensions_t block_dims)
{
	if (maybe_device_id) {
		detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
	}
}

void validate_grid_dimensions(grid::dimensions_t grid_dims) const
{
detail_::validate_grid_dimensions(grid_dims);
Expand All @@ -279,6 +289,16 @@ class launch_config_builder_t {
// TODO: Check divisibility
}

#if CUDA_VERSION >= 12000
/**
 * Checks requested block-cluster dimensions against the grid dimensions
 * already set in this builder, if any.
 *
 * @param cluster_dims the requested dimensions of each block cluster
 *
 * @throws ::std::runtime_error if grid dimensions have been set and
 * @p cluster_dims do not divide them. Nothing is checked while the grid
 * dimensions are still unset.
 */
void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
{
	if (not dimensions_.grid) { return; }
	// The error case is the cluster dimensions NOT dividing the grid
	// dimensions; the check must therefore be negated.
	if (not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
		throw ::std::runtime_error("The requested block cluster dimensions do not "
			"divide the grid dimensions (in blocks)");
	}
}
#endif // CUDA_VERSION >= 12000

void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
{
if (dimensions_.block and dimensions_.grid) {
Expand Down Expand Up @@ -309,7 +329,8 @@ class launch_config_builder_t {
get_composite_dimensions().block;
validate_block_dimension_compatibility(device_id, block_dims);
}
validate_compatibility(device_id, dynamic_shared_memory_size_);
detail_::validate_compatibility(
device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
}

void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
Expand All @@ -318,7 +339,7 @@ class launch_config_builder_t {
validate_block_dimension_compatibility(device_, composite_dims.block);

// Is there anything to validate regarding the grid dims?
validate_block_dimension_compatibility(device_, composite_dims.grid);
validate_grid_dimension_compatibility(device_, composite_dims.grid);
}
#endif // ifndef NDEBUG

Expand Down Expand Up @@ -378,6 +399,15 @@ class launch_config_builder_t {
return *this;
}

/**
 * Sets the dimensions of the block clusters for the launch configuration
 * being built.
 *
 * @param cluster_dims the requested dimensions of each block cluster
 * @return this builder, to allow chaining of further settings
 *
 * @note In debug builds against CUDA 12.0 or later, the dimensions are
 * first checked with validate_cluster_dimensions(), which may throw.
 */
launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
{
	// validate_cluster_dimensions() is only declared for CUDA >= 12.0, so
	// the call must carry the same version guard for debug builds against
	// older CUDA versions to compile.
#if !defined(NDEBUG) && CUDA_VERSION >= 12000
	validate_cluster_dimensions(cluster_dims);
#endif
	dimensions_.block_cluster = cluster_dims;
	return *this;
}

launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
{
#ifndef NDEBUG
Expand Down
12 changes: 3 additions & 9 deletions src/cuda/api/launch_configuration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ namespace detail_ {

// Note: This will not check anything related to the device or the kernel
// with which the launch configuration is to be used
inline void validate(launch_configuration_t launch_config) noexcept(false)
inline void validate(const launch_configuration_t& launch_config) noexcept(false)
{
validate_block_dimensions(launch_config.dimensions.block);
validate_grid_dimensions(launch_config.dimensions.grid);
Expand All @@ -223,15 +223,9 @@ inline void validate_compatibility(
// validate_grid_dimension_compatibility(device, launch_config.dimensions.grid);
}

inline void validate_compatibility(
void validate_compatibility(
const kernel_t& kernel,
launch_configuration_t launch_config) noexcept(false)
{
validate(launch_config);
validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
// Uncomment if we actually get such checks
// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
}
launch_configuration_t launch_config) noexcept(false);

using launch_attribute_index_t = unsigned int;

Expand Down
Loading

0 comments on commit db7a03e

Please sign in to comment.