Fixes #284, #285; helper_cuda.h changes, other minor changes:
* For #284: Introduced a grid-and-block-dimensions structure, `grid::complete_dimensions_t`, which is now used instead of an `std::pair` when returning both grid and block dimensions; it has equality comparison operators.
* For #285: Changed the construction pattern for `kernel_t` (see the usage sketch after this list):
  * Dropped the templated, wrapping, direct constructor.
  * Added `kernel::detail_::wrap()` taking a device ID and an arbitrary (function) pointer, and a `kernel::wrap()` taking a device ID and type-erased `const void*` pointer.
  * Made the lower-level `wrap()` a friend of the `kernel_t` class.
* Now using the default destructor for `kernel_t` (this is unrelated to the construction changes).
* Spacing tweaks.
* Comment typo fixes.
* Added not-equal operators for launch configurations
* Added comments to some `#endif`'s, reminding the reader of the condition used in the corresponding `#if` or `#ifdef`.
* Made some narrowing casts explicit, to make their intentionality clear to static analysis tools.
* Added two aliases to the sync/async boolean enum in `cuda::stream`
* A bit of comment rephrasing
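
For illustration, a minimal sketch of the revised construction pattern and the new return type (not part of the commit; the `increment` kernel, the surrounding function, and the entry-point header name are assumptions made for the example):

#include <cuda/api.hpp>  // assumed entry-point header; adjust to the header layout of the library version in use

// A hypothetical kernel, just so there is something to wrap:
__global__ void increment(char* data, size_t length)
{
	size_t global_index = threadIdx.x + blockIdx.x * blockDim.x;
	if (global_index < length) { data[global_index]++; }
}

void launch_sketch(const cuda::device_t& device, cuda::stream_t& stream, char* buffer, size_t buffer_size)
{
	// New pattern: obtain a kernel_t through the wrap() factory function
	// rather than through the (now-dropped) templated direct constructor
	auto kernel = cuda::kernel::wrap(device, increment);

	// min_grid_params_for_max_occupancy() now returns grid::complete_dimensions_t,
	// carrying both the grid and the block dimensions, instead of an std::pair
	auto dims = kernel.min_grid_params_for_max_occupancy();
	auto launch_config = cuda::make_launch_config(dims);

	stream.enqueue.kernel_launch(kernel, launch_config, buffer, buffer_size);
}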

Example program changes:

* Adapted examples for the use of `grid::complete_dimensions_t`.
* Now creating wrapped kernels using `cuda::kernel::wrap()` rather than by direct construction.
* Spacing tweaks.
* Changes to the `chooseCudaDevice()` function in `helper_cuda.h` (see the sketch after this list); mainly:
  * Now returning a `cuda::device_t`
  * No longer making the returned device current. In particular, this means that `simpleStreams.cu` may now be using a device that is not the current one.
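
For illustration, a sketch of the adapted call pattern (not part of the commit; the allocation size and the stream creation are arbitrary, and the example assumes the library headers plus `helper_cuda.h` are included):

int main(int argc, char **argv)
{
	// chooseCudaDevice() now returns a cuda::device_t instead of making a device current
	auto device = chooseCudaDevice(argc, (const char **) argv);

	// Work against the returned device explicitly, rather than relying on the current device
	auto d_data = cuda::memory::device::make_unique<int[]>(device, 1024);
	auto stream = device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);

	// Only if some code path genuinely requires the chosen device to be current:
	// device.make_current();
}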
eyalroz committed Jan 14, 2022
1 parent bc5371d commit 1f38bf9
Showing 12 changed files with 197 additions and 88 deletions.
4 changes: 2 additions & 2 deletions examples/by_runtime_api_module/event_management.cu
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
constexpr size_t buffer_size = 12345678;
auto buffer = cuda::memory::managed::make_unique<char[]>(
buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
cuda::grid::block_dimension_t threads_per_block = cuda::kernel_t(device, increment).attributes().maxThreadsPerBlock;
cuda::grid::block_dimension_t threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock;
cuda::grid::dimension_t num_blocks = (buffer_size + threads_per_block - 1) / threads_per_block;
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);

@@ -113,7 +113,7 @@ int main(int argc, char **argv)
stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
stream.enqueue.host_function_call(
[&event_1, &event_2](cuda::stream_t) {
report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
}
);
stream.enqueue.event(event_2);
8 changes: 4 additions & 4 deletions examples/by_runtime_api_module/execution_control.cu
Expand Up @@ -63,7 +63,7 @@ int main(int argc, char **argv)

auto device = cuda::device::get(device_id).make_current();
std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n";
cuda::kernel_t kernel(device, kernel_function);
auto kernel = cuda::kernel::wrap(device, kernel_function);

// ------------------------------------------
// Attributes without a specific API call
@@ -179,12 +179,12 @@ int main(int argc, char **argv)
cuda::outstanding_error::clear();
}
#endif
cuda::kernel_t non_cooperative_kernel(device, kernel_function);
auto non_cooperative_kernel = cuda::kernel::wrap(device, kernel_function);
auto non_cooperative_config = launch_config;
non_cooperative_config.block_cooperation = true;
std::cout
<< "Launching kernel " << kernel_name
<< " with " << num_blocks << " blocks, un-cooperatively, using stream.launch()\n" << std::flush;
<< "Launching kernel " << kernel_name << " with "
<< num_blocks << " blocks, un-cooperatively, using stream.launch()\n" << std::flush;
stream.enqueue.kernel_launch(non_cooperative_kernel, non_cooperative_config, bar);
stream.synchronize();

2 changes: 1 addition & 1 deletion examples/by_runtime_api_module/stream_management.cu
@@ -172,7 +172,7 @@ int main(int argc, char **argv)
print_first_char(buffer.get());
}
);
auto threads_per_block = cuda::kernel_t(device, increment).attributes().maxThreadsPerBlock;
auto threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock;
auto num_blocks = (buffer_size + threads_per_block - 1) / threads_per_block;
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
// TODO: The following doesn't have much of a meaningful effect; we should modify this example
35 changes: 25 additions & 10 deletions examples/modified_cuda_samples/helper_cuda.h
@@ -61,7 +61,7 @@ inline std::ostream& operator<< (std::ostream& os, const cuda::device::compute_c

#ifdef __CUDA_RUNTIME_H__
// General GPU Device CUDA Initialization
inline void gpuDeviceInit(int device_id)
/*inline void gpuDeviceInit(int device_id)
{
auto device_count = cuda::device::count();
@@ -91,8 +91,22 @@ inline void gpuDeviceInit(int device_id)
}
device.make_current();
}*/

static void ensure_device_is_usable(const cuda::device_t device)
{
auto properties = device.properties();

if (not properties.usable_for_compute()) {
die_("Error: device " + std::to_string(device.id()) + "is running with <Compute Mode Prohibited>.");
}

	if (properties.compute_capability().major() < 1) {
die_("CUDA device " + std::to_string(device.id()) + " does not support CUDA.\n");
}
}


// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId()
{
@@ -145,19 +159,20 @@ inline int gpuGetMaxGflopsDeviceId()
}

// Initialization code to find the best CUDA Device
inline void chooseCudaDevice(int argc, const char **argv)
// Unlike in NVIDIA's original helper_cuda.h, this does _not_
// make the chosen device current.
inline cuda::device_t chooseCudaDevice(int argc, const char **argv)
{
cuda::device::id_t device_id;
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, argv, "device"))
{
device_id = getCmdLineArgumentInt(argc, argv, "device=");

if (device_id < 0) { die_("Invalid command line parameter"); }
else
{
gpuDeviceInit(device_id);
auto device_id = getCmdLineArgumentInt(argc, argv, "device=");
if (device_id < 0) {
die_("Invalid command line parameter");
}
auto device = cuda::device::get(device_id);
ensure_device_is_usable(device);
return device;
}
else
{
@@ -166,7 +181,7 @@ inline void chooseCudaDevice(int argc, const char **argv)
std::cout << "GPU Device " << best_device.id() << ": ";
std::cout << "\"" << best_device.name() << "\" ";
std::cout << "with compute capability " << best_device.properties().compute_capability() << "\n";
best_device.make_current();
return best_device;
}
}

@@ -137,9 +137,9 @@ void enqueue_p2p_copy(
P2PEngine p2p_mechanism,
cuda::stream_t& stream)
{
auto copy_kernel = cuda::kernel_t(stream.device(), copyp2p);
auto params = copy_kernel.min_grid_params_for_max_occupancy();
auto launch_config = cuda::make_launch_config(params.first, params.second);
auto copy_kernel = cuda::kernel::wrap(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
auto launch_config = cuda::make_launch_config(grid_and_block_dims);


if (p2p_mechanism == SM && p2paccess)
@@ -423,7 +423,7 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer
	// relatively low. Higher repetitions will cause the delay kernel
// to timeout and lead to unstable results.
*flag = 0;
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::dimensions_t::point());
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
streams[i].enqueue.kernel_launch(delay, single_thread, flag, default_timeout_clocks);
streams[i].enqueue.event(start[i]);

19 changes: 9 additions & 10 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -166,11 +166,10 @@ int main(int argc, char **argv)
}

std::cout << "\n> ";
chooseCudaDevice(argc, (const char **)argv);
auto current_device = cuda::device::current::get();
auto device = chooseCudaDevice(argc, (const char **)argv);

// Checking for compute capabilities
auto properties = current_device.properties();
auto properties = device.properties();
auto compute_capability = properties.compute_capability();

if (compute_capability < cuda::device::compute_capability_t({1, 1}) ) {
@@ -224,7 +223,7 @@ int main(int argc, char **argv)
default: // should not be able to get here
exit(EXIT_FAILURE);
}
current_device.set_synch_scheduling_policy(policy);
device.set_synch_scheduling_policy(policy);
// Not necessary: Since CUDA 3.2 (which is below the minimum supported
// version for the API wrappers), all contexts allow such mapping.
// current_device.enable_mapping_host_memory();
@@ -237,8 +236,8 @@

// allocate device memory
// pointers to data and init value in the device memory
auto d_a = cuda::memory::device::make_unique<int[]>(current_device, n);
auto d_c = cuda::memory::device::make_unique<int>(current_device);
auto d_a = cuda::memory::device::make_unique<int[]>(device, n);
auto d_c = cuda::memory::device::make_unique<int>(device);
cuda::memory::copy_single(d_c.get(), &c);

std::cout << "\nStarting Test\n";
@@ -247,11 +246,11 @@ int main(int argc, char **argv)
std::vector<cuda::stream_t> streams;
std::generate_n(
std::back_inserter(streams), nstreams,
[&current_device]() {
[&device]() {
// Note: we could omit the specific requirement of synchronization
// with the default stream, since that's the CUDA default - but I
// think it's important to state that's the case
return current_device.create_stream(
return device.create_stream(
cuda::stream::implicitly_synchronizes_with_default_stream);
}
);
@@ -260,8 +259,8 @@
// use blocking sync
auto use_blocking_sync = (device_sync_method == cudaDeviceBlockingSync);

auto start_event = cuda::event::create(current_device, use_blocking_sync);
auto stop_event = cuda::event::create(current_device, use_blocking_sync);
auto start_event = cuda::event::create(device, use_blocking_sync);
auto stop_event = cuda::event::create(device, use_blocking_sync);

// time memcopy from device
start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
2 changes: 1 addition & 1 deletion src/cuda/api/ipc.hpp
@@ -16,7 +16,7 @@
* here. In addition to the free-standing functions, the class
* @ref cuda::memory::ipc::imported_t is defined, usable by receiving
* processes as an 'adapter' to incoming handles which may be passed
* as-is to code requiring a propoer pointer.
* as-is to code requiring a proper pointer.
*
*/
#pragma once
32 changes: 25 additions & 7 deletions src/cuda/api/kernel.hpp
@@ -22,10 +22,17 @@ namespace cuda {
///@cond
class device_t;
class stream_t;
class kernel_t;
///@endcond

namespace kernel {

namespace detail_ {

inline kernel_t wrap(device::id_t device_id, const void* ptr);

} // namespace detail_

/**
* @brief a wrapper around `cudaFuncAttributes`, offering
* a few convenience member functions.
@@ -129,15 +136,13 @@ class kernel_t {
* (1-dimensional), and the first element being the minimum number of such blocks necessary
* for keeping the GPU "busy" (again, in a 1-dimensional grid).
*/
::std::pair<grid::dimension_t, grid::block_dimension_t>
min_grid_params_for_max_occupancy(
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false);

template <typename UnaryFunction>
::std::pair<grid::dimension_t, grid::block_dimension_t>
min_grid_params_for_max_occupancy(
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
UnaryFunction block_size_to_dynamic_shared_mem_size,
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false);
@@ -198,9 +203,9 @@ class kernel_t {
}

public: // ctors & dtor
template <typename DeviceFunction>
kernel_t(const device_t& device, DeviceFunction f);
~kernel_t() { };
~kernel_t() = default;

friend kernel_t kernel::detail_::wrap(device::id_t, const void* ptr);

protected: // data members
const device::id_t device_id_;
@@ -262,8 +267,21 @@ auto unwrap(Kernel f) -> typename ::std::conditional<
return detail_::unwrap_inner<Kernel, KernelParameters...>(got_a_kernel_t{}, f);
}

namespace detail_ {

inline kernel_t wrap(device::id_t device_id, const void* function_ptr)
{
return { device_id, reinterpret_cast<const void*>(function_ptr) };
}

} // namespace detail_

template<typename KernelFunctionPtr>
kernel_t wrap(const device_t &device, KernelFunctionPtr function_ptr);

} // namespace kernel


} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNEL_HPP_
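
For illustration, a guess at how the two `wrap()` levels fit together; only the declarations appear in this file's hunks, so the forwarding body below is an assumption, not the library's actual implementation:

namespace cuda {
namespace kernel {

template <typename KernelFunctionPtr>
kernel_t wrap(const device_t& device, KernelFunctionPtr function_ptr)
{
	// The public, templated wrap() presumably type-erases the function pointer and
	// delegates to detail_::wrap(), which - being a friend of kernel_t - may invoke
	// its non-public constructor
	return detail_::wrap(device.id(), reinterpret_cast<const void*>(function_ptr));
}

} // namespace kernel
} // namespace cuda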
10 changes: 5 additions & 5 deletions src/cuda/api/kernel_launch.hpp
@@ -89,7 +89,7 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar
template<typename RawKernel, typename... KernelParameters>
inline void enqueue_launch(
RawKernel kernel_function,
stream::handle_t stream_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters)
#ifndef __CUDACC__
@@ -106,8 +106,8 @@ inline void enqueue_launch(
if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) {
// regular plain vanilla launch
kernel_function <<<
launch_configuration.grid_dimensions,
launch_configuration.block_dimensions,
launch_configuration.dimensions.grid,
launch_configuration.dimensions.block,
launch_configuration.dynamic_shared_memory_size,
stream_handle
>>>(::std::forward<KernelParameters>(parameters)...);
@@ -133,8 +133,8 @@ inline void enqueue_launch(
detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
auto status = cudaLaunchCooperativeKernel(
(const void*) kernel_function,
launch_configuration.grid_dimensions,
launch_configuration.block_dimensions,
launch_configuration.dimensions.grid,
launch_configuration.dimensions.block,
argument_ptrs,
launch_configuration.dynamic_shared_memory_size,
stream_handle);
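
For illustration, a sketch of how the reshaped configuration looks from the caller's side (not part of the commit; the `dimensions` member and `make_launch_config()` appear in the hunks above, while the `operator!=` usage relies on the "not-equal operators" bullet in the commit message):

void configuration_sketch(cuda::kernel_t& kernel, cuda::grid::dimension_t num_blocks, cuda::grid::block_dimension_t threads_per_block)
{
	auto config_a = cuda::make_launch_config(num_blocks, threads_per_block);
	auto config_b = cuda::make_launch_config(kernel.min_grid_params_for_max_occupancy());

	// Grid and block geometry is now grouped in a grid::complete_dimensions_t member
	auto grid_dims  = config_a.dimensions.grid;
	auto block_dims = config_a.dimensions.block;
	(void) grid_dims; (void) block_dims;

	// The newly-added inequality comparison for launch configurations
	if (config_a != config_b) {
		// e.g. prefer the occupancy-derived configuration for the actual launch
	}
}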