From 090057f79c3efff54a966785bf244c4c86dfd870 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Fri, 22 Mar 2024 22:14:42 +0200 Subject: [PATCH] Regards #3: Additional doxygen comments, corrections and completions of existing comments, doxygen markup tweaks, and spacing tweaks --- src/cuda/api/array.hpp | 6 +- src/cuda/api/context.hpp | 22 ++- src/cuda/api/detail/unique_span.hpp | 1 - src/cuda/api/device.hpp | 11 +- src/cuda/api/launch_config_builder.hpp | 105 +++++++++- src/cuda/api/launch_configuration.hpp | 26 +++ src/cuda/api/link.hpp | 62 ++++-- src/cuda/api/link_options.hpp | 61 +++++- src/cuda/api/memory.hpp | 186 +++++++++++------- src/cuda/api/miscellany.hpp | 7 + src/cuda/api/module.hpp | 84 ++++++-- .../api/multi_wrapper_impls/unique_region.hpp | 16 ++ .../api/multi_wrapper_impls/unique_span.hpp | 2 +- src/cuda/api/pci_id.hpp | 2 + src/cuda/api/peer_to_peer.hpp | 32 ++- src/cuda/api/pointer.hpp | 2 +- src/cuda/api/primary_context.hpp | 10 + src/cuda/api/stream.hpp | 143 ++++++++++---- src/cuda/api/texture_view.hpp | 9 +- src/cuda/api/types.hpp | 18 +- src/cuda/api/unique_region.hpp | 57 ++++-- src/cuda/define_specifiers.hpp | 3 + src/cuda/rtc/compilation_output.hpp | 150 +++++++++----- src/cuda/rtc/error.hpp | 11 +- src/cuda/rtc/program.hpp | 15 +- src/cuda/rtc/types.hpp | 9 +- 26 files changed, 819 insertions(+), 231 deletions(-) diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 58a9ea49..4519cfd6 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -270,16 +270,20 @@ array_t wrap( return { device_id, context_handle, handle, dimensions }; } +/// Create a new (typed) CUDA array of the specified dimensions +///@{ +/// @param context ... in which the array is to be created template array_t create( const context_t& context, dimensions_t dimensions); +/// @param device ... in whose primary context the array is to be created template array_t create( const device_t& device, dimensions_t dimensions); - +///@} } // namespace array diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 1933d66e..ca97c5ea 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -210,6 +210,15 @@ inline context::flags_t get_flags(handle_t handle) } // namespace context +/** + * Waits for all previously-scheduled tasks on all streams (= queues) + * in a CUDA context to conclude, before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified context, the thread calling this method will either yield, + * spin or block until all tasks scheduled previously scheduled on streams + * within this context have concluded. + */ inline void synchronize(const context_t& context); /** @@ -745,10 +754,18 @@ inline handle_t create_and_push( /** * @brief creates a new context on a given device * - * @param device The device on which to create the new stream + * @param device + * The device which the new context will regard * @param sync_scheduling_policy + * Choice of how host threads are to perform synchronization with pending + * actions in streams within this context. See + * @ref host_thread_sync_scheduling_policy_t for a description of these + * choices. * @param keep_larger_local_mem_after_resize - * @return + * If true, larger allocations of global device memory, used by kernels + * requiring a larger amount of local memory, will be kept (so that future + * kernels with such requirements will not trigger a re-allocation). 
+ * * @note Until CUDA 11, there used to also be a flag for enabling/disabling * the ability of mapping pinned host memory to device addresses. However, it was * being ignored since CUDA 3.2 already, with the minimum CUDA version supported @@ -861,6 +878,7 @@ inline context_t get_with_fallback_push() } // namespace current +/// @return true if the context is the primary context of its device bool is_primary(const context_t& context); namespace detail_ { diff --git a/src/cuda/api/detail/unique_span.hpp b/src/cuda/api/detail/unique_span.hpp index 5f18f4b2..7db1e752 100644 --- a/src/cuda/api/detail/unique_span.hpp +++ b/src/cuda/api/detail/unique_span.hpp @@ -37,7 +37,6 @@ namespace cuda { * * @tparam T the type of individual elements in the unique_span */ - template> class unique_span : public ::cuda::span { public: // span types diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 4dfaf4ad..2fd91dd2 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -38,7 +38,7 @@ class pool_t; * @brief Waits for all previously-scheduled tasks on all streams (= queues) * on a specified device to conclude. * - * Depending on the host_thread_sync_scheduling_policy_t set for this + * Depending on the host_thread_sync_scheduling_policy_t set for the specified * device, the thread calling this method will either yield, spin or block * until all tasks scheduled previously scheduled on this device have been * concluded. @@ -604,11 +604,17 @@ class device_t { set_flags(other_flags | static_cast(new_policy)); } + /// @returns true if the device will keep larger amounts of global device memory allocated + /// for use as local memory, after a kernel was executed which required a larger-than-usual + /// allocation bool keeping_larger_local_mem_after_resize() const { return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; } + /// @brief Instructs the (primary context of) the device to keep larger amounts of global + /// device memory allocated for use as local memory, after a kernel was executed which + /// required a larger-than-usual allocation void keep_larger_local_mem_after_resize(bool keep = true) { auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX; @@ -616,6 +622,9 @@ class device_t { set_flags(new_flags); } + /// @brief Instructs the (primary context of) the device to discard allocations of larger + /// amounts of global device memory which were used by a kernel requiring a larger amount + /// of local memory, and has concluded execution. void dont_keep_larger_local_mem_after_resize() { keep_larger_local_mem_after_resize(false); diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp index d3faaf07..9899afe4 100644 --- a/src/cuda/api/launch_config_builder.hpp +++ b/src/cuda/api/launch_config_builder.hpp @@ -179,6 +179,8 @@ class launch_config_builder_t { } } + /// Use the information specified for the builder to figure out the grid and block + /// dimensions with which the kernel is to be launched grid::composite_dimensions_t get_composite_dimensions() const noexcept(false) { auto result = get_unvalidated_composite_dimensions(); @@ -189,6 +191,10 @@ class launch_config_builder_t { } public: + /// Use the information specified to the builder (and defaults for the unspecified + /// information) to finalize the construction of a kernel launch configuration, + /// which can then be passed along with the kernel to a kernel-launching function, + /// e.g. 
the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch launch_configuration_t build() const { auto result = launch_configuration_t{ get_composite_dimensions() }; @@ -392,6 +398,7 @@ class launch_config_builder_t { } + /// Set the dimensions for each block in the intended kernel launch grid launch_config_builder_t& block_dimensions( grid::block_dimension_t x, grid::block_dimension_t y = 1, @@ -400,8 +407,17 @@ class launch_config_builder_t { return block_dimensions(grid::block_dimensions_t{x, y, z}); } + /// Set the block in the intended kernel launch grid to be uni-dimensional + /// with a specified size launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); } + /** + * Set the intended kernel launch grid to have 1D blocks, of the maximum + * length possible given the information specified to the builder. + * + * @note This will fail if neither a kernel nor a device have been chosen + * for the launch. + */ launch_config_builder_t& use_maximum_linear_block() { grid::block_dimension_t max_size; @@ -424,6 +440,16 @@ class launch_config_builder_t { } #if CUDA_VERSION >= 12000 + /** + * Set the dimensions of multi-block clusters within the grid. + * + * @note There is only a small number of possible dimension combinations of clusters; + * and this function does _not_ guarantee to fail immediately if you specify an + * invalid such combination. + * + * @note This setting does not affect the overall dimensions of the grid in terms of + * blocks. + */ launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims) { #ifndef NDEBUG @@ -434,6 +460,9 @@ class launch_config_builder_t { } #endif + /// Set the dimension of the grid for the intended kernel launch, in terms + /// of blocks + ///@{ launch_config_builder_t& grid_dimensions(grid::dimensions_t dims) { #ifndef NDEBUG @@ -447,6 +476,7 @@ class launch_config_builder_t { return *this; } + ///@} launch_config_builder_t& grid_dimensions( grid::dimension_t x, grid::dimension_t y = 1, @@ -455,9 +485,17 @@ class launch_config_builder_t { return grid_dimensions(grid::dimensions_t{x, y, z}); } + /// Set the grid for the intended launch to be one-dimensional, with a specified number + /// of blocks + ///@{ launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); } launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); } + ///@} + + /// Set the overall number of _threads_, in each dimension, of all blocks + /// in the grid of the intended kernel launch + ///@{ launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims) { #ifndef NDEBUG @@ -474,16 +512,30 @@ class launch_config_builder_t { { return overall_dimensions(grid::overall_dimensions_t{x, y, z}); } + ///@} + /// Set the intended launch grid to be linear, with a specified overall number of _threads_ + /// over all (1D) blocks in the grid launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); } + /** + * Set whether or blocks may synchronize with each other or not + * + * @note recall that even "non-cooperative" blocks can still access the same global memory + * locations, and can use atomic operations on such locations for (slow) synchronization. 
+ */ launch_config_builder_t& block_cooperation(bool cooperation) { thread_block_cooperation = cooperation; return *this; } + /// Allow kernel thread blocks to synchronize with each other, rather than requiring them to act independently + /// (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); } + + /// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will + /// work entirely independently (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); } launch_config_builder_t& dynamic_shared_memory_size( @@ -493,11 +545,18 @@ class launch_config_builder_t { return *this; } + /// Indicate that the intended launch should not allocate any shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code. launch_config_builder_t& no_dynamic_shared_memory() { return dynamic_shared_memory_size(memory::shared::size_t(0)); } + /// Indicate that the intended launch should allocate a certain amount of shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code. + ///@{ launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size) { #ifndef NDEBUG @@ -512,13 +571,32 @@ class launch_config_builder_t { { return dynamic_shared_memory_size(size); } + ///@} + /** + * Indicate that the intended launch should allocate additional shared + * memory for the kernel to use beyond the static amount necessitated + * by its (compiled) code - with the amount to be determined based on + * the block size + * + * @param shared_mem_size_determiner a function determining the dynamic + * shared memory size given the kernel launch block size + */ launch_config_builder_t& dynamic_shared_memory( kernel::shared_memory_size_determiner_t shared_mem_size_determiner) { return dynamic_shared_memory_size(shared_mem_size_determiner); } + /** + * Indicate that the specified wrapped kernel will be the one launched + * with the configuration to be produced by this object. Such an indication + * provides this object with information about the device and context in + * which the kernel is to be launched, and ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Calling this method obviates a call to the @ref device() method. + */ launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr) { if (device_ and kernel_->device_id() != device_.value()) { @@ -533,6 +611,15 @@ class launch_config_builder_t { return *this; } + /** + * Indicate that the intended kernel launch would occur on (some stream in + * some context on) the specified device. Such an indication provides this + * object with some information regarding ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Do not call both this and the @ref kernel() method; prefer just that one.
+ */ + ///@{ launch_config_builder_t& device(const device::id_t device_id) { if (kernel_ and kernel_->device_id() != device_id) { @@ -548,7 +635,11 @@ class launch_config_builder_t { { return this->device(device.id()); } + ///@} + /// Clear the association with a specific kernel (which may have been + /// set using the @ref kernel method) + ///@{ launch_config_builder_t& kernel_independent() { kernel_ = nullptr; @@ -559,13 +650,14 @@ class launch_config_builder_t { kernel_ = nullptr; return *this; } + ///@} /** - * @brief THis will use information about the kernel, the already-set block size, + * @brief This will use information about the kernel, the already-set block size, * and the device to create a unidimensional grid of blocks to exactly saturate * the CUDA device's capacity for simultaneous active blocks. * - * @note This will _not_ set the block size - unlike + * @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}. */ launch_config_builder_t& saturate_with_active_blocks() { @@ -584,6 +676,14 @@ class launch_config_builder_t { return *this; } + /** + * @brief This will use information about the kernel and the device to define + * a minimum launch grid which should guarantee maximum occupancy of the GPU's + * multiprocessors. + * + * @note A builder after this call _will_ set the block dimensions - unlike + * {@ref saturate_with_active_blocks()} . + */ launch_config_builder_t& min_params_for_max_occupancy() { if (not (kernel_)) { @@ -600,6 +700,7 @@ class launch_config_builder_t { } }; // launch_config_builder_t +/// A slightly shorter-named construction idiom for @ref launch_config_builder_t inline launch_config_builder_t launch_config_builder() { return {}; } } // namespace cuda diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp index f891f8e0..e213e7ed 100644 --- a/src/cuda/api/launch_configuration.hpp +++ b/src/cuda/api/launch_configuration.hpp @@ -58,7 +58,16 @@ enum class cluster_scheduling_policy_t { }; #endif +/** + * The full set of possible configuration parameters for launching a kernel on a GPU. + * + * @note Consider using a @ref launch_configuration_builder_t to incrementally construct + * these structs. + * + * @note this structure must be constructed with at least the grid and block dimensions. + */ struct launch_configuration_t { + /// Dimensions of the launch grid in blocks, and of the individual blocks in the grid. grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } }; /** @@ -160,6 +169,20 @@ struct launch_configuration_t { constexpr launch_configuration_t(const launch_configuration_t&) = default; constexpr launch_configuration_t(launch_configuration_t&&) = default; + /** + * Constructors corresponding to the CUDA runtime API's triple-chevron launch + * syntax: + * + * my_kernel <<< grid_Dims, block_dims, dynamic_shmem_size, my_stream >>> ( + * arg1, arg2, arg3, etc); + * + * ... where the specified aspects of the launch configuration are the dimensions + * and the dynamic shared memory size. + * + * @note The choices of stream and kernel function are _not_ part of the launch + * configuration. 
+ */ + ///@{ constexpr launch_configuration_t( grid::composite_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_mem = 0u @@ -184,12 +207,14 @@ struct launch_configuration_t { grid::block_dimensions_t(block_dims), dynamic_shared_mem) { } + ///@} CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default; CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default; }; #if __cplusplus < 202002L +///@cond constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept { return @@ -210,6 +235,7 @@ constexpr bool operator!=(const launch_configuration_t lhs, const launch_configu { return not (lhs == rhs); } +///@endcond #endif namespace detail_ { diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index ef55571e..bd028477 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -20,14 +20,14 @@ namespace cuda { ///@cond class device_t; - class module_t; - class link_t; ///@endcond +/// Definitions related to CUDA linking-processes, captured by the @ref link_t wrapper class namespace link { +/// Kinds of images which can be used by the linker (some may require driver compilation work) enum class input_kind_t { cubin, /// Compiled device-class-specific device code ptx, /// PTX (microarchitecture-inspecific intermediate representation) @@ -36,9 +36,14 @@ enum class input_kind_t { library, /// An archive of objects files with embedded device code; a `.a` file }; +/// A raw CUDA driver handle for a linking-process using handle_t = CUlinkState; -// TODO: Check if the linking has been completed! +/** + * @brief Wrap an existing CUDA link-process in a @ref link_t wrapper class instance. + * + * @todo : Consider checking if the linking has already been completed! + */ inline link_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -48,21 +53,19 @@ inline link_t wrap( inline link_t create(const void *image, const link::options_t &options); -// TODO: Use a clase-class with C++17 of later, made up of the two classes here +/// Definitions relating to inputs to CUDA linking-processes namespace input { -/** - * A typed, named, image in memory which can be used as an input to a runtime - * CUDA linking process. - */ +/// A typed, named, image in memory which can be used as an input to a runtime CUDA linking-process struct image_t : memory::region_t { - const char *name; - link::input_kind_t type; + const char *name; /// Link images are attached a name when registered in a linking-process + link::input_kind_t type; /// type of contents found in the memory region }; +/// A typed, named, image in a file which can be used as an input to a runtime CUDA linking-process struct file_t { - const char *path; // TODO: Use a proper path in C++14 and later - link::input_kind_t type; + const char *path; + link::input_kind_t type; /// type of contents found in the file }; } // namespace input @@ -115,7 +118,16 @@ class link_t { return memory::region_t{cubin_output_start, cubin_output_size}; } - // TODO: Replace this with methods which take wrapper classes. + /** + * Add another linkable image, from memory, to this linking-process + * + * @param[in] image Memory region containing the image + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. 
+ */ void add(link::input::image_t image, const link::options_t &ptx_compilation_options = {}) const { auto marshalled_options = link::detail_::marshal(ptx_compilation_options); @@ -134,6 +146,17 @@ class link_t { + ::std::to_string(static_cast(image.type)) + " to a link."); } + /** + * Add another linkable image, from a file, to this linking-process + * + * @param[in] file_input Path of the image file to be added + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. + */ + ///@{ void add_file(link::input::file_t file_input, const link::options_t &options) const { auto marshalled_options = link::detail_::marshal(options); @@ -156,6 +179,7 @@ class link_t { return add_file(path.c_str(), file_contents_type); } #endif + ///@} protected: // constructors @@ -219,6 +243,7 @@ class link_t { namespace link { +/// Create a new link-process (before adding any compiled images or or image-files) inline link_t create(const link::options_t &options = link::options_t{}) { handle_t new_link_handle; @@ -241,13 +266,12 @@ inline link_t create(const link::options_t &options = link::options_t{}) do_take_ownership); } -// TODO: Check if the linking has been completed! inline link_t wrap( - device::id_t device_id, - context::handle_t context_handle, - link::handle_t handle, - const link::options_t &options, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + link::handle_t handle, + const link::options_t & options, + bool take_ownership) noexcept { return link_t{device_id, context_handle, handle, options, take_ownership}; } diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp index a2e17d59..52d98206 100644 --- a/src/cuda/api/link_options.hpp +++ b/src/cuda/api/link_options.hpp @@ -1,7 +1,8 @@ /** * @file * - * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + * @brief Definitions and utility functions relating to just-in-time compilation, assembly + * and linking of CUDA code. */ #pragma once #ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ @@ -21,17 +22,37 @@ class module_t; namespace link { +/// Possible strategies for obtaining fully-compiled binary code for a target device +/// when it is not immediately available. enum fallback_strategy_for_binary_code_t { + /// Prefer compiling available PTX code to produce fully-compiled binary code prefer_compiling_ptx = 0, + /// Prefer using existing fully-compiled (binary) code, for a compatible but + /// not identical target device prefer_using_compatible_binary = 1, }; namespace detail_ { +/// The CUDA driver's raw generic JIT-related option type using option_t = CUjit_option; +/** + * Mechanism for finalizing options into a format readily usable by the + * link_t wrapper (and by the `cuLink`- functions - but highly inconvenient + * for inspection and modification. + * + * @note Don't create these yourself unless you have to; use @ref options_t + * instead, and @ref options_t::marshal() when done, for completing the + * linking-process. If you must create them - use `push_back()` method + * repeatedly until done with all options. 
+ */ struct marshalled_options_t { + /// The CUDA driver's expected type for number of link-related options using size_type = unsigned; + + /// The CUDA driver's enum for option identification has this many values - + /// and thus, there is need for no more than this many marshalled options constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; protected: @@ -39,8 +60,6 @@ struct marshalled_options_t { ::std::array value_buffer; size_type count_ { 0 }; public: - size_type count() const { return count_; } - void push_back(option_t option) { if (count_ >= max_num_options) { @@ -76,7 +95,12 @@ struct marshalled_options_t { } public: - + /** + * This method (alone) is used to populate this structure. + * + * @note The class is not a standard container, and this method cannot be + * reversed or undone, i.e. there is no `pop_back()` or `pop()`. + */ template void push_back(option_t option, T value) { @@ -85,25 +109,46 @@ struct marshalled_options_t { // Now set value_buffer[count-1]... value_buffer[count_-1] = process_value(value); } + + /// These three methods yield what the CUDA driver actually expects: + /// Two matching raw buffers and their count of elements + ///@{ const option_t* options() const { return option_buffer.data(); } const void * const * values() const { return value_buffer.data(); } + size_type count() const { return count_; } + ///@} }; } // namespace detail_ +/** + * A convenience class for holding, setting and inspecting options for a CUDA binary code + * linking process - which may also involve PTX compilation. + * + * @note This structure does not let you set those options which the CUDA driver documentation + * describes as having internal purposes only. + */ struct options_t final : public rtc::common_ptx_compilation_options_t { + /// options related to logging the link-process struct { + /// Non-error information regarding the logging process (i.e. its "standard output" stream) optional> info; + + /// Information regarding errors in the logging process (i.e. its "standard error" stream) optional> error; + + /// Control whether the info and error logging will be verbose bool verbose; } logs; - // Note: When this is true, the specific_target of the base class - // is overridden + /// Instead of using explicitly-specified binary target, from + /// @ref common_ptx_compilation_options_t::specific_target - use the device of the current CUDA + /// context as the target for binary generation bool obtain_target_from_cuda_context { true }; - /// fallback behavior if a (matching cubin???) is not found + /// Possible strategy for obtaining fully-compiled binary code when it is not + /// simply available in the input to the link-process optional fallback_strategy_for_binary_code; // Ignoring the "internal purposes only" options; @@ -118,6 +163,8 @@ struct options_t final : public rtc::common_ptx_compilation_options_t { namespace detail_ { +/// Construct a easily-driver-usable link-process options structure from +/// a more user-friendly `options_t` structure. inline marshalled_options_t marshal(const options_t& link_options) { marshalled_options_t marshalled{}; diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index ce72ecae..24f286a5 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -85,7 +85,7 @@ enum cpu_write_combining : bool { }; /** - * @brief options accepted by CUDA's allocator of memory with a host-side aspect + * options accepted by CUDA's allocator of memory with a host-side aspect * (host-only or managed memory). 
*/ struct allocation_options { @@ -106,6 +106,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) /** * @namespace mapped + * * Memory regions appearing in both on the host-side and device-side address * spaces with the regions in both spaces mapped to each other (i.e. guaranteed * to have the same contents on access up to synchronization details). Consult the @@ -124,7 +125,7 @@ struct span_pair_t { }; /** - * @brief A pair of memory regions, one in system (=host) memory and one on a + * A pair of memory regions, one in system (=host) memory and one on a * CUDA device's memory - mapped to each other * * @note this is the mapped-pair equivalent of a `void *`; it is not a @@ -143,9 +144,7 @@ struct region_pair_t { } // namespace mapped -/** - * @brief CUDA-Device-global memory on a single device (not accessible from the host) - */ +///CUDA-Device-global memory on a single device (not accessible from the host) namespace device { namespace detail_ { @@ -325,7 +324,7 @@ struct deleter { /** - * @brief Sets consecutive elements of a region of memory to a fixed + * Sets consecutive elements of a region of memory to a fixed * value of some width * * @note A generalization of `set()`, for different-size units. @@ -339,7 +338,7 @@ template void typed_set(T* start, const T& value, size_t num_elements); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset for CUDA device-side memory * @@ -366,7 +365,7 @@ inline void set(region_t region, int byte_value) ///@} /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -389,7 +388,7 @@ inline void zero(region_t region) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a value of a certain type, in a CUDA device's * global memory @@ -558,7 +557,7 @@ inline void copy(region_t destination, void* source) ///@} /** - * @brief Sets a number of bytes in memory to a fixed value + * Sets a number of bytes in memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -571,7 +570,7 @@ inline void copy(region_t destination, void* source) void set(void* ptr, int byte_value, size_t num_bytes); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -586,7 +585,7 @@ inline void set(region_t region, int byte_value) } /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -610,7 +609,7 @@ inline void zero(void* ptr, size_t num_bytes) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a single element of a certain type, which may * be in host-side memory, global CUDA-device-side memory or CUDA-managed @@ -662,7 +661,7 @@ status_t multidim_copy(copy_parameters_t params) } // namespace detail_ /** - * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * An almost-generalized-case memory copy, taking a rather complex structure of * copy parameters - wrapping the CUDA driver's own most-generalized-case copy * * @tparam 
NumDimensions The number of dimensions of the parameter structure. @@ -1216,8 +1215,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han /** - * @brief Sets consecutive elements of a region of memory to a fixed - * value of some width + * Sets consecutive elements of a region of memory to a fixed value of some width * * @note A generalization of `async::set()`, for different-size units. * @@ -1263,7 +1261,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s void zero(void* start, size_t num_bytes, const stream_t& stream); /** - * @brief Asynchronously sets all bytes of a single pointed-to value + * Asynchronously sets all bytes of a single pointed-to value * to 0 (zero). * * @note asynchronous version of @ref memory::zero(T*) @@ -1440,6 +1438,7 @@ inline void copy( /** * @namespace host + * * Host-side (= system) memory which is "pinned", i.e. resides in * a fixed physical location - and allocated by the CUDA driver. */ @@ -1459,7 +1458,7 @@ inline region_t allocate( /** - * allocate pinned host memory + * Allocates pinned host memory * * @note "pinned" memory is allocated in contiguous physical ram * addresses, making it possible to copy to and from it to the the @@ -1469,25 +1468,17 @@ inline region_t allocate( * * @throws cuda::runtime_error if allocation fails for any reason * - * @param context * @param size_in_bytes the amount of memory to allocate, in bytes * @param options * options to pass to the cuda host-side memory allocator; see * {@ref memory::allocation_options}. * @return a pointer to the allocated stretch of memory */ -///@{ - -inline region_t allocate( - const context_t& context, - size_t size_in_bytes, - allocation_options options); - -region_t allocate( - size_t size_in_bytes, - allocation_options options); +region_t allocate(size_t size_in_bytes, allocation_options options); /** + * @copydoc allocate(size_t, allocation_options) + * * @param portability * whether or not the allocated region can be used in different * CUDA contexts. @@ -1504,16 +1495,18 @@ inline region_t allocate( return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } ); } +/// @copydoc allocate(size_t, portability_across_contexts, cpu_write_combining) inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) { return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); } -///@} - /** - * Free a region of pinned host memory which was allocated with one of the pinned host + * Frees a region of pinned host memory which was allocated with one of the pinned host * memory allocation functions. + * + * @note The address provided must be the _beginning_ of the region of allocated memory; + * and the entire region is freed (i.e. the region size is known to/determined by the driver) */ inline void free(void* host_ptr) { @@ -1526,6 +1519,11 @@ inline void free(void* host_ptr) throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } +/** + * @copybrief free(void*) + * + * @param region The region of memory to free + */ inline void free(region_t region) { return free(region.data()); } namespace detail_ { @@ -1537,9 +1535,8 @@ struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } }; - /** - * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. 
+ * Makes a pre-allocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to @@ -1669,7 +1666,7 @@ inline void deregister(const_region_t region) } /** - * @brief Sets all bytes in a stretch of host-side memory to a single value + * Sets all bytes in a stretch of host-side memory to a single value * * @note a wrapper for @ref ::std::memset * @@ -1703,22 +1700,6 @@ inline void zero(T* ptr) } // namespace host -/** - * This type of memory, also known as _unified_ memory, appears within - * a unified, all-system address space - and is used with the same - * address range on the host and on all relevant CUDA devices on a - * system. It is paged, so that it may exceed the physical size of - * a CUDA device's global memory. The CUDA driver takes care of - * "swapping" pages "out" from a device to host memory or "swapping" - * them back "in", as well as of propagation of changes between - * devices and host-memory. - * - * @note For more details, see - * - * Unified Memory for CUDA Beginners on the - * Parallel4All blog. - * - */ namespace managed { namespace detail_ { @@ -1899,7 +1880,7 @@ inline region_t allocate( } // namespace detail_ /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices. * * @param context the initial context which is likely to access the managed @@ -1917,7 +1898,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed @@ -1935,7 +1916,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on all CUDA devices. * * @note While the allocated memory should be available universally, the @@ -2007,7 +1988,7 @@ inline void prefetch( } // namespace detail_ /** - * @brief Prefetches a region of managed memory to a specific device, so + * Prefetches a region of managed memory to a specific device, so * it can later be used there without waiting for I/O from the host or other * devices. */ @@ -2017,7 +1998,7 @@ void prefetch( const stream_t& stream); /** - * @brief Prefetches a region of managed memory into host memory. It can + * Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ void prefetch_to_host( @@ -2194,6 +2175,7 @@ inline bool is_part_of_a_region_pair(const void* ptr) namespace device { +/// A unique span of device-global memory template using unique_span = cuda::unique_span; @@ -2212,25 +2194,39 @@ unique_span make_unique_span(const context::handle_t context_handle, size_t s } // namespace detail_ /** - * @brief Create a variant of ::std::unique_pointer for an array in - * device-global memory. + * Allocate memory for a consecutive sequence of typed elements in device-global memory. 
* - * @note CUDA's runtime API always has a current device; but - - * there is not necessary a current context; so a primary context - * for a device may be created through this call. + * @tparam T type of the individual elements in the allocated sequence * - * @tparam T an array type; _not_ the type of individual elements + * @param context The CUDA device context in which to make the allocation. + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said * - * @param context The CUDA device context in which to make the - * allocation. - * @param num_elements the number of elements to allocate + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. * - * @return an ::std::unique_ptr pointing to the constructed T array -*/ + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user beware. + */ template unique_span make_unique_span(const context_t& context, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span(const device_t& device, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span(size_t size); @@ -2243,7 +2239,7 @@ inline device::unique_span make_unique_span(const context_t& context, size_t return device::make_unique_span(context, num_elements); } -/// See @ref `device::make_unique_span(const device_t& device, size_t num_elements)` +/// See @ref `device::make_unique_span(const context_t& context, size_t num_elements)` template inline device::unique_span make_unique_span(const device_t& device, size_t num_elements) { @@ -2252,9 +2248,29 @@ inline device::unique_span make_unique_span(const device_t& device, size_t nu namespace host { +/// A unique span of CUDA-driver-allocated, pinned host (=system) memory template using unique_span = cuda::unique_span; +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. 
+ */ template unique_span make_unique_span(size_t size) { auto allocated = allocate(size * sizeof(T)); @@ -2265,6 +2281,7 @@ unique_span make_unique_span(size_t size) namespace managed { +/// A unique span of CUDA-driver-allocated managed memory template using unique_span = cuda::unique_span; @@ -2282,16 +2299,48 @@ unique_span make_unique_span( } // namespace detail_ +/** + * Allocate memory for a consecutive sequence of typed elements in managed + * memory, accessible with the same address on the host and on CUDA devices. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std::make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware. + */ template unique_span make_unique_span( const context_t& context, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span( const device_t& device, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span( size_t size, @@ -2302,6 +2351,7 @@ unique_span make_unique_span( } // namespace memory namespace symbol { + /** * Locates a CUDA symbol in global or constant device memory * diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 6d04f547..1913ac0d 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,6 +30,13 @@ inline void initialize_driver() throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } +/** + * A mechanism for ensuring a @ref cuInit() call has been made, to use before making + * any other driver API calls. + * + * @note differs from simply calling `initialize_driver()` in that repeated calls + * from the same thread will avoid additional @ref cuInit() calls. + */ inline void ensure_driver_is_initialized() { thread_local bool driver_known_to_be_initialized{false}; diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index f69315e4..b06d694c 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -30,10 +30,13 @@ class kernel_t; namespace module { +/// The CUDA driver's raw handle for modules using handle_t = CUmodule; namespace detail_ { +/// Construct a module proxy object - for an existing module - from the class' +/// constituent fields inline module_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -70,6 +73,8 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i * be loaded (and in which the module contents may be used) * @param[in] module_data the opaque, raw binary data for the module - in a contiguous container * such as a span, a cuda::unique_span etc..
+ * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ ///@{ template = 201703L +/// @copydoc load_from_file(device_t, const char*) inline module_t load_from_file( const device_t& device, const ::std::filesystem::path& path) @@ -298,6 +337,7 @@ inline module_t load_from_file( return load_from_file(device, path.c_str()); } +/// @copydoc load_from_file(const char*) inline module_t load_from_file( const ::std::filesystem::path& path) { @@ -320,23 +360,19 @@ inline module_t wrap( return module_t{device_id, context_handle, module_handle, take_ownership, hold_pc_refcount_unit}; } -/* -template -module_t create(const context_t& context, const void* module_data, Creator creator_function); -*/ - /** * Creates a new module in a context using raw compiled code * * @param context The module will exist within this GPU context, i.e. the globals (functions, * variable) of the module would be usable within that constant. * @param module_data The raw compiled code for the module. - * @param link_options Potential options for the PTX compilation and device linking of the code. + * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ -///@{ module_t create(const context_t& context, const void* module_data, const link::options_t& link_options); + +/// @copydoc create(const context_t&, const void*, const link::options_t&) module_t create(const context_t& context, const void* module_data); -///@} inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id) { @@ -362,7 +398,17 @@ inline device::primary_context_t get_context_for(device_t& locus); } // namespace detail_ -// Note: The following may create the primary context of a device! +/** + * Create a new module - in a specified context or in a device's primary context, + * using raw module data in memory. + * + * @tparam Locus Either a @ref cuda::device_t or a {@ref cuda::context_t}. + * @tparam ContiguousContainer A span, a vector, a unique_span, or similar type + * @param locus Where the new module should be created + * @param module_data The raw data for the module in locus-accessible memory. + * + * @note This function may create/allocate resources for the primary context of a device! + */ template ::value, bool>> module_t create( @@ -373,6 +419,14 @@ module_t create( return detail_::create(context, module_data.data()); } +/** + * @copydoc create(Locus&&, ContiguousContainer) + * + * @param link_options Options for PTX compilation and for linking the module data, + * eventually. + * + * @return + */ // Note: The following may create the primary context of a device! template ::value, bool>> diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp index 34fb9c5f..3416def8 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_region.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -72,6 +72,14 @@ inline unique_region make_unique_region(size_t num_bytes) namespace managed { +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
+ * + * @param context the initial context which is likely to access the allocated memory region + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, @@ -81,6 +89,14 @@ inline unique_region make_unique_region( return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; } +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @param device the initial device which is likely to access the allocated memory region + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, diff --git a/src/cuda/api/multi_wrapper_impls/unique_span.hpp b/src/cuda/api/multi_wrapper_impls/unique_span.hpp index c2907c63..0c0ee18b 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_span.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_span.hpp @@ -28,7 +28,7 @@ unique_span make_unique_span(const context_t& context, size_t num_elements) } /** - * @brief Create a variant of ::std::unique_pointer for an array in + * @brief Allocate (but do not initialize) an array in * device-global memory * * @tparam T an array type; _not_ the type of individual elements diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp index 8c8e56ae..9518e941 100644 --- a/src/cuda/api/pci_id.hpp +++ b/src/cuda/api/pci_id.hpp @@ -53,6 +53,8 @@ struct pci_location_t { * and any of them can be used. */ static pci_location_t parse(const ::std::string& id_str); + + /// @copydoc parse(const ::std::string& id_str) static pci_location_t parse(const char* id_str); }; diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index 68c718b1..cc7c3f93 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -14,17 +14,31 @@ namespace cuda { namespace device { +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among peer CUDA GPU devices + * on the same system.
+ */ namespace peer_to_peer { -// Aliases for all CUDA device attributes +/// Aliases for CUDA driver GPU attribute codes +///@{ + +/// A relative value indicating the performance of the link between two devices +constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; + +/// 1 if access is supported, 0 otherwise +constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + +/// 1 if the first device can perform native atomic operations on the second device, 0 otherwise +constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; -constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices -constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise -constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise #if CUDA_VERSION >= 10000 -constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array iterpolatory access operations are supported across the link, 0 otherwise +/// 1 if special array interpolatory access operations are supported across the link, 0 otherwise +constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; #endif - +///@} namespace detail_ { /** @@ -94,6 +108,12 @@ void disable_access_to(const context_t &peer_context); } // namespace current +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among "peer" contexts on + * the same system, which may possibly regard different CUDA devices. + */ namespace peer_to_peer { namespace detail_ { diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp index 2e573b72..1688b8d3 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -120,7 +120,7 @@ inline memory::type_t type_of(const void* ptr) /// Obtain (a non-owning wrapper for) the CUDA context with which a memory address is associated /// (e.g. being the result of an allocation or mapping in that context) -inline context_t context_of(const void* ptr); +context_t context_of(void const* ptr); /** * A convenience wrapper around a raw pointer "known" to the CUDA runtime diff --git a/src/cuda/api/primary_context.hpp b/src/cuda/api/primary_context.hpp index db59acc5..4bbaff92 100644 --- a/src/cuda/api/primary_context.hpp +++ b/src/cuda/api/primary_context.hpp @@ -75,6 +75,12 @@ inline void increase_refcount(device::id_t device_id) } // namespace detail_ +/** + * @returns true if the device's primary context is active (i.e. has resources allocated for it), + * which implies we are holding a refcount unit for it somewhere. + * + * @note recall a primary context being active does not mean that it is the _current_ context + */ inline bool is_active(const device_t& device); /** @@ -140,6 +146,9 @@ class primary_context_t : public context_t { public: + /// @return a stream object for the default-ID stream of the device, which + /// is pre-created and on which actions are scheduled when the runtime API + /// is used and no stream is specified. 
stream_t default_stream() const noexcept; public: // friendship @@ -280,6 +289,7 @@ inline bool is_current(device::id_t device_id) } // namespace detail +/// @return true if the current context is its device's primary context inline bool is_current() { auto device_id = context::current::detail_::get_device_id(); diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp index e9896a9a..ab57b314 100644 --- a/src/cuda/api/stream.hpp +++ b/src/cuda/api/stream.hpp @@ -50,6 +50,11 @@ enum : bool { nonblocking = async, }; +/** + * Kinds of conditions to apply to a value in GPU global memory + * when waiting on that value, i.e. on what condition to stop + * waiting. + */ enum wait_condition_t : unsigned { greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ, geq = CU_STREAM_WAIT_VALUE_GEQ, @@ -173,6 +178,9 @@ void enqueue_function_call(const stream_t& stream, Function function, void * arg * the current context and outlasting it. When set to `true`, * the proxy class will act as it does usually, destroying the stream * when being destructed itself. + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist * @return an instance of the stream proxy class, with the specified * device-stream combination. */ @@ -255,6 +263,8 @@ class stream_t { return flags & CU_STREAM_NON_BLOCKING; } + /// @returns the execution priority of tasks on this stream (relative to other + /// tasks in other streams on the same device) stream::priority_t priority() const { int the_priority; @@ -321,8 +331,21 @@ class stream_t { const stream_t& associated_stream; public: + ///@cond enqueue_t(const stream_t& stream) : associated_stream(stream) {} + ///@endcond + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel_function A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param parameters the arguments to be passed to the kernel for this launch + * + * @note This function is cognizant of the types of all arguments passed to it; + * for a type-erased version, see @ref type_erased_kernel_launch() + */ template void kernel_launch( const KernelFunction& kernel_function, @@ -336,6 +359,19 @@ class stream_t { ::std::forward(parameters)...); } + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param marshalled_arguments Pointers to arguments to be passed to the kernel + * for this launch + * + * @note This signature does not require any type information regarding the kernel + * function type; see @ref kernel_launch() for a type-observing version of the + * same scheduling operation. + */ void type_erased_kernel_launch( const kernel_t& kernel, launch_configuration_t launch_configuration, @@ -345,21 +381,14 @@ class stream_t { } /** - * Have the CUDA device perform an I/O operation between two specified - * memory regions (on or off the actual device) + * Copy operations * + * The source and destination memory regions may be anywhere the CUDA driver can + * map (e.g. the device's global memory, host/system memory, the global memory of + * another device, constant memory etc.) */ - ///@{ - /** - * @param destination destination region into which to copy.
May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param source destination region from which to copy. May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param num_bytes size of the region to copy - **/ + /// Schedule a copy of one region of memory to another void copy(void *destination, const void *source, size_t num_bytes) const { // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy; @@ -368,6 +397,7 @@ class stream_t { memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source, size_t num_bytes) const { #ifndef NDEBUG @@ -378,16 +408,23 @@ class stream_t { copy(destination, source.start(), num_bytes); } + /** + * @copybrief copy(void *, const void *, size_t) const + * + * @note @p num_bytes may be smaller than the sizes of any of the regions + */ void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const { copy(destination.start(), source, num_bytes); } + /// @copybrief copy(void *, const void *, size_t) const void copy(memory::region_t destination, memory::const_region_t source) const { copy(destination, source, source.size()); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source) const { copy(destination, source, source.size()); @@ -398,9 +435,9 @@ class stream_t { * Set all bytes of a certain region in device memory (or unified memory, * but using the CUDA device to do it) to a single fixed value. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param byte_value the value with which to fill the memory region bytes - * @param num_bytes size of the region to fill + * @param num_bytes size in bytes of the region to fill */ void memset(void *start, int byte_value, size_t num_bytes) const { @@ -409,6 +446,7 @@ class stream_t { memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); } + /// @copydoc memset(void *, int, size_t) const void memset(memory::region_t region, int byte_value) const { memset(region.data(), byte_value, region.size()); @@ -422,7 +460,7 @@ class stream_t { * API call for setting to zero; does that mean there are special facilities * for zero'ing memory faster? Who knows. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param num_bytes size of the region to fill */ void memzero(void *start, size_t num_bytes) const @@ -431,6 +469,9 @@ class stream_t { memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_); } + /** + * @copydoc memzero(void *, size_t) const + */ void memzero(memory::region_t region) const { memzero(region.data(), region.size()); @@ -492,6 +533,7 @@ class stream_t { } public: + /// Enqueues a host-invokable object, typically a function or closure object call. template void host_invokable(Invokable& invokable) const { @@ -529,13 +571,16 @@ class stream_t { memory::device::async::free(associated_stream, region); } #endif - ///@{ /** * Sets the attachment of a region of managed memory (i.e. 
in the address space visible * on all CUDA devices and the host) in one of several supported attachment modes. * - * The attachmentis actually a commitment vis-a-vis the CUDA driver and the GPU itself + * @param managed_region_start a pointer to the beginning of the managed memory region. + * This cannot be a pointer to anywhere in the middle of an allocated region - you must + * pass whatever @ref cuda::memory::managed::allocate() returned. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself * that it doesn't need to worry about accesses to this memory from devices other than * its object of attachment, so that the driver can optimize scheduling accordingly. * @@ -547,12 +592,6 @@ class stream_t { * the attachment goes into effect (some time after) previous scheduled actions have * concluded. */ - ///@{ - /** - * @param managed_region_start a pointer to the beginning of the managed memory region. - * This cannot be a pointer to anywhere in the middle of an allocated region - you must - * pass whatever @ref cuda::memory::managed::allocate() returned. - */ void attach_managed_region( const void* managed_region_start, memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const @@ -572,8 +611,23 @@ class stream_t { } /** - * @param region the managed memory region to attach; it cannot be a sub-region - - * you must pass whatever @ref cuda::memory::managed::allocate() returned. + * @copybrief attach_managed_region(const void*, memory::managed::attachment_t) const + * + * @param region the entire managed memory region; note this must not be a sub-region; + * you must pass whatever the CUDA memory allocation or construction code provided + * you with, in full. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself + * that it doesn't need to worry about accesses to this memory from devices other than + * its object of attachment, so that the driver can optimize scheduling accordingly. + * + * @note by default, the memory region is attached to this specific stream on its + * specific device. In this case, the host will be allowed to read from this memory + * region whenever no kernels are pending on this stream. + * + * @note Attachment happens asynchronously, as an operation on this stream, i.e. + * the attachment goes into effect (some time after) previous scheduled actions have + * concluded. */ void attach_managed_region( memory::region_t region, @@ -581,8 +635,6 @@ class stream_t { { attach_managed_region(region.start(), attachment); } - ///@} - /** * Will pause all further activity on the stream until the specified event has @@ -862,16 +914,21 @@ class stream_t { // it must release its refcount unit on destruction public: // data members - which only exist in lieu of namespaces + + /// This data member is a gadget for use instead of a "class-local" namespace; + /// we do not need it as a distinct object const enqueue_t enqueue { *this }; // The use of *this here is safe, since enqueue_t doesn't do anything with it // on its own. Any use of enqueue only happens through, well, *this - and // after construction. 
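+	// An illustrative usage sketch (not part of the wrapper code itself): assuming a valid
+	// stream_t `stream`, a kernel wrapper `kern`, a launch_configuration_t `config`, an
+	// argument `some_arg`, and two equal-sized device memory regions `dst` and `src`, the
+	// enqueue_t members above might typically be used along these lines:
+	//
+	//     stream.enqueue.kernel_launch(kern, config, some_arg); // typed launch
+	//     stream.enqueue.copy(dst, src);      // asynchronous copy of the full region
+	//     stream.enqueue.memset(dst, 0xff);   // fill the region with a byte value
+	//     stream.enqueue.memzero(dst);        // zero the region
+	//     cuda::synchronize(stream);          // wait for all of the above to conclude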
}; +///@cond inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept { return not (lhs == rhs); } +///@endcond namespace stream { @@ -972,16 +1029,7 @@ void enqueue_function_call(const stream_t& stream, Function function, void* argu * for execution scheduling; lower numbers represent higher properties; * each device has a range of priorities, which can be obtained using * @ref device_t::stream_priority_range() - * @param hold_pc_refcount_unit when the event's context is a device's primary - * context, this controls whether that context must be kept active while the - * event continues to exist - * @return The newly-created stream - */ -///@{ - -/** - * @brief Create a new stream (= queue) in the primary execution context - * of a CUDA device. + * @return The newly-created stream */ stream_t create( const device_t& device, @@ -992,6 +1040,16 @@ stream_t create( * @brief Create a new stream (= queue) in a CUDA execution context. * * @param context the execution context in which to create the stream + * @param synchronizes_with_default_stream if true, no work on this stream + * will execute concurrently with work from the default stream (stream 0) + * @param priority priority of tasks on the stream, relative to other streams, + * for execution scheduling; lower numbers represent higher priorities; + * each device has a range of priorities, which can be obtained using + * @ref device_t::stream_priority_range() + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist + * @return The newly-created stream */ stream_t create( const context_t& context, @@ -1002,7 +1060,16 @@ stream_t create( } // namespace stream -inline void synchronize(const stream_t& stream) +/** + * Waits for all previously-scheduled tasks on a given stream to conclude, + * before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified stream, the thread calling this method will either yield, + * spin or block until all tasks previously scheduled on the + * stream have concluded. 
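+ *
+ * For example - as a sketch only, assuming a valid @ref device_t named `dev`, a kernel
+ * wrapper `kern`, a launch configuration `config` and an argument `some_arg` - one
+ * might write:
+ * @code
+ * auto stream = cuda::stream::create(dev, cuda::stream::async);
+ * stream.enqueue.kernel_launch(kern, config, some_arg);
+ * cuda::synchronize(stream); // returns only after the launched kernel has concluded
+ * @endcode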
+ */ + inline void synchronize(const stream_t& stream) { // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to, // and not have trouble acting on a stream in another context - it balks at doing so under diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp index c4456948..34a31a47 100644 --- a/src/cuda/api/texture_view.hpp +++ b/src/cuda/api/texture_view.hpp @@ -21,6 +21,7 @@ class texture_view; namespace texture { +/// The CUDA driver's raw, opaque handle for texture objects using raw_handle_t = CUtexObject; /** @@ -111,7 +112,6 @@ class texture_view { other.owning_ = false; }; - template texture_view( const cuda::array_t& arr, @@ -160,7 +160,10 @@ class texture_view { public: // non-mutating getters + /// @returns A non-owning proxy object for the CUDA context in which this texture is defined context_t context() const; + + /// @returns A non-owning proxy object for the CUDA device on which this texture is defined device_t device() const; public: // friendship @@ -174,7 +177,7 @@ class texture_view { bool owning_; }; - +///@cond inline bool operator==(const texture_view& lhs, const texture_view& rhs) noexcept { return lhs.raw_handle() == rhs.raw_handle(); @@ -184,7 +187,7 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep { return lhs.raw_handle() != rhs.raw_handle(); } - +///@nocond namespace texture { inline texture_view wrap( diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index c709db6d..21c3ae33 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -573,6 +573,22 @@ using size_t = unsigned; } // namespace shared +/** + * This type of memory, also known as _unified_ memory, appears within + * a unified, all-system address space - and is used with the same + * address range on the host and on all relevant CUDA devices on a + * system. It is paged, so that it may exceed the physical size of + * a CUDA device's global memory. The CUDA driver takes care of + * "swapping" pages "out" from a device to host memory or "swapping" + * them back "in", as well as of propagation of changes between + * devices and host-memory. + * + * @note For more details, see + * + * Unified Memory for CUDA Beginners on the + * Parallel4All blog. + * + */ namespace managed { enum class initial_visibility_t { @@ -706,7 +722,7 @@ using flags_t = unsigned; /** * Scheduling policies the Runtime API may use when the host-side * thread it is running in needs to wait for results from a certain - * device + * device or context. */ enum host_thread_sync_scheduling_policy_t : unsigned int { diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp index d054217f..a9b65a99 100644 --- a/src/cuda/api/unique_region.hpp +++ b/src/cuda/api/unique_region.hpp @@ -151,6 +151,7 @@ class unique_region : public region_t { namespace device { +/// A unique region of device-global memory using unique_region = memory::unique_region; namespace detail_ { @@ -164,20 +165,30 @@ inline unique_region make_unique_region(const context::handle_t context_handle, } // namespace detail_ /** - * @brief Allocate an array in device-global memory and return an owning class for it + * @brief Allocate a region in device-global memory * - * @param num_bytes the size in bytes of the allocated region - */ -///@{ -/** - * @param device The CUDA device in whose global memory to make the allocation. 
+ * @param context The context within which (and in the device global memory + * of which) to make the allocation + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const context_t& context, size_t num_bytes); + /** - * @param context The CUDA context in which to make the allocation. + * @brief Allocate a region in device-global memory + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a region in device-global memory within the primary context + * of the current CUDA device + * + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region + */ unique_region make_unique_region(size_t num_bytes); ///}@ @@ -198,19 +209,21 @@ inline device::unique_region make_unique_region(const device_t& device, size_t n namespace host { +/// A unique region of pinned host memory using unique_region = memory::unique_region; -inline unique_region make_unique_region( - const context_t& context, - size_t num_bytes, - allocation_options options = allocation_options{}); -inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a physical-address-pinned region of system memory + * + * @returns An owning RAII/CADRe object for the allocated memory region + */ inline unique_region make_unique_region(size_t num_bytes); } // namespace host namespace managed { +/// A unique region of managed memory, see @ref cuda::memory::managed using unique_region = memory::unique_region; namespace detail_ { @@ -226,14 +239,34 @@ inline unique_region make_unique_region( } // namespace detail_ +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param context A context, to set when allocating the memory region, for whatever + * association effect that may have. + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param device A device, whose primary context will be current when allocating + * the memory region, for whatever association effect that may have. + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
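+ *
+ * As a usage sketch only (error handling omitted), one might allocate and touch
+ * such a region as follows:
+ * @code
+ * auto region = cuda::memory::managed::make_unique_region(1024 * sizeof(float));
+ * auto* data = static_cast<float*>(region.data());
+ * data[0] = 1.0f; // directly accessible from host code as well as from kernels
+ * @endcode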
+ * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( size_t num_bytes); diff --git a/src/cuda/define_specifiers.hpp b/src/cuda/define_specifiers.hpp index a907ea11..4234041b 100644 --- a/src/cuda/define_specifiers.hpp +++ b/src/cuda/define_specifiers.hpp @@ -8,6 +8,8 @@ #ifdef __CUDACC__ +/// Shorthands for CUDA-specific function declaration decorations +///@{ #ifndef CUDA_FD #define CUDA_FD __forceinline__ __device__ #endif @@ -57,5 +59,6 @@ #ifndef CUDA_H #define CUDA_H #endif +///@} #endif // __CUDACC__ diff --git a/src/cuda/rtc/compilation_output.hpp b/src/cuda/rtc/compilation_output.hpp index 45e457c5..c95359bf 100644 --- a/src/cuda/rtc/compilation_output.hpp +++ b/src/cuda/rtc/compilation_output.hpp @@ -23,8 +23,18 @@ class context_t; ///@endcond namespace rtc { + +/** + * The output produced by a compilation process by one of the CUDA libraries, + * including any byproducts. + * + * @tparam Kind Which language was compiled to produce the result + * + * @note A failed compilation is also a (useful) kind of compilation output. + */ template class compilation_output_t; + } // namespace rtc ///@cond @@ -41,6 +51,7 @@ class module_t; namespace module { +/// Build a contextualized module from the results of a successful compilation template inline module_t create( const context_t& context, @@ -49,9 +60,6 @@ inline module_t create( } // namespace module -/** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. - */ namespace rtc { namespace program { @@ -237,6 +245,11 @@ template <> inline status_t destroy_and_return_status(handle } // namespace program +/** + * @namespace compilation_output + * + * Definitions relating to and supporting the @ref compilation_output_t class + */ namespace compilation_output { namespace detail_ { @@ -256,12 +269,14 @@ inline compilation_output_t wrap( } // namespace compilation_output /** - * Wrapper class for the result of an NVRTC compilation (including the program handle) - - * whether it succeeded or failed due to errors in the program itself. + * The result of the compilation of an {@ref rtc::program_t}, whether successful or + * failed, with any related byproducts. + * + * @note This class _may_ own a low-level program handle. * - * @note This class _may_ own an NVRTC low-level program handle. * @note If compilation failed due to apriori-invalid arguments - an exception will - * have been thrown. The only failure this class may represent + * have been thrown. A failure indication in this class indicates a program whose + * compilation actually _took place_ and ended with a failure. */ template class compilation_output_base_t { @@ -271,27 +286,30 @@ class compilation_output_base_t { using status_type = status_t; public: // getters + + /// @returns `true` if the compilation resulting in this output had succeeded bool succeeded() const { return succeeded_; } + + /// @returns `true` if the compilation resulting in this output had failed bool failed() const { return not succeeded_; } + + /// @returns `true` if the compilation resulting in this output had succeeded, `false` otherwise operator bool() const { return succeeded_; } const ::std::string& program_name() const { return program_name_; } handle_type program_handle() const { return program_handle_; } public: // non-mutators - // Unfortunately, C++'s standard string class is very inflexible, - // and it is not possible for us to get it to have an appropriately- - // sized _uninitialized_ buffer. 
We will therefore have to use - // a clunkier return type. - // - // ::std::string log() const - /** - * Obtain a copy of the log of the last compilation + * Write a copy of the program compilation log into a user-provided buffer * - * @note This will fail if the program has never been compiled. + * @param[inout] buffer A writable buffer large enough to contain the compilation log + * + * @return the buffer passed in (which has now been overwritten with the log) + * + * @note This will fail if the program has never been compiled, or if the + * buffer is not large enough to hold the complete log (plus nul character). */ - ///@{ span log(span buffer) const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -305,6 +323,13 @@ class compilation_output_base_t { return { buffer.data(), size }; } + /** + * Obtain a copy of the compilation log + * + * @returns an owning container with a nul-terminated copy of the log + * + * @note This will fail if the program has never been compiled. + */ unique_span log() const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -317,11 +342,34 @@ class compilation_output_base_t { result[size] = '\0'; return result; } - ///@} #if CUDA_VERSION >= 11010 - virtual unique_span cubin() const = 0; + /** + * Write the CUBIN result of the last compilation into a buffer. + * + * @param[inout] buffer A writable buffer large enough to contain the compiled + * program's CUBIN code. + * @return The sub-buffer, starting at the beginning of @p buffer, containing + * exactly the compiled program's CUBIN (i.e. sized down to fit the contents) + * + * @note This will fail if the program has never been compiled; due to + * compilation failure and also due to LTO/linking failure. + */ virtual span cubin(span buffer) const = 0; + + /** + * Obtain a copy of the CUBIN code resulting from the program compilation + * + * @returns an owning container with a copy of the CUBIN code + * + * @note This will fail if the program has never been compiled; if the compilation + * target was a virtual architecture (in which case only PTX is available); due to + * compilation failure and also due to LTO/linking failure. + */ + virtual unique_span cubin() const = 0; + + /// @returns true if the program has been successfully compiled, with the result + /// containing CUBIN code. virtual bool has_cubin() const = 0; #endif @@ -382,9 +430,17 @@ class compilation_output_t : public compilation_output_base_t ptx(span buffer) const { @@ -398,6 +454,13 @@ class compilation_output_t : public compilation_output_base_t ptx() const { size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str()); @@ -410,8 +473,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t= 11010 - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note CUBIN output is not available when compiling for a virtual architecture only. - * Also, it may be missing in cases such as compilation failure or link-time - * optimization compilation. - * @note This will fail if the program has never been compiled. 
- */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -456,7 +511,6 @@ class compilation_output_t : public compilation_output_base_t(result.data(), program_handle_, program_name_.c_str()); return result; } - ///@} bool has_cubin() const override { @@ -467,13 +521,12 @@ class compilation_output_t : public compilation_output_base_t 0); } - #endif #if CUDA_VERSION >= 11040 /** - * Obtain a copy of the LTO IR result of the last compilation - the intermediate - * representation used for link-time optimization + * Write the LTO IR result of the last compilation - the intermediate + * representation used for link-time optimization - into a buffer * * @throws ::std::invalid_argument if the supplied buffer is too small to hold * the program's LTO IR. @@ -485,7 +538,6 @@ class compilation_output_t : public compilation_output_base_t lto_ir(span buffer) const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -498,6 +550,15 @@ class compilation_output_t : public compilation_output_base_t lto_ir() const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -510,12 +571,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t { bool own_handle); public: // non-mutators - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note This will fail if the program has never been compiled. - */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -593,7 +645,6 @@ class compilation_output_t : public compilation_output_base_t { return { buffer.data(), size }; } -public: // non-mutators unique_span cubin() const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -606,7 +657,6 @@ class compilation_output_t : public compilation_output_base_t { result[size] = '\0'; return result; } - ///@} bool has_cubin() const override { @@ -695,6 +745,8 @@ template<> inline module_t create( #endif // CUDA_VERSION >= 11010 +/// Build a module from the results of a successful compilation, in the primary context +/// of the specified device template inline module_t create( device_t& device, diff --git a/src/cuda/rtc/error.hpp b/src/cuda/rtc/error.hpp index d880a2b5..8ac427b9 100644 --- a/src/cuda/rtc/error.hpp +++ b/src/cuda/rtc/error.hpp @@ -121,7 +121,6 @@ namespace rtc { template class runtime_error : public ::std::runtime_error { public: - ///@cond // TODO: Constructor chaining; and perhaps allow for more construction mechanisms? 
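+	// An illustrative sketch of where such errors fit in (assuming an rtc::program_t
+	// `program` whose compile() method yields a compilation_output_t `output`, and a
+	// context_t `context`): a compilation which merely fails does not throw - errors of
+	// this class are thrown for a-priori invalid API usage. One might therefore write:
+	//
+	//     auto output = program.compile();
+	//     if (not output.succeeded()) {
+	//         ::std::cerr << "compilation failed:\n" << output.log().data() << '\n';
+	//     }
+	//     auto module = cuda::module::create(context, output);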
runtime_error(status_t error_code) : ::std::runtime_error(describe(error_code)), @@ -132,7 +131,6 @@ class runtime_error : public ::std::runtime_error { ::std::runtime_error(::std::move(what_arg) + ": " + describe(error_code)), code_(error_code) { } - ///@endcond runtime_error(status::named_t error_code) : runtime_error(static_cast>(error_code)) { } runtime_error(status::named_t error_code, const ::std::string& what_arg) : @@ -190,6 +188,15 @@ inline void throw_if_error(rtc::status_t status) noexcept(false) if (is_failure(status)) { throw rtc::runtime_error(status); } } +/** + * Throws a @ref ::cuda::rtc::runtime_error exception if the status is not success + * + * @note The rationale for this macro is that neither the exception, nor its constructor + * arguments, are evaluated on the "happy path"; and that cannot be achieved with a + * function - which generally/typically evaluates its arguments. To guarantee this + * lazy evaluation with a function, we would need exception-construction-argument-producing + * lambdas, which we would obviously rather avoid. + */ #define throw_if_rtc_error_lazy(Kind, status__, ... ) \ do { \ ::cuda::rtc::status_t tie_status__ = static_cast<::cuda::rtc::status_t>(status__); \ diff --git a/src/cuda/rtc/program.hpp b/src/cuda/rtc/program.hpp index bb51099f..9aed4531 100644 --- a/src/cuda/rtc/program.hpp +++ b/src/cuda/rtc/program.hpp @@ -23,6 +23,19 @@ namespace program { namespace detail_ { +/** + * Create a new program object from source code + * + * @tparam Kind We can create a program with any one of the (two...) kinds of supported + * source code + * @param program_name an arbitrary identifier by which to recognize the program; it's suggested + * not to get too crazy + * @param program_source The source code of the program, possibly with include directives + * in the case of C++ + * @param num_headers The number of pairs of header "file" names and header content strings + * @param header_sources Pointers to nul-terminated per-header source code + * @param header_names Pointers to nul-terminated names of the different headers + */ template inline program::handle_t create( const char *program_name, @@ -285,7 +298,7 @@ class program_t : public program::detail_::base_t { protected: template - static inline void check_string_type() + static void check_string_type() { using no_cref_string_type = typename ::std::remove_const::type>::type; static_assert( diff --git a/src/cuda/rtc/types.hpp b/src/cuda/rtc/types.hpp index 1526a4c6..80f0de90 100644 --- a/src/cuda/rtc/types.hpp +++ b/src/cuda/rtc/types.hpp @@ -51,13 +51,20 @@ enum source_kind_t { // provide a container which may then be resized. /** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. + * @namespace rtc + * + * @brief Real-time compilation of programs using the NVIDIA libraries. + */ namespace rtc { /// A span of C-style strings the contents of which must not be modified using const_cstrings_span = span; +/** + * @namespace program + * + * Definitions relating to source-code programs to be compiled + */ namespace program { namespace detail_ {