Skip to content

Commit

Permalink
Regards #3: Multiple doxygen, regular-comment, and spacing fixes, add…
Browse files Browse the repository at this point in the history
…itions, improvements and removal of redundancies.
  • Loading branch information
eyalroz committed Mar 25, 2024
1 parent 9d6521f commit 08e8877
Show file tree
Hide file tree
Showing 44 changed files with 327 additions and 245 deletions.
1 change: 0 additions & 1 deletion src/cuda/api/common_ptx_compilation_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,6 @@ struct common_ptx_compilation_options_t {

/**
* The minimum number of threads per block which the compiler should target
* @note can't be combined with a value for the @ref target property.
*/
optional<grid::block_dimension_t> min_num_threads_per_block{};

Expand Down
11 changes: 6 additions & 5 deletions src/cuda/api/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,9 @@ inline void synchronize(const context_t& context);
* @note By default this class has RAII semantics, i.e. it creates a
* context on construction and destroys it on destruction, and isn't merely
* an ephemeral wrapper one could apply and discard; but this second kind of
* semantics is also supported, through the @ref context_t::holds_refcount_unit_ field.
* semantics is also supported, through the @ref context_t::owning_ field.
*
* @note A context is a specific to a device; see, therefore, also @ref device_t .
 * @note A context is specific to a device; see, therefore, also {@ref cuda::device_t}.
* @note This class is a "reference type", not a "value type". Therefore, making changes
* to properties of the context is a const-respecting operation on this class.
*/
Expand All @@ -228,9 +228,9 @@ class context_t {
public: // inner classes

/**
* @brief A class to create a faux member in a @ref device_t, in lieu of an in-class
* @brief A class to create a faux member in a @ref context_t, in lieu of an in-class
* namespace (which C++ does not support); whenever you see a function
* `my_dev.memory::foo()`, think of it as a `my_dev::memory::foo()`.
 * `my_context.memory::foo()`, think of it as a `my_context::memory::foo()`.
*/
class global_memory_type {
protected: // data members
Expand Down Expand Up @@ -492,7 +492,7 @@ class context_t {
* Gets the synchronization policy to be used for threads synchronizing
* with this CUDA context.
*
* @note see @ref host_thread_sync_scheduling_policy_t
* @note see @ref context::host_thread_sync_scheduling_policy_t
* for a description of the various policies.
*/
context::host_thread_sync_scheduling_policy_t sync_scheduling_policy() const
Expand Down Expand Up @@ -664,6 +664,7 @@ class context_t {
protected: // data members
device::id_t device_id_;
context::handle_t handle_;
/// When true, the object is a value type, and the context must be destroyed on destruction
bool owning_;
// this field is mutable only for enabling move construction; other
// than in that case it must not be altered
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/api/copy_parameters.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* @file
*
* @brief The @ref copy_parameters class template and related definitions.
* @brief The @ref cuda::memory::copy_parameters_t class template and related definitions.
*/
#ifndef CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP
#define CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP
Expand Down
10 changes: 8 additions & 2 deletions src/cuda/api/detail/region.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/**
* @file
*
* @brief A memory region class (@ref `cuda::memory::region`) and related
* functionality.
* @brief A memory region class (@ref cuda::memory::region_t and @ref
* cuda::memory::const_region_t) and related functionality.
*
* @note There is no CUDA-specific functionality here, and this class could be
* used irrespective of the CUDA APIs and GPUs in general.
Expand Down Expand Up @@ -121,6 +121,9 @@ bool operator!=(const base_region_t<T>& lhs, const base_region_t<T>& rhs)

} // namespace detail_

/**
* An untyped, but sized, region in some memory space
*/
struct region_t : public detail_::base_region_t<void> {
using base_region_t<void>::base_region_t;
region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const
Expand All @@ -130,6 +133,9 @@ struct region_t : public detail_::base_region_t<void> {
}
};

/**
* An untyped, but sized, region with const-constrained data in some memory space
*/
struct const_region_t : public detail_::base_region_t<void const> {
using base_region_t<void const>::base_region_t;
const_region_t(region_t r) : base_region_t(r.start(), r.size()) {}
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/api/detail/span.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* @file
*
* @brief Contains an implementation of an std::span-like class, @ref `cuda::span`
* @brief Contains an implementation of an std::span-like class, @ref cuda::span
*
* @note When compiling with C++20 or later, the actual std::span is used instead
*/
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/api/detail/unique_span.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* @file
*
* @brief Contains an implementation of an std::dynarray-like class, @ref `cuda::unique_span`
* @brief Contains the class @ref cuda::unique_span
*
* @note There is no CUDA-specific code in this file; the class is usable entirely independently
* of the CUDA APIs and GPUs in general
Expand Down
39 changes: 30 additions & 9 deletions src/cuda/api/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,39 +514,60 @@ class device_t {
return id_;
}

stream_t default_stream(bool hold_primary_context_refcount_unit = false) const;

/**
* See @ref cuda::stream::create()
* Obtain a wrapper for the (always-existing) default stream within
 * the device's primary context.
*
* @param hold_primary_context_refcount_unit when true, the returned stream
 * wrapper will keep the device's primary context in existence during
* its lifetime.
*/
stream_t default_stream(bool hold_primary_context_refcount_unit = false) const;

/// See @ref cuda::stream::create()
stream_t create_stream(
bool will_synchronize_with_default_stream,
stream::priority_t priority = cuda::stream::default_priority) const;

/**
* See @ref cuda::event::create()
*/
/// See @ref cuda::event::create()
event_t create_event(
bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default
bool records_timing = event::do_record_timings,
bool interprocess = event::not_interprocess);

/// See @ref cuda::context::create()
context_t create_context(
context::host_thread_sync_scheduling_policy_t sync_scheduling_policy = context::heuristic,
bool keep_larger_local_mem_after_resize = false) const;

#if CUDA_VERSION >= 11020

/// See @ref cuda::memory::pool::create()
template <memory::pool::shared_handle_kind_t Kind = memory::pool::shared_handle_kind_t::no_export>
memory::pool_t create_memory_pool() const;

#endif

template<typename KernelFunction, typename ... KernelParameters>
/**
 * Launch a kernel on the default stream of the device's primary context
*
 * @tparam Kernel May be either a plain function type (for a `__global__` function
 * accessible to the translation unit), or (a reference to) any subclass of
 * `cuda::kernel_t`.
* @param kernel_function
* the kernel to launch; may be either a (`__global__`) function pointer,
* or a kernel proxy class.
* @param launch_configuration
* the configuration with which to launch the kernel;
* @param arguments
* the arguments with which to launch @p kernel (but note that references
* are not maintained).
*/
template<typename Kernel, typename ... KernelParameters>
void launch(
KernelFunction kernel_function,
Kernel kernel,
launch_configuration_t launch_configuration,
KernelParameters... parameters) const;
KernelParameters... arguments) const;

/**
* Determines the range of possible priorities for streams on this device.
Expand Down
12 changes: 6 additions & 6 deletions src/cuda/api/device_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,14 @@ constexpr compute_capability_t make_compute_capability(unsigned major, unsigned
/**
* @brief A structure holding a collection various properties of a device
*
* @note Somewhat annoyingly, CUDA devices have attributes, properties and flags.
* @note Somewhat annoyingly, CUDA devices have all of attributes, properties and flags.
* Attributes have integral number values; properties have all sorts of values,
* including arrays and limited-length strings (see
* @ref cuda::device::properties_t), and flags are either binary or
* small-finite-domain type fitting into an overall flagss value (see
* @ref cuda::device_t::flags_t). Flags and properties are obtained all at once,
* including arrays and limited-length strings, and flags are actually associated with
* a device's primary context, as it is actually _contexts_ which have flags (which are
* either binary or small-finite-domain type fitting into an overall flags value:
 * {@ref context::flags_t}). Flags and properties are obtained all at once (the latter,
 * using the runtime API), while attributes are obtained one at a time.
*
*/
struct properties_t : public cudaDeviceProp {

Expand Down
13 changes: 5 additions & 8 deletions src/cuda/api/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ namespace status {
* @note unfortunately, this enum can't inherit from @ref cuda::status_t
*/
enum named_t : ::std::underlying_type<status_t>::type {
success = CUDA_SUCCESS,
success = CUDA_SUCCESS, ///< Operation was successful; no errors
memory_allocation_failure = CUDA_ERROR_OUT_OF_MEMORY, // corresponds to cudaErrorMemoryAllocation
not_yet_initialized = CUDA_ERROR_NOT_INITIALIZED, // corresponds to cudaErrorInitializationError
already_deinitialized = CUDA_ERROR_DEINITIALIZED, // corresponds to cudaErrorCudartUnloading
Expand Down Expand Up @@ -336,7 +336,7 @@ do { \
* Do nothing... unless the status indicates an error, in which case
* a @ref cuda::runtime_error exception is thrown
*
* @param status should be @ref cuda::status::success - otherwise an exception is thrown
* @param status should be @ref status::success - otherwise an exception is thrown
* @param message An extra description message to add to the exception
*/
inline void throw_if_error(status_t status, const ::std::string& message) noexcept(false)
Expand Down Expand Up @@ -453,16 +453,13 @@ inline void ensure_none(const char *message) noexcept(false)
}

/**
* @brief Does nothing (unless throwing an exception)
* @brief Does nothing (except possibly throwing an exception)
*
* @note similar to @ref throw_if_error, but uses the CUDA Runtime API's internal
* state
*
* @throws cuda::runtime_error if the CUDA runtime API has
* encountered previously encountered an (uncleared) error
*
* @param clear_any_error When true, clears the CUDA Runtime API's state from
* recalling errors arising from before this oment
 * @throws cuda::runtime_error if the CUDA runtime API has previously
 * encountered an (uncleared) error
*/
inline void ensure_none() noexcept(false)
{
Expand Down
17 changes: 10 additions & 7 deletions src/cuda/api/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ namespace event {
 * @note This is a named constructor idiom, consisting of direct access to the ctor
* of the same signature, to emphasize that a new event is _not_ created.
*
* @param device_id Index of the device to which the event relates
* @param context_handle Handle of the context in which this event was created
* @param event_handle handle of the pre-existing event
* @param take_ownership When set to `false`, the CUDA event
* will not be destroyed along with proxy; use this setting
* when temporarily working with a stream existing irrespective of
* the current context and outlasting it. When set to `true`,
* the proxy class will act as it does usually, destroying the event
* when being destructed itself.
* @param take_ownership When set to `false`, the CUDA event will not be destroyed
* along with proxy; use this setting when temporarily working with a stream
* existing irrespective of the current context and outlasting it. When set to
* `true`, the proxy class will act as it does usually, destroying the event
* when being destructed itself.
* @param hold_pc_refcount_unit when the event's context is a device's primary
* context, this controls whether that context must be kept active while
* the event continues to exist.
* @return an event wrapper associated with the specified event
*/
event_t wrap(
Expand Down Expand Up @@ -125,7 +128,7 @@ inline void wait(const event_t& event);
* @note By default this class has RAII semantics, i.e. it has the runtime create
* an event on construction and destroy it on destruction, and isn't merely
* an ephemeral wrapper one could apply and discard; but this second kind of
* semantics is also (sort of) supported, through the @ref event_t::owning field.
* semantics is also (sort of) supported, through the @ref event_t::owning_ field.
*/
class event_t {

Expand Down
7 changes: 5 additions & 2 deletions src/cuda/api/ipc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ inline void unmap(void* ipc_mapped_ptr)
*
* @param device_ptr beginning of the region of memory
* to be shared with other processes
* @return a handle which another process can call @ref import()
* @return a handle which another process can call @ref detail_::import()
* on to obtain a device pointer it can use
*/
inline ptr_handle_t export_(void* device_ptr)
Expand Down Expand Up @@ -302,15 +302,18 @@ inline handle_t export_(const event_t& event);
* @param event_ipc_handle the handle obtained via inter-process communications
*/
///@{

/**
* @param device the device with which the imported event is associated
*/
inline event_t import(const device_t& device, const handle_t& event_ipc_handle);

/**
* @param context the device-context with which the imported event is associated
* @param event_ipc_handle The handle created by another process, to be imported
* @return An event usable in the current process
*/
inline event_t import(const context_t& device, const handle_t& event_ipc_handle);
inline event_t import(const context_t& context, const handle_t& event_ipc_handle);
///@}

} // namespace ipc
Expand Down
26 changes: 18 additions & 8 deletions src/cuda/api/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "primary_context.hpp"
#include "current_context.hpp"
#include "device_properties.hpp"
#include "error.hpp"
#include "types.hpp"

Expand Down Expand Up @@ -39,15 +40,18 @@ using shared_memory_size_determiner_t = size_t (CUDA_CB *)(int block_size);
 * @note This is a named constructor idiom, consisting of direct access to the ctor
* of the same signature, to emphasize that a new kernel is _not_ somehow created.
*
* @param id Device on which the texture is located
* @param context_handle Handle of the context in which the kernel was created or added
* @param handle raw CUDA driver handle for the kernel
* @param device_id Device of the context in which the kernel was created
* @param context_handle Handle of the context in which the kernel was created
* @param handle Raw CUDA driver handle for the kernel
* @param hold_pc_refcount_unit when the event's context is a device's primary
* context, this controls whether that context must be kept active while the
* event continues to exist.
* @return a wrapper object associated with the specified kernel
*/
kernel_t wrap(
device::id_t device_id,
context::handle_t context_id,
kernel::handle_t f,
context::handle_t context_handle,
kernel::handle_t handle,
bool hold_primary_context_refcount_unit = false);

namespace detail_ {
Expand Down Expand Up @@ -115,12 +119,18 @@ inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attri
class kernel_t {

public: // getters

/// Get (a proxy for) the context in which this kernel is defined
context_t context() const noexcept;
/// Get (a proxy for) the device for (a context of) which this kernel is defined
device_t device() const noexcept;

/// Get the id of the device for (a context of) which this kernel is defined
device::id_t device_id() const noexcept { return device_id_; }
/// Get the raw handle of the context in which this kernel is defined
context::handle_t context_handle() const noexcept { return context_handle_; }
#if CAN_GET_APRIORI_KERNEL_HANDLE
/// Get the raw (intra-context) CUDA handle for this kernel
kernel::handle_t handle() const noexcept { return handle_; }
#else
kernel::handle_t handle() const
Expand Down Expand Up @@ -375,11 +385,11 @@ namespace kernel {

inline kernel_t wrap(
device::id_t device_id,
context::handle_t context_id,
kernel::handle_t f,
context::handle_t context_handle,
kernel::handle_t handle,
bool hold_primary_context_refcount_unit)
{
return kernel_t{ device_id, context_id, f, hold_primary_context_refcount_unit };
return kernel_t{device_id, context_handle, handle, hold_primary_context_refcount_unit };
}

inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute)
Expand Down
9 changes: 2 additions & 7 deletions src/cuda/api/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@
* @ref cuda::launch_configuration_t .
* </ul>
*
* @note You'd probably better avoid launching kernels using these
* function directly, and go through the @ref cuda::stream_t or @ref cuda::device_t
* proxy classes' launch mechanism (e.g.
* `my_stream.enqueue.kernel_launch(...)`).
*
* @note Even though when you use this wrapper, your code will not have the silly
* chevron, you can't use it from regular `.cpp` files compiled with your host
* compiler. Hence the `.cuh` extension. You _can_, however, safely include this
Expand Down Expand Up @@ -335,8 +330,8 @@ void launch(

/**
* Launch a kernel with the arguments pre-marshalled into the (main) form
* which @ref cuLaunchKernel accepts variables in: A null-terminated sequence
* of (possibly const) `void *`'s to the argument values.
* which the CUDA driver's launch primitive accepts variables in: A null-
* terminated sequence of (possibly const) `void *`'s to the argument values.
*
* @tparam SpanOfConstVoidPtrLike
* Type of the container for the marshalled arguments; typically, this
Expand Down
Loading

0 comments on commit 08e8877

Please sign in to comment.