From 090057f79c3efff54a966785bf244c4c86dfd870 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Fri, 22 Mar 2024 22:14:42 +0200 Subject: [PATCH] Regards #3: Additional doxygen comments, corrections and completions of existing comments, doxygen markup tweaks, and spacing tweaks --- src/cuda/api/array.hpp | 6 +- src/cuda/api/context.hpp | 22 ++- src/cuda/api/detail/unique_span.hpp | 1 - src/cuda/api/device.hpp | 11 +- src/cuda/api/launch_config_builder.hpp | 105 +++++++++- src/cuda/api/launch_configuration.hpp | 26 +++ src/cuda/api/link.hpp | 62 ++++-- src/cuda/api/link_options.hpp | 61 +++++- src/cuda/api/memory.hpp | 186 +++++++++++------- src/cuda/api/miscellany.hpp | 7 + src/cuda/api/module.hpp | 84 ++++++-- .../api/multi_wrapper_impls/unique_region.hpp | 16 ++ .../api/multi_wrapper_impls/unique_span.hpp | 2 +- src/cuda/api/pci_id.hpp | 2 + src/cuda/api/peer_to_peer.hpp | 32 ++- src/cuda/api/pointer.hpp | 2 +- src/cuda/api/primary_context.hpp | 10 + src/cuda/api/stream.hpp | 143 ++++++++++---- src/cuda/api/texture_view.hpp | 9 +- src/cuda/api/types.hpp | 18 +- src/cuda/api/unique_region.hpp | 57 ++++-- src/cuda/define_specifiers.hpp | 3 + src/cuda/rtc/compilation_output.hpp | 150 +++++++++----- src/cuda/rtc/error.hpp | 11 +- src/cuda/rtc/program.hpp | 15 +- src/cuda/rtc/types.hpp | 9 +- 26 files changed, 819 insertions(+), 231 deletions(-) diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 58a9ea49..4519cfd6 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -270,16 +270,20 @@ array_t wrap( return { device_id, context_handle, handle, dimensions }; } +/// Create a new (typed) CUDA array of the specified dimensions +///@{ +/// @param context ... in which the array is to be created template array_t create( const context_t& context, dimensions_t dimensions); +/// @param device ... in whose primary context the array is to be created template array_t create( const device_t& device, dimensions_t dimensions); - +///@} } // namespace array diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 1933d66e..ca97c5ea 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -210,6 +210,15 @@ inline context::flags_t get_flags(handle_t handle) } // namespace context +/** + * Waits for all previously-scheduled tasks on all streams (= queues) + * in a CUDA context to conclude, before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified context, the thread calling this method will either yield, + * spin or block until all tasks scheduled previously scheduled on streams + * within this context have concluded. + */ inline void synchronize(const context_t& context); /** @@ -745,10 +754,18 @@ inline handle_t create_and_push( /** * @brief creates a new context on a given device * - * @param device The device on which to create the new stream + * @param device + * The device which the new context will regard * @param sync_scheduling_policy + * Choice of how host threads are to perform synchronization with pending + * actions in streams within this context. See + * @ref host_thread_sync_scheduling_policy_t for a description of these + * choices. * @param keep_larger_local_mem_after_resize - * @return + * If true, larger allocations of global device memory, used by kernels + * requiring a larger amount of local memory, will be kept (so that future + * kernels with such requirements will not trigger a re-allocation). 
+ * * @note Until CUDA 11, there used to also be a flag for enabling/disabling * the ability of mapping pinned host memory to device addresses. However, it was * being ignored since CUDA 3.2 already, with the minimum CUDA version supported @@ -861,6 +878,7 @@ inline context_t get_with_fallback_push() } // namespace current +/// @return true if the context is the primary context of its device bool is_primary(const context_t& context); namespace detail_ { diff --git a/src/cuda/api/detail/unique_span.hpp b/src/cuda/api/detail/unique_span.hpp index 5f18f4b2..7db1e752 100644 --- a/src/cuda/api/detail/unique_span.hpp +++ b/src/cuda/api/detail/unique_span.hpp @@ -37,7 +37,6 @@ namespace cuda { * * @tparam T the type of individual elements in the unique_span */ - template> class unique_span : public ::cuda::span { public: // span types diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 4dfaf4ad..2fd91dd2 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -38,7 +38,7 @@ class pool_t; * @brief Waits for all previously-scheduled tasks on all streams (= queues) * on a specified device to conclude. * - * Depending on the host_thread_sync_scheduling_policy_t set for this + * Depending on the host_thread_sync_scheduling_policy_t set for the specified * device, the thread calling this method will either yield, spin or block * until all tasks scheduled previously scheduled on this device have been * concluded. @@ -604,11 +604,17 @@ class device_t { set_flags(other_flags | static_cast(new_policy)); } + /// @returns true if the device will keep larger amounts of global device memory allocated + /// for use as local memory, after a kernel was executed which required a larger-than-usual + /// allocation bool keeping_larger_local_mem_after_resize() const { return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; } + /// @brief Instructs the (primary context of) the device to keep larger amounts of global + /// device memory allocated for use as local memory, after a kernel was executed which + /// required a larger-than-usual allocation void keep_larger_local_mem_after_resize(bool keep = true) { auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX; @@ -616,6 +622,9 @@ class device_t { set_flags(new_flags); } + /// @brief Instructs the (primary context of) the device to discard allocations of larger + /// amounts of global device memory which were used by a kernel requiring a larger amount + /// of local memory, and has concluded execution. void dont_keep_larger_local_mem_after_resize() { keep_larger_local_mem_after_resize(false); diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp index d3faaf07..9899afe4 100644 --- a/src/cuda/api/launch_config_builder.hpp +++ b/src/cuda/api/launch_config_builder.hpp @@ -179,6 +179,8 @@ class launch_config_builder_t { } } + /// Use the information specified for the builder to figure out the grid and block + /// dimensions with which the kernel is to be launched grid::composite_dimensions_t get_composite_dimensions() const noexcept(false) { auto result = get_unvalidated_composite_dimensions(); @@ -189,6 +191,10 @@ class launch_config_builder_t { } public: + /// Use the information specified to the builder (and defaults for the unspecified + /// information) to finalize the construction of a kernel launch configuration, + /// which can then be passed along with the kernel to a kernel-launching function, + /// e.g. 
the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch launch_configuration_t build() const { auto result = launch_configuration_t{ get_composite_dimensions() }; @@ -392,6 +398,7 @@ class launch_config_builder_t { } + /// Set the dimensions for each block in the intended kernel launch grid launch_config_builder_t& block_dimensions( grid::block_dimension_t x, grid::block_dimension_t y = 1, @@ -400,8 +407,17 @@ class launch_config_builder_t { return block_dimensions(grid::block_dimensions_t{x, y, z}); } + /// Set the block in the intended kernel launch grid to be uni-dimensional + /// with a specified size launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); } + /** + * Set the intended kernel launch grid to have 1D blocks, of the maximum + * length possible given the information specified to the builder. + * + * @note This will fail if neither a kernel nor a device have been chosen + * for the launch. + */ launch_config_builder_t& use_maximum_linear_block() { grid::block_dimension_t max_size; @@ -424,6 +440,16 @@ class launch_config_builder_t { } #if CUDA_VERSION >= 12000 + /** + * Set the dimensions of multi-block clusters within the grid. + * + * @note There is only a small number of possible dimension combinations of clusters; + * and this function does _not_ guarantee to fail immediately if you specify an + * invalid such combination. + * + * @note This setting does not affect the overall dimensions of the grid in terms of + * blocks. + */ launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims) { #ifndef NDEBUG @@ -434,6 +460,9 @@ class launch_config_builder_t { } #endif + /// Set the dimension of the grid for the intended kernel launch, in terms + /// of blocks + ///@{ launch_config_builder_t& grid_dimensions(grid::dimensions_t dims) { #ifndef NDEBUG @@ -447,6 +476,7 @@ class launch_config_builder_t { return *this; } + ///@} launch_config_builder_t& grid_dimensions( grid::dimension_t x, grid::dimension_t y = 1, @@ -455,9 +485,17 @@ class launch_config_builder_t { return grid_dimensions(grid::dimensions_t{x, y, z}); } + /// Set the grid for the intended launch to be one-dimensional, with a specified number + /// of blocks + ///@{ launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); } launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); } + ///@} + + /// Set the overall number of _threads_, in each dimension, of all blocks + /// in the grid of the intended kernel launch + ///@{ launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims) { #ifndef NDEBUG @@ -474,16 +512,30 @@ class launch_config_builder_t { { return overall_dimensions(grid::overall_dimensions_t{x, y, z}); } + ///@} + /// Set the intended launch grid to be linear, with a specified overall number of _threads_ + /// over all (1D) blocks in the grid launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); } + /** + * Set whether or blocks may synchronize with each other or not + * + * @note recall that even "non-cooperative" blocks can still access the same global memory + * locations, and can use atomic operations on such locations for (slow) synchronization. 
+ */ launch_config_builder_t& block_cooperation(bool cooperation) { thread_block_cooperation = cooperation; return *this; } + /// Allow kernel thread blocks to synchronize with each other, rather than requiring them to act independently + /// (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); } + + /// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will + /// work entirely independently (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); } launch_config_builder_t& dynamic_shared_memory_size( @@ -493,11 +545,18 @@ class launch_config_builder_t { return *this; } + /// Indicate that the intended launch should not allocate any shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code. launch_config_builder_t& no_dynamic_shared_memory() { return dynamic_shared_memory_size(memory::shared::size_t(0)); } + /// Indicate that the intended launch should allocate a certain amount of shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code. + ///@{ launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size) { #ifndef NDEBUG @@ -512,13 +571,32 @@ class launch_config_builder_t { { return dynamic_shared_memory_size(size); } + ///@} + /** + * Indicate that the intended launch should allocate additional shared + * memory for the kernel to use beyond the static amount necessitated + * by its (compiled) code - with the amount to be determined based on + * the block size + * + * @param shared_mem_size_determiner a function determining the dynamic + * shared memory size given the kernel launch block size + */ launch_config_builder_t& dynamic_shared_memory( kernel::shared_memory_size_determiner_t shared_mem_size_determiner) { return dynamic_shared_memory_size(shared_mem_size_determiner); } + /** + * Indicate that the specified wrapped kernel will be the one launched + * with the configuration to be produced by this object. Such an indication + * provides this object with information about the device and context in + * which the kernel is to be launched, and ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Calling this method obviates a call to the @ref device() method. + */ launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr) { if (device_ and kernel_->device_id() != device_.value()) { @@ -533,6 +611,15 @@ class launch_config_builder_t { return *this; } + /** + * Indicate that the intended kernel launch would occur on (some stream in + * some context on) the specified device. Such an indication provides this + * object with some information regarding ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Do not call both this and the @ref kernel() method; prefer just that one.
+ */ + ///@{ launch_config_builder_t& device(const device::id_t device_id) { if (kernel_ and kernel_->device_id() != device_id) { @@ -548,7 +635,11 @@ class launch_config_builder_t { { return this->device(device.id()); } + ///@} + /// Clear the association with a specific kernel (which may have been + /// set using the @ref kernel method) + ///@{ launch_config_builder_t& kernel_independent() { kernel_ = nullptr; @@ -559,13 +650,14 @@ class launch_config_builder_t { kernel_ = nullptr; return *this; } + ///@} /** - * @brief THis will use information about the kernel, the already-set block size, + * @brief This will use information about the kernel, the already-set block size, * and the device to create a unidimensional grid of blocks to exactly saturate * the CUDA device's capacity for simultaneous active blocks. * - * @note This will _not_ set the block size - unlike + * @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}. */ launch_config_builder_t& saturate_with_active_blocks() { @@ -584,6 +676,14 @@ class launch_config_builder_t { return *this; } + /** + * @brief This will use information about the kernel and the device to define + * a minimum launch grid which should guarantee maximum occupancy of the GPU's + * multiprocessors. + * + * @note A builder after this call _will_ set the block dimensions - unlike + * {@ref saturate_with_active_blocks()} . + */ launch_config_builder_t& min_params_for_max_occupancy() { if (not (kernel_)) { @@ -600,6 +700,7 @@ class launch_config_builder_t { } }; // launch_config_builder_t +/// A slightly shorter-named construction idiom for @ref launch_config_builder_t inline launch_config_builder_t launch_config_builder() { return {}; } } // namespace cuda diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp index f891f8e0..e213e7ed 100644 --- a/src/cuda/api/launch_configuration.hpp +++ b/src/cuda/api/launch_configuration.hpp @@ -58,7 +58,16 @@ enum class cluster_scheduling_policy_t { }; #endif +/** + * The full set of possible configuration parameters for launching a kernel on a GPU. + * + * @note Consider using a @ref launch_configuration_builder_t to incrementally construct + * these structs. + * + * @note this structure must be constructed with at least the grid and block dimensions. + */ struct launch_configuration_t { + /// Dimensions of the launch grid in blocks, and of the individual blocks in the grid. grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } }; /** @@ -160,6 +169,20 @@ struct launch_configuration_t { constexpr launch_configuration_t(const launch_configuration_t&) = default; constexpr launch_configuration_t(launch_configuration_t&&) = default; + /** + * Constructors corresponding to the CUDA runtime API's triple-chevron launch + * syntax: + * + * my_kernel <<< grid_Dims, block_dims, dynamic_shmem_size, my_stream >>> ( + * arg1, arg2, arg3, etc); + * + * ... where the specified aspects of the launch configuration are the dimensions + * and the dynamic shared memory size. + * + * @note The choices of stream and kernel function are _not_ part of the launch + * configuration. 
+ */ + ///@{ constexpr launch_configuration_t( grid::composite_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_mem = 0u @@ -184,12 +207,14 @@ struct launch_configuration_t { grid::block_dimensions_t(block_dims), dynamic_shared_mem) { } + ///@} CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default; CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default; }; #if __cplusplus < 202002L +///@cond constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept { return @@ -210,6 +235,7 @@ constexpr bool operator!=(const launch_configuration_t lhs, const launch_configu { return not (lhs == rhs); } +///@endcond #endif namespace detail_ { diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index ef55571e..bd028477 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -20,14 +20,14 @@ namespace cuda { ///@cond class device_t; - class module_t; - class link_t; ///@endcond +/// Definitions related to CUDA linking-processes, captured by the @ref link_t wrapper class namespace link { +/// Kinds of images which can be used by the linker (some may require driver compilation work) enum class input_kind_t { cubin, /// Compiled device-class-specific device code ptx, /// PTX (microarchitecture-inspecific intermediate representation) @@ -36,9 +36,14 @@ enum class input_kind_t { library, /// An archive of objects files with embedded device code; a `.a` file }; +/// A raw CUDA driver handle for a linking-process using handle_t = CUlinkState; -// TODO: Check if the linking has been completed! +/** + * @brief Wrap an existing CUDA link-process in a @ref link_t wrapper class instance. + * + * @todo : Consider checking if the linking has already been completed! + */ inline link_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -48,21 +53,19 @@ inline link_t wrap( inline link_t create(const void *image, const link::options_t &options); -// TODO: Use a clase-class with C++17 of later, made up of the two classes here +/// Definitions relating to inputs to CUDA linking-processes namespace input { -/** - * A typed, named, image in memory which can be used as an input to a runtime - * CUDA linking process. - */ +/// A typed, named, image in memory which can be used as an input to a runtime CUDA linking-process struct image_t : memory::region_t { - const char *name; - link::input_kind_t type; + const char *name; /// Link images are attached a name when registered in a linking-process + link::input_kind_t type; /// type of contents found in the memory region }; +/// A typed, named, image in a file which can be used as an input to a runtime CUDA linking-process struct file_t { - const char *path; // TODO: Use a proper path in C++14 and later - link::input_kind_t type; + const char *path; + link::input_kind_t type; /// type of contents found in the file }; } // namespace input @@ -115,7 +118,16 @@ class link_t { return memory::region_t{cubin_output_start, cubin_output_size}; } - // TODO: Replace this with methods which take wrapper classes. + /** + * Add another linkable image, from memory, to this linking-process + * + * @param[in] image Memory region containing the image + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. 
+ */ void add(link::input::image_t image, const link::options_t &ptx_compilation_options = {}) const { auto marshalled_options = link::detail_::marshal(ptx_compilation_options); @@ -134,6 +146,17 @@ class link_t { + ::std::to_string(static_cast(image.type)) + " to a link."); } + /** + * Add another linkable image, from a file, to this linking-process + * + * @param[in] file_input Path of the image file to be added + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. + */ + ///@{ void add_file(link::input::file_t file_input, const link::options_t &options) const { auto marshalled_options = link::detail_::marshal(options); @@ -156,6 +179,7 @@ class link_t { return add_file(path.c_str(), file_contents_type); } #endif + ///@} protected: // constructors @@ -219,6 +243,7 @@ class link_t { namespace link { +/// Create a new link-process (before adding any compiled images or or image-files) inline link_t create(const link::options_t &options = link::options_t{}) { handle_t new_link_handle; @@ -241,13 +266,12 @@ inline link_t create(const link::options_t &options = link::options_t{}) do_take_ownership); } -// TODO: Check if the linking has been completed! inline link_t wrap( - device::id_t device_id, - context::handle_t context_handle, - link::handle_t handle, - const link::options_t &options, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + link::handle_t handle, + const link::options_t & options, + bool take_ownership) noexcept { return link_t{device_id, context_handle, handle, options, take_ownership}; } diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp index a2e17d59..52d98206 100644 --- a/src/cuda/api/link_options.hpp +++ b/src/cuda/api/link_options.hpp @@ -1,7 +1,8 @@ /** * @file * - * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + * @brief Definitions and utility functions relating to just-in-time compilation, assembly + * and linking of CUDA code. */ #pragma once #ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ @@ -21,17 +22,37 @@ class module_t; namespace link { +/// Possible strategies for obtaining fully-compiled binary code for a target device +/// when it is not immediately available. enum fallback_strategy_for_binary_code_t { + /// Prefer compiling available PTX code to produce fully-compiled binary code prefer_compiling_ptx = 0, + /// Prefer using existing fully-compiled (binary) code, for a compatible but + /// not identical target device prefer_using_compatible_binary = 1, }; namespace detail_ { +/// The CUDA driver's raw generic JIT-related option type using option_t = CUjit_option; +/** + * Mechanism for finalizing options into a format readily usable by the + * link_t wrapper (and by the `cuLink`- functions - but highly inconvenient + * for inspection and modification. + * + * @note Don't create these yourself unless you have to; use @ref options_t + * instead, and @ref options_t::marshal() when done, for completing the + * linking-process. If you must create them - use `push_back()` method + * repeatedly until done with all options. 
+ */ struct marshalled_options_t { + /// The CUDA driver's expected type for number of link-related options using size_type = unsigned; + + /// The CUDA driver's enum for option identification has this many values - + /// and thus, there is need for no more than this many marshalled options constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; protected: @@ -39,8 +60,6 @@ struct marshalled_options_t { ::std::array value_buffer; size_type count_ { 0 }; public: - size_type count() const { return count_; } - void push_back(option_t option) { if (count_ >= max_num_options) { @@ -76,7 +95,12 @@ struct marshalled_options_t { } public: - + /** + * This method (alone) is used to populate this structure. + * + * @note The class is not a standard container, and this method cannot be + * reversed or undone, i.e. there is no `pop_back()` or `pop()`. + */ template void push_back(option_t option, T value) { @@ -85,25 +109,46 @@ struct marshalled_options_t { // Now set value_buffer[count-1]... value_buffer[count_-1] = process_value(value); } + + /// These three methods yield what the CUDA driver actually expects: + /// Two matching raw buffers and their count of elements + ///@{ const option_t* options() const { return option_buffer.data(); } const void * const * values() const { return value_buffer.data(); } + size_type count() const { return count_; } + ///@} }; } // namespace detail_ +/** + * A convenience class for holding, setting and inspecting options for a CUDA binary code + * linking process - which may also involve PTX compilation. + * + * @note This structure does not let you set those options which the CUDA driver documentation + * describes as having internal purposes only. + */ struct options_t final : public rtc::common_ptx_compilation_options_t { + /// options related to logging the link-process struct { + /// Non-error information regarding the logging process (i.e. its "standard output" stream) optional> info; + + /// Information regarding errors in the logging process (i.e. its "standard error" stream) optional> error; + + /// Control whether the info and error logging will be verbose bool verbose; } logs; - // Note: When this is true, the specific_target of the base class - // is overridden + /// Instead of using explicitly-specified binary target, from + /// @ref common_ptx_compilation_options_t::specific_target - use the device of the current CUDA + /// context as the target for binary generation bool obtain_target_from_cuda_context { true }; - /// fallback behavior if a (matching cubin???) is not found + /// Possible strategy for obtaining fully-compiled binary code when it is not + /// simply available in the input to the link-process optional fallback_strategy_for_binary_code; // Ignoring the "internal purposes only" options; @@ -118,6 +163,8 @@ struct options_t final : public rtc::common_ptx_compilation_options_t { namespace detail_ { +/// Construct a easily-driver-usable link-process options structure from +/// a more user-friendly `options_t` structure. inline marshalled_options_t marshal(const options_t& link_options) { marshalled_options_t marshalled{}; diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index ce72ecae..24f286a5 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -85,7 +85,7 @@ enum cpu_write_combining : bool { }; /** - * @brief options accepted by CUDA's allocator of memory with a host-side aspect + * options accepted by CUDA's allocator of memory with a host-side aspect * (host-only or managed memory). 
*/ struct allocation_options { @@ -106,6 +106,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) /** * @namespace mapped + * * Memory regions appearing in both on the host-side and device-side address * spaces with the regions in both spaces mapped to each other (i.e. guaranteed * to have the same contents on access up to synchronization details). Consult the @@ -124,7 +125,7 @@ struct span_pair_t { }; /** - * @brief A pair of memory regions, one in system (=host) memory and one on a + * A pair of memory regions, one in system (=host) memory and one on a * CUDA device's memory - mapped to each other * * @note this is the mapped-pair equivalent of a `void *`; it is not a @@ -143,9 +144,7 @@ struct region_pair_t { } // namespace mapped -/** - * @brief CUDA-Device-global memory on a single device (not accessible from the host) - */ +///CUDA-Device-global memory on a single device (not accessible from the host) namespace device { namespace detail_ { @@ -325,7 +324,7 @@ struct deleter { /** - * @brief Sets consecutive elements of a region of memory to a fixed + * Sets consecutive elements of a region of memory to a fixed * value of some width * * @note A generalization of `set()`, for different-size units. @@ -339,7 +338,7 @@ template void typed_set(T* start, const T& value, size_t num_elements); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset for CUDA device-side memory * @@ -366,7 +365,7 @@ inline void set(region_t region, int byte_value) ///@} /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -389,7 +388,7 @@ inline void zero(region_t region) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a value of a certain type, in a CUDA device's * global memory @@ -558,7 +557,7 @@ inline void copy(region_t destination, void* source) ///@} /** - * @brief Sets a number of bytes in memory to a fixed value + * Sets a number of bytes in memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -571,7 +570,7 @@ inline void copy(region_t destination, void* source) void set(void* ptr, int byte_value, size_t num_bytes); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -586,7 +585,7 @@ inline void set(region_t region, int byte_value) } /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -610,7 +609,7 @@ inline void zero(void* ptr, size_t num_bytes) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a single element of a certain type, which may * be in host-side memory, global CUDA-device-side memory or CUDA-managed @@ -662,7 +661,7 @@ status_t multidim_copy(copy_parameters_t params) } // namespace detail_ /** - * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * An almost-generalized-case memory copy, taking a rather complex structure of * copy parameters - wrapping the CUDA driver's own most-generalized-case copy * * @tparam 
NumDimensions The number of dimensions of the parameter structure. @@ -1216,8 +1215,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han /** - * @brief Sets consecutive elements of a region of memory to a fixed - * value of some width + * Sets consecutive elements of a region of memory to a fixed value of some width * * @note A generalization of `async::set()`, for different-size units. * @@ -1263,7 +1261,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s void zero(void* start, size_t num_bytes, const stream_t& stream); /** - * @brief Asynchronously sets all bytes of a single pointed-to value + * Asynchronously sets all bytes of a single pointed-to value * to 0 (zero). * * @note asynchronous version of @ref memory::zero(T*) @@ -1440,6 +1438,7 @@ inline void copy( /** * @namespace host + * * Host-side (= system) memory which is "pinned", i.e. resides in * a fixed physical location - and allocated by the CUDA driver. */ @@ -1459,7 +1458,7 @@ inline region_t allocate( /** - * allocate pinned host memory + * Allocates pinned host memory * * @note "pinned" memory is allocated in contiguous physical ram * addresses, making it possible to copy to and from it to the the @@ -1469,25 +1468,17 @@ inline region_t allocate( * * @throws cuda::runtime_error if allocation fails for any reason * - * @param context * @param size_in_bytes the amount of memory to allocate, in bytes * @param options * options to pass to the cuda host-side memory allocator; see * {@ref memory::allocation_options}. * @return a pointer to the allocated stretch of memory */ -///@{ - -inline region_t allocate( - const context_t& context, - size_t size_in_bytes, - allocation_options options); - -region_t allocate( - size_t size_in_bytes, - allocation_options options); +region_t allocate(size_t size_in_bytes, allocation_options options); /** + * @copydoc allocate(size_t, allocation_options) + * * @param portability * whether or not the allocated region can be used in different * CUDA contexts. @@ -1504,16 +1495,18 @@ inline region_t allocate( return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } ); } +/// @copydoc allocate(size_t, portability_across_contexts, cpu_write_combining) inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) { return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); } -///@} - /** - * Free a region of pinned host memory which was allocated with one of the pinned host + * Frees a region of pinned host memory which was allocated with one of the pinned host * memory allocation functions. + * + * @note The address provided must be the _beginning_ of the region of allocated memory; + * and the entire region is freed (i.e. the region size is known to/determined by the driver) */ inline void free(void* host_ptr) { @@ -1526,6 +1519,11 @@ inline void free(void* host_ptr) throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } +/** + * @copybrief free(void*) + * + * @param region The region of memory to free + */ inline void free(region_t region) { return free(region.data()); } namespace detail_ { @@ -1537,9 +1535,8 @@ struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } }; - /** - * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. 
+ * Makes a pre-allocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to @@ -1669,7 +1666,7 @@ inline void deregister(const_region_t region) } /** - * @brief Sets all bytes in a stretch of host-side memory to a single value + * Sets all bytes in a stretch of host-side memory to a single value * * @note a wrapper for @ref ::std::memset * @@ -1703,22 +1700,6 @@ inline void zero(T* ptr) } // namespace host -/** - * This type of memory, also known as _unified_ memory, appears within - * a unified, all-system address space - and is used with the same - * address range on the host and on all relevant CUDA devices on a - * system. It is paged, so that it may exceed the physical size of - * a CUDA device's global memory. The CUDA driver takes care of - * "swapping" pages "out" from a device to host memory or "swapping" - * them back "in", as well as of propagation of changes between - * devices and host-memory. - * - * @note For more details, see - * - * Unified Memory for CUDA Beginners on the - * Parallel4All blog. - * - */ namespace managed { namespace detail_ { @@ -1899,7 +1880,7 @@ inline region_t allocate( } // namespace detail_ /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices. * * @param context the initial context which is likely to access the managed @@ -1917,7 +1898,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed @@ -1935,7 +1916,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on all CUDA devices. * * @note While the allocated memory should be available universally, the @@ -2007,7 +1988,7 @@ inline void prefetch( } // namespace detail_ /** - * @brief Prefetches a region of managed memory to a specific device, so + * Prefetches a region of managed memory to a specific device, so * it can later be used there without waiting for I/O from the host or other * devices. */ @@ -2017,7 +1998,7 @@ void prefetch( const stream_t& stream); /** - * @brief Prefetches a region of managed memory into host memory. It can + * Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ void prefetch_to_host( @@ -2194,6 +2175,7 @@ inline bool is_part_of_a_region_pair(const void* ptr) namespace device { +/// A unique span of device-global memory template using unique_span = cuda::unique_span; @@ -2212,25 +2194,39 @@ unique_span make_unique_span(const context::handle_t context_handle, size_t s } // namespace detail_ /** - * @brief Create a variant of ::std::unique_pointer for an array in - * device-global memory. + * Allocate memory for a consecutive sequence of typed elements in device-global memory. 
* - * @note CUDA's runtime API always has a current device; but - - * there is not necessary a current context; so a primary context - * for a device may be created through this call. + * @tparam T type of the individual elements in the allocated sequence * - * @tparam T an array type; _not_ the type of individual elements + * @param context The CUDA device context in which to make the allocation. + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said * - * @param context The CUDA device context in which to make the - * allocation. - * @param num_elements the number of elements to allocate + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. * - * @return an ::std::unique_ptr pointing to the constructed T array -*/ + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user beware. + */ template unique_span make_unique_span(const context_t& context, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span(const device_t& device, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span(size_t size); @@ -2243,7 +2239,7 @@ inline device::unique_span make_unique_span(const context_t& context, size_t return device::make_unique_span(context, num_elements); } -/// See @ref `device::make_unique_span(const device_t& device, size_t num_elements)` +/// See @ref `device::make_unique_span(const context_t& context, size_t num_elements)` template inline device::unique_span make_unique_span(const device_t& device, size_t num_elements) { @@ -2252,9 +2248,29 @@ inline device::unique_span make_unique_span(const device_t& device, size_t nu namespace host { +/// A unique span of CUDA-driver-allocated, pinned host (=system) memory template using unique_span = cuda::unique_span; +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. 
+ */ template unique_span make_unique_span(size_t size) { auto allocated = allocate(size * sizeof(T)); @@ -2265,6 +2281,7 @@ unique_span make_unique_span(size_t size) namespace managed { +/// A unique span of CUDA-driver-allocated managed memory template using unique_span = cuda::unique_span; @@ -2282,16 +2299,48 @@ unique_span make_unique_span( } // namespace detail_ +/** + * Allocate memory for a consecutive sequence of typed elements in managed + * memory, accessible with the same address on the host and on CUDA devices. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std::make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware. + */ template unique_span make_unique_span( const context_t& context, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span( const device_t& device, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span( size_t size, @@ -2302,6 +2351,7 @@ unique_span make_unique_span( } // namespace memory namespace symbol { + /** * Locates a CUDA symbol in global or constant device memory * diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 6d04f547..1913ac0d 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,6 +30,13 @@ inline void initialize_driver() throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } +/** + * A mechanism for ensuring a @ref cuInit() call has been made, to use before making + * any other driver API calls. + * + * @note differs from simply calling `initialize_driver()` in that repeated calls + * from the same thread will avoid additional @ref cuInit() calls. + */ inline void ensure_driver_is_initialized() { thread_local bool driver_known_to_be_initialized{false}; diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index f69315e4..b06d694c 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -30,10 +30,13 @@ class kernel_t; namespace module { +/// The CUDA driver's raw handle for modules using handle_t = CUmodule; namespace detail_ { +/// Construct a module proxy object - for an existing module - from the class' +/// constituent fields inline module_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -70,6 +73,8 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i * be loaded (and in which the module contents may be used) * @param[in] module_data the opaque, raw binary data for the module - in a contiguous container * such as a span, a cuda::unique_span etc..
+ * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ ///@{ template = 201703L +/// @copydoc load_from_file(device_t, const char*) inline module_t load_from_file( const device_t& device, const ::std::filesystem::path& path) @@ -298,6 +337,7 @@ inline module_t load_from_file( return load_from_file(device, path.c_str()); } +/// @copydoc load_from_file(const char*) inline module_t load_from_file( const ::std::filesystem::path& path) { @@ -320,23 +360,19 @@ inline module_t wrap( return module_t{device_id, context_handle, module_handle, take_ownership, hold_pc_refcount_unit}; } -/* -template -module_t create(const context_t& context, const void* module_data, Creator creator_function); -*/ - /** * Creates a new module in a context using raw compiled code * * @param context The module will exist within this GPU context, i.e. the globals (functions, * variable) of the module would be usable within that constant. * @param module_data The raw compiled code for the module. - * @param link_options Potential options for the PTX compilation and device linking of the code. + * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ -///@{ module_t create(const context_t& context, const void* module_data, const link::options_t& link_options); + +/// @copydoc create(const context_t&, const void*, const link::options_t&) module_t create(const context_t& context, const void* module_data); -///@} inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id) { @@ -362,7 +398,17 @@ inline device::primary_context_t get_context_for(device_t& locus); } // namespace detail_ -// Note: The following may create the primary context of a device! +/** + * Create a new module - in a specified context or in a device's primary context, + * using raw module data in memory. + * + * @tparam Locus Either a @ref cuda::device_t or a {@ref cuda::context_t}. + * @tparam ContiguousContainer A span, a vector, a unique_span, or similar type + * @param locus Where the new module should be created + * @param module_data The raw data for the module in locus-accessible memory. + * + * @note This function may create/allocate resources for the primary context of a device! + */ template ::value, bool>> module_t create( @@ -373,6 +419,14 @@ module_t create( return detail_::create(context, module_data.data()); } +/** + * @copydoc create(Locus&&, ContiguousContainer) + * + * @param link_options Options for PTX compilation and for linking the module data, + * eventually. + * + * @return + */ // Note: The following may create the primary context of a device! template ::value, bool>> diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp index 34fb9c5f..3416def8 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_region.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -72,6 +72,14 @@ inline unique_region make_unique_region(size_t num_bytes) namespace managed { +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
+ * + * @param context the initial context which is likely to access the allocated memory region + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, @@ -81,6 +89,14 @@ inline unique_region make_unique_region( return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; } +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @param device the initial device which is likely to access the allocated memory region + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, diff --git a/src/cuda/api/multi_wrapper_impls/unique_span.hpp b/src/cuda/api/multi_wrapper_impls/unique_span.hpp index c2907c63..0c0ee18b 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_span.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_span.hpp @@ -28,7 +28,7 @@ unique_span make_unique_span(const context_t& context, size_t num_elements) } /** - * @brief Create a variant of ::std::unique_pointer for an array in + * @brief Allocate (but do not initialize) an array in * device-global memory * * @tparam T an array type; _not_ the type of individual elements diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp index 8c8e56ae..9518e941 100644 --- a/src/cuda/api/pci_id.hpp +++ b/src/cuda/api/pci_id.hpp @@ -53,6 +53,8 @@ struct pci_location_t { * and any of them can be used. */ static pci_location_t parse(const ::std::string& id_str); + + /// @copydoc parse(const ::std::string& id_str) static pci_location_t parse(const char* id_str); }; diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index 68c718b1..cc7c3f93 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -14,17 +14,31 @@ namespace cuda { namespace device { +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among peer CUDA GPU devices + * on the same system.
+ */ namespace peer_to_peer { -// Aliases for all CUDA device attributes +/// Aliases for CUDA driver GPU attribute codes +///@{ + +/// A relative value indicating the performance of the link between two devices +constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; + +/// 1 if access is supported, 0 otherwise +constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + +/// 1 if the first device can perform native atomic operations on the second device, 0 otherwise +constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; -constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices -constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise -constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise #if CUDA_VERSION >= 10000 -constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array iterpolatory access operations are supported across the link, 0 otherwise +/// 1 if special array interpolatory access operations are supported across the link, 0 otherwise +constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; #endif - +///@} namespace detail_ { /** @@ -94,6 +108,12 @@ void disable_access_to(const context_t &peer_context); } // namespace current +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among "peer" contexts on + * the same system, which may possibly regard different CUDA devices. + */ namespace peer_to_peer { namespace detail_ { diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp index 2e573b72..1688b8d3 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -120,7 +120,7 @@ inline memory::type_t type_of(const void* ptr) /// Obtain (a non-owning wrapper for) the CUDA context with which a memory address is associated /// (e.g. being the result of an allocation or mapping in that context) -inline context_t context_of(const void* ptr); +context_t context_of(void const* ptr); /** * A convenience wrapper around a raw pointer "known" to the CUDA runtime diff --git a/src/cuda/api/primary_context.hpp b/src/cuda/api/primary_context.hpp index db59acc5..4bbaff92 100644 --- a/src/cuda/api/primary_context.hpp +++ b/src/cuda/api/primary_context.hpp @@ -75,6 +75,12 @@ inline void increase_refcount(device::id_t device_id) } // namespace detail_ +/** + * @returns true if the device's primary context is active (i.e. has resources allocated for it), + * which implies we are holding a refcount unit for it somewhere. + * + * @note recall a primary context being active does not mean that it is the _current_ context + */ inline bool is_active(const device_t& device); /** @@ -140,6 +146,9 @@ class primary_context_t : public context_t { public: + /// @return a stream object for the default-ID stream of the device, which + /// is pre-created and on which actions are scheduled when the runtime API + /// is used and no stream is specified. 
stream_t default_stream() const noexcept; public: // friendship @@ -280,6 +289,7 @@ inline bool is_current(device::id_t device_id) } // namespace detail +/// @return true if the current context is its device's primary context inline bool is_current() { auto device_id = context::current::detail_::get_device_id(); diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp index e9896a9a..ab57b314 100644 --- a/src/cuda/api/stream.hpp +++ b/src/cuda/api/stream.hpp @@ -50,6 +50,11 @@ enum : bool { nonblocking = async, }; +/** + * Kinds of conditions to apply to a value in GPU global memory + * when waiting on that value, i.e. on what condition to stop + * waiting. + */ enum wait_condition_t : unsigned { greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ, geq = CU_STREAM_WAIT_VALUE_GEQ, @@ -173,6 +178,9 @@ void enqueue_function_call(const stream_t& stream, Function function, void * arg * the current context and outlasting it. When set to `true`, * the proxy class will act as it does usually, destroying the stream * when being destructed itself. + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist * @return an instance of the stream proxy class, with the specified * device-stream combination. */ @@ -255,6 +263,8 @@ class stream_t { return flags & CU_STREAM_NON_BLOCKING; } + /// @returns the execution priority of tasks on this stream (relative to other + /// tasks in other streams on the same device) stream::priority_t priority() const { int the_priority; @@ -321,8 +331,21 @@ class stream_t { const stream_t& associated_stream; public: + ///@cond enqueue_t(const stream_t& stream) : associated_stream(stream) {} + ///@endcond + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel_function A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param parameters the arguments to be passed to the kernel for this launch + * + * @note This function is cognizant of the types of all arguments passed to it; + * for a type-erased version, see @ref type_erased_kernel_launch() + */ template void kernel_launch( const KernelFunction& kernel_function, @@ -336,6 +359,19 @@ class stream_t { ::std::forward(parameters)...); } + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param marshalled_arguments Pointers to arguments to be passed to the kernel + * for this launch + * + * @note This signature does not require any type information regarding the kernel + * function type; see @ref kernel_launch() for a type-observing version of the + * same scheduling operation. + */ void type_erased_kernel_launch( const kernel_t& kernel, launch_configuration_t launch_configuration, @@ -345,21 +381,14 @@ class stream_t { } /** - * Have the CUDA device perform an I/O operation between two specified - * memory regions (on or off the actual device) + * Copy operations * + * The source and destination memory regions may be anywhere the CUDA driver can + * map (e.g. the device's global memory, host/system memory, the global memory of + * another device, constant memory etc.) */ - ///@{ - /** - * @param destination destination region into which to copy.
May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param source destination region from which to copy. May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param num_bytes size of the region to copy - **/ + /// Schedule a copy of one region of memory to another void copy(void *destination, const void *source, size_t num_bytes) const { // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy; @@ -368,6 +397,7 @@ class stream_t { memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source, size_t num_bytes) const { #ifndef NDEBUG @@ -378,16 +408,23 @@ class stream_t { copy(destination, source.start(), num_bytes); } + /** + * @copybrief copy(void *, const void *, size_t) const + * + * @note @p num_bytes may be smaller than the sizes of any of the regions + */ void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const { copy(destination.start(), source, num_bytes); } + /// @copybrief copy(void *, const void *, size_t) const void copy(memory::region_t destination, memory::const_region_t source) const { copy(destination, source, source.size()); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source) const { copy(destination, source, source.size()); @@ -398,9 +435,9 @@ class stream_t { * Set all bytes of a certain region in device memory (or unified memory, * but using the CUDA device to do it) to a single fixed value. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param byte_value the value with which to fill the memory region bytes - * @param num_bytes size of the region to fill + * @param num_bytes size in bytes of the region to fill */ void memset(void *start, int byte_value, size_t num_bytes) const { @@ -409,6 +446,7 @@ class stream_t { memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); } + /// @copydoc memset(void *, int, size_t) const void memset(memory::region_t region, int byte_value) const { memset(region.data(), byte_value, region.size()); @@ -422,7 +460,7 @@ class stream_t { * API call for setting to zero; does that mean there are special facilities * for zero'ing memory faster? Who knows. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param num_bytes size of the region to fill */ void memzero(void *start, size_t num_bytes) const @@ -431,6 +469,9 @@ class stream_t { memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_); } + /** + * @copydoc memzero(void *, size_t) const + */ void memzero(memory::region_t region) const { memzero(region.data(), region.size()); @@ -492,6 +533,7 @@ class stream_t { } public: + /// Enqueues a host-invokable object, typically a function or closure object call. template void host_invokable(Invokable& invokable) const { @@ -529,13 +571,16 @@ class stream_t { memory::device::async::free(associated_stream, region); } #endif - ///@{ /** * Sets the attachment of a region of managed memory (i.e. 
in the address space visible * on all CUDA devices and the host) in one of several supported attachment modes. * - * The attachmentis actually a commitment vis-a-vis the CUDA driver and the GPU itself + * @param managed_region_start a pointer to the beginning of the managed memory region. + * This cannot be a pointer to anywhere in the middle of an allocated region - you must + * pass whatever @ref cuda::memory::managed::allocate() returned. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself * that it doesn't need to worry about accesses to this memory from devices other than * its object of attachment, so that the driver can optimize scheduling accordingly. * @@ -547,12 +592,6 @@ class stream_t { * the attachment goes into effect (some time after) previous scheduled actions have * concluded. */ - ///@{ - /** - * @param managed_region_start a pointer to the beginning of the managed memory region. - * This cannot be a pointer to anywhere in the middle of an allocated region - you must - * pass whatever @ref cuda::memory::managed::allocate() returned. - */ void attach_managed_region( const void* managed_region_start, memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const @@ -572,8 +611,23 @@ class stream_t { } /** - * @param region the managed memory region to attach; it cannot be a sub-region - - * you must pass whatever @ref cuda::memory::managed::allocate() returned. + * @copybrief attach_managed_region(const void*, memory::managed::attachment_t) const + * + * @param region the entire managed memory region; note this must not be a sub-region; + * you must pass whatever the CUDA memory allocation or construction code provided + * you with, in full. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself + * that it doesn't need to worry about accesses to this memory from devices other than + * its object of attachment, so that the driver can optimize scheduling accordingly. + * + * @note by default, the memory region is attached to this specific stream on its + * specific device. In this case, the host will be allowed to read from this memory + * region whenever no kernels are pending on this stream. + * + * @note Attachment happens asynchronously, as an operation on this stream, i.e. + * the attachment goes into effect (some time after) previous scheduled actions have + * concluded. */ void attach_managed_region( memory::region_t region, @@ -581,8 +635,6 @@ class stream_t { { attach_managed_region(region.start(), attachment); } - ///@} - /** * Will pause all further activity on the stream until the specified event has @@ -862,16 +914,21 @@ class stream_t { // it must release its refcount unit on destruction public: // data members - which only exist in lieu of namespaces + + /// This data member is a gadget for use instead of a "class-local" namespace; + /// we do not need it as a distinct object const enqueue_t enqueue { *this }; // The use of *this here is safe, since enqueue_t doesn't do anything with it // on its own. Any use of enqueue only happens through, well, *this - and // after construction. 
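+	// An illustrative usage sketch (not part of the wrapper code itself): assuming a valid
+	// stream_t `stream`, a kernel wrapper `kern`, a launch_configuration_t `config`, an
+	// argument `some_arg`, and two equal-sized device memory regions `dst` and `src`, the
+	// enqueue_t members above might typically be used along these lines:
+	//
+	//     stream.enqueue.kernel_launch(kern, config, some_arg); // typed launch
+	//     stream.enqueue.copy(dst, src);      // asynchronous copy of the full region
+	//     stream.enqueue.memset(dst, 0xff);   // fill the region with a byte value
+	//     stream.enqueue.memzero(dst);        // zero the region
+	//     cuda::synchronize(stream);          // wait for all of the above to conclude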
}; +///@cond inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept { return not (lhs == rhs); } +///@endcond namespace stream { @@ -972,16 +1029,7 @@ void enqueue_function_call(const stream_t& stream, Function function, void* argu * for execution scheduling; lower numbers represent higher properties; * each device has a range of priorities, which can be obtained using * @ref device_t::stream_priority_range() - * @param hold_pc_refcount_unit when the event's context is a device's primary - * context, this controls whether that context must be kept active while the - * event continues to exist - * @return The newly-created stream - */ -///@{ - -/** - * @brief Create a new stream (= queue) in the primary execution context - * of a CUDA device. + * @return The newly-created stream */ stream_t create( const device_t& device, @@ -992,6 +1040,16 @@ stream_t create( * @brief Create a new stream (= queue) in a CUDA execution context. * * @param context the execution context in which to create the stream + * @param synchronizes_with_default_stream if true, no work on this stream + * will execute concurrently with work from the default stream (stream 0) + * @param priority priority of tasks on the stream, relative to other streams, + * for execution scheduling; lower numbers represent higher priorities; + * each device has a range of priorities, which can be obtained using + * @ref device_t::stream_priority_range() + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist + * @return The newly-created stream */ stream_t create( const context_t& context, @@ -1002,7 +1060,16 @@ stream_t create( } // namespace stream -inline void synchronize(const stream_t& stream) +/** + * Waits for all previously-scheduled tasks on a given stream to conclude, + * before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified stream, the thread calling this method will either yield, + * spin or block until all tasks previously scheduled on the + * stream have concluded. 
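+ *
+ * For example - as a sketch only, assuming a valid @ref device_t named `dev`, a kernel
+ * wrapper `kern`, a launch configuration `config` and an argument `some_arg` - one
+ * might write:
+ * @code
+ * auto stream = cuda::stream::create(dev, cuda::stream::async);
+ * stream.enqueue.kernel_launch(kern, config, some_arg);
+ * cuda::synchronize(stream); // returns only after the launched kernel has concluded
+ * @endcode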
+ */ + inline void synchronize(const stream_t& stream) { // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to, // and not have trouble acting on a stream in another context - it balks at doing so under diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp index c4456948..34a31a47 100644 --- a/src/cuda/api/texture_view.hpp +++ b/src/cuda/api/texture_view.hpp @@ -21,6 +21,7 @@ class texture_view; namespace texture { +/// The CUDA driver's raw, opaque handle for texture objects using raw_handle_t = CUtexObject; /** @@ -111,7 +112,6 @@ class texture_view { other.owning_ = false; }; - template texture_view( const cuda::array_t& arr, @@ -160,7 +160,10 @@ class texture_view { public: // non-mutating getters + /// @returns A non-owning proxy object for the CUDA context in which this texture is defined context_t context() const; + + /// @returns A non-owning proxy object for the CUDA device on which this texture is defined device_t device() const; public: // friendship @@ -174,7 +177,7 @@ class texture_view { bool owning_; }; - +///@cond inline bool operator==(const texture_view& lhs, const texture_view& rhs) noexcept { return lhs.raw_handle() == rhs.raw_handle(); @@ -184,7 +187,7 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep { return lhs.raw_handle() != rhs.raw_handle(); } - +///@nocond namespace texture { inline texture_view wrap( diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index c709db6d..21c3ae33 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -573,6 +573,22 @@ using size_t = unsigned; } // namespace shared +/** + * This type of memory, also known as _unified_ memory, appears within + * a unified, all-system address space - and is used with the same + * address range on the host and on all relevant CUDA devices on a + * system. It is paged, so that it may exceed the physical size of + * a CUDA device's global memory. The CUDA driver takes care of + * "swapping" pages "out" from a device to host memory or "swapping" + * them back "in", as well as of propagation of changes between + * devices and host-memory. + * + * @note For more details, see + * + * Unified Memory for CUDA Beginners on the + * Parallel4All blog. + * + */ namespace managed { enum class initial_visibility_t { @@ -706,7 +722,7 @@ using flags_t = unsigned; /** * Scheduling policies the Runtime API may use when the host-side * thread it is running in needs to wait for results from a certain - * device + * device or context. */ enum host_thread_sync_scheduling_policy_t : unsigned int { diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp index d054217f..a9b65a99 100644 --- a/src/cuda/api/unique_region.hpp +++ b/src/cuda/api/unique_region.hpp @@ -151,6 +151,7 @@ class unique_region : public region_t { namespace device { +/// A unique region of device-global memory using unique_region = memory::unique_region; namespace detail_ { @@ -164,20 +165,30 @@ inline unique_region make_unique_region(const context::handle_t context_handle, } // namespace detail_ /** - * @brief Allocate an array in device-global memory and return an owning class for it + * @brief Allocate a region in device-global memory * - * @param num_bytes the size in bytes of the allocated region - */ -///@{ -/** - * @param device The CUDA device in whose global memory to make the allocation. 
+ * @param context The context within which (and in the device global memory + * of which) to make the allocation + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const context_t& context, size_t num_bytes); + /** - * @param context The CUDA context in which to make the allocation. + * @brief Allocate a region in device-global memory + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a region in device-global memory within the primary context + * of the current CUDA device + * + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region + */ unique_region make_unique_region(size_t num_bytes); ///}@ @@ -198,19 +209,21 @@ inline device::unique_region make_unique_region(const device_t& device, size_t n namespace host { +/// A unique region of pinned host memory using unique_region = memory::unique_region; -inline unique_region make_unique_region( - const context_t& context, - size_t num_bytes, - allocation_options options = allocation_options{}); -inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a physical-address-pinned region of system memory + * + * @returns An owning RAII/CADRe object for the allocated memory region + */ inline unique_region make_unique_region(size_t num_bytes); } // namespace host namespace managed { +/// A unique region of managed memory, see @ref cuda::memory::managed using unique_region = memory::unique_region; namespace detail_ { @@ -226,14 +239,34 @@ inline unique_region make_unique_region( } // namespace detail_ +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param context A context, to set when allocating the memory region, for whatever + * association effect that may have. + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param device A device, whose primary context will be current when allocating + * the memory region, for whatever association effect that may have. + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
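+ *
+ * As a usage sketch only (error handling omitted), one might allocate and touch
+ * such a region as follows:
+ * @code
+ * auto region = cuda::memory::managed::make_unique_region(1024 * sizeof(float));
+ * auto* data = static_cast<float*>(region.data());
+ * data[0] = 1.0f; // directly accessible from host code as well as from kernels
+ * @endcode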
+ * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( size_t num_bytes); diff --git a/src/cuda/define_specifiers.hpp b/src/cuda/define_specifiers.hpp index a907ea11..4234041b 100644 --- a/src/cuda/define_specifiers.hpp +++ b/src/cuda/define_specifiers.hpp @@ -8,6 +8,8 @@ #ifdef __CUDACC__ +/// Shorthands for CUDA-specific function declaration decorations +///@{ #ifndef CUDA_FD #define CUDA_FD __forceinline__ __device__ #endif @@ -57,5 +59,6 @@ #ifndef CUDA_H #define CUDA_H #endif +///@} #endif // __CUDACC__ diff --git a/src/cuda/rtc/compilation_output.hpp b/src/cuda/rtc/compilation_output.hpp index 45e457c5..c95359bf 100644 --- a/src/cuda/rtc/compilation_output.hpp +++ b/src/cuda/rtc/compilation_output.hpp @@ -23,8 +23,18 @@ class context_t; ///@endcond namespace rtc { + +/** + * The output produced by a compilation process by one of the CUDA libraries, + * including any byproducts. + * + * @tparam Kind Which language was compiled to produce the result + * + * @note A failed compilation is also a (useful) kind of compilation output. + */ template class compilation_output_t; + } // namespace rtc ///@cond @@ -41,6 +51,7 @@ class module_t; namespace module { +/// Build a contextualized module from the results of a successful compilation template inline module_t create( const context_t& context, @@ -49,9 +60,6 @@ inline module_t create( } // namespace module -/** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. - */ namespace rtc { namespace program { @@ -237,6 +245,11 @@ template <> inline status_t destroy_and_return_status(handle } // namespace program +/** + * @namespace compilation_output + * + * Definitions relating to and supporting the @ref compilation_output_t class + */ namespace compilation_output { namespace detail_ { @@ -256,12 +269,14 @@ inline compilation_output_t wrap( } // namespace compilation_output /** - * Wrapper class for the result of an NVRTC compilation (including the program handle) - - * whether it succeeded or failed due to errors in the program itself. + * The result of the compilation of an {@ref rtc::program_t}, whether successful or + * failed, with any related byproducts. + * + * @note This class _may_ own a low-level program handle. * - * @note This class _may_ own an NVRTC low-level program handle. * @note If compilation failed due to apriori-invalid arguments - an exception will - * have been thrown. The only failure this class may represent + * have been thrown. A failure indication in this class indicates a program whose + * compilation actually _took place_ and ended with a failure. */ template class compilation_output_base_t { @@ -271,27 +286,30 @@ class compilation_output_base_t { using status_type = status_t; public: // getters + + /// @returns `true` if the compilation resulting in this output had succeeded bool succeeded() const { return succeeded_; } + + /// @returns `true` if the compilation resulting in this output had failed bool failed() const { return not succeeded_; } + + /// @returns `true` if the compilation resulting in this output had succeeded, `false` otherwise operator bool() const { return succeeded_; } const ::std::string& program_name() const { return program_name_; } handle_type program_handle() const { return program_handle_; } public: // non-mutators - // Unfortunately, C++'s standard string class is very inflexible, - // and it is not possible for us to get it to have an appropriately- - // sized _uninitialized_ buffer. 
We will therefore have to use - // a clunkier return type. - // - // ::std::string log() const - /** - * Obtain a copy of the log of the last compilation + * Write a copy of the program compilation log into a user-provided buffer * - * @note This will fail if the program has never been compiled. + * @param[inout] buffer A writable buffer large enough to contain the compilation log + * + * @return the buffer passed in (which has now been overwritten with the log) + * + * @note This will fail if the program has never been compiled, or if the + * buffer is not large enough to hold the complete log (plus nul character). */ - ///@{ span log(span buffer) const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -305,6 +323,13 @@ class compilation_output_base_t { return { buffer.data(), size }; } + /** + * Obtain a copy of the compilation log + * + * @returns an owning container with a nul-terminated copy of the log + * + * @note This will fail if the program has never been compiled. + */ unique_span log() const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -317,11 +342,34 @@ class compilation_output_base_t { result[size] = '\0'; return result; } - ///@} #if CUDA_VERSION >= 11010 - virtual unique_span cubin() const = 0; + /** + * Write the CUBIN result of the last compilation into a buffer. + * + * @param[inout] buffer A writable buffer large enough to contain the compiled + * program's CUBIN code. + * @return The sub-buffer, starting at the beginning of @p buffer, containing + * exactly the compiled program's CUBIN (i.e. sized down to fit the contents) + * + * @note This will fail if the program has never been compiled; due to + * compilation failure and also due to LTO/linking failure. + */ virtual span cubin(span buffer) const = 0; + + /** + * Obtain a copy of the CUBIN code resulting from the program compilation + * + * @returns an owning container with a copy of the CUBIN code + * + * @note This will fail if the program has never been compiled; if the compilation + * target was a virtual architecture (in which case only PTX is available); due to + * compilation failure and also due to LTO/linking failure. + */ + virtual unique_span cubin() const = 0; + + /// @returns true if the program has been successfully compiled, with the result + /// containing CUBIN code. virtual bool has_cubin() const = 0; #endif @@ -382,9 +430,17 @@ class compilation_output_t : public compilation_output_base_t ptx(span buffer) const { @@ -398,6 +454,13 @@ class compilation_output_t : public compilation_output_base_t ptx() const { size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str()); @@ -410,8 +473,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t= 11010 - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note CUBIN output is not available when compiling for a virtual architecture only. - * Also, it may be missing in cases such as compilation failure or link-time - * optimization compilation. - * @note This will fail if the program has never been compiled. 
- */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -456,7 +511,6 @@ class compilation_output_t : public compilation_output_base_t(result.data(), program_handle_, program_name_.c_str()); return result; } - ///@} bool has_cubin() const override { @@ -467,13 +521,12 @@ class compilation_output_t : public compilation_output_base_t 0); } - #endif #if CUDA_VERSION >= 11040 /** - * Obtain a copy of the LTO IR result of the last compilation - the intermediate - * representation used for link-time optimization + * Write the LTO IR result of the last compilation - the intermediate + * representation used for link-time optimization - into a buffer * * @throws ::std::invalid_argument if the supplied buffer is too small to hold * the program's LTO IR. @@ -485,7 +538,6 @@ class compilation_output_t : public compilation_output_base_t lto_ir(span buffer) const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -498,6 +550,15 @@ class compilation_output_t : public compilation_output_base_t lto_ir() const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -510,12 +571,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t { bool own_handle); public: // non-mutators - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note This will fail if the program has never been compiled. - */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -593,7 +645,6 @@ class compilation_output_t : public compilation_output_base_t { return { buffer.data(), size }; } -public: // non-mutators unique_span cubin() const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -606,7 +657,6 @@ class compilation_output_t : public compilation_output_base_t { result[size] = '\0'; return result; } - ///@} bool has_cubin() const override { @@ -695,6 +745,8 @@ template<> inline module_t create( #endif // CUDA_VERSION >= 11010 +/// Build a module from the results of a successful compilation, in the primary context +/// of the specified device template inline module_t create( device_t& device, diff --git a/src/cuda/rtc/error.hpp b/src/cuda/rtc/error.hpp index d880a2b5..8ac427b9 100644 --- a/src/cuda/rtc/error.hpp +++ b/src/cuda/rtc/error.hpp @@ -121,7 +121,6 @@ namespace rtc { template class runtime_error : public ::std::runtime_error { public: - ///@cond // TODO: Constructor chaining; and perhaps allow for more construction mechanisms? 
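+	// An illustrative sketch of where such errors fit in (assuming an rtc::program_t
+	// `program` whose compile() method yields a compilation_output_t `output`, and a
+	// context_t `context`): a compilation which merely fails does not throw - errors of
+	// this class are thrown for a-priori invalid API usage. One might therefore write:
+	//
+	//     auto output = program.compile();
+	//     if (not output.succeeded()) {
+	//         ::std::cerr << "compilation failed:\n" << output.log().data() << '\n';
+	//     }
+	//     auto module = cuda::module::create(context, output);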
runtime_error(status_t error_code) : ::std::runtime_error(describe(error_code)), @@ -132,7 +131,6 @@ class runtime_error : public ::std::runtime_error { ::std::runtime_error(::std::move(what_arg) + ": " + describe(error_code)), code_(error_code) { } - ///@endcond runtime_error(status::named_t error_code) : runtime_error(static_cast>(error_code)) { } runtime_error(status::named_t error_code, const ::std::string& what_arg) : @@ -190,6 +188,15 @@ inline void throw_if_error(rtc::status_t status) noexcept(false) if (is_failure(status)) { throw rtc::runtime_error(status); } } +/** + * Throws a @ref ::cuda::rtc::runtime_error exception if the status is not success + * + * @note The rationale for this macro is that neither the exception, nor its constructor + * arguments, are evaluated on the "happy path"; and that cannot be achieved with a + * function - which generally/typically evaluates its arguments. To guarantee this + * lazy evaluation with a function, we would need exception-construction-argument-producing + * lambdas, which we would obviously rather avoid. + */ #define throw_if_rtc_error_lazy(Kind, status__, ... ) \ do { \ ::cuda::rtc::status_t tie_status__ = static_cast<::cuda::rtc::status_t>(status__); \ diff --git a/src/cuda/rtc/program.hpp b/src/cuda/rtc/program.hpp index bb51099f..9aed4531 100644 --- a/src/cuda/rtc/program.hpp +++ b/src/cuda/rtc/program.hpp @@ -23,6 +23,19 @@ namespace program { namespace detail_ { +/** + * Create a new program object from source code + * + * @tparam Kind We can create a program with any one of the (two...) kinds of supported + * source code + * @param program_name an arbitrary identifier by which to recognize the program; it's suggested + * not to get too crazy + * @param program_source The source code of the program, possibly with include directives + * in the case of C++ + * @param num_headers The number of pairs of header "file" names and header content strings + * @param header_sources Pointers to nul-terminated per-header source code + * @param header_names Pointers to nul-terminated names of the different headers + */ template inline program::handle_t create( const char *program_name, @@ -285,7 +298,7 @@ class program_t : public program::detail_::base_t { protected: template - static inline void check_string_type() + static void check_string_type() { using no_cref_string_type = typename ::std::remove_const::type>::type; static_assert( diff --git a/src/cuda/rtc/types.hpp b/src/cuda/rtc/types.hpp index 1526a4c6..80f0de90 100644 --- a/src/cuda/rtc/types.hpp +++ b/src/cuda/rtc/types.hpp @@ -51,13 +51,20 @@ enum source_kind_t { // provide a container which may then be resized. /** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. + * @namespace rtc + * + * @brief Real-time compilation of programs using the NVIDIA libraries. + */ namespace rtc { /// A span of C-style strings the contents of which must not be modified using const_cstrings_span = span; +/** + * @namespace program + * + * Definitions relating to source-code programs to be compiled + */ namespace program { namespace detail_ {