Fixes #223, #272: Stream attributes:

* Can now copy all attributes between streams (with CUDA >= 11). * Can now get and set streams' synchronization policy (with CUDA >= 11).
eyalroz · Jan 14, 2022 · 4a1b8e5 · 4a1b8e5
1 parent 4e30bd2
commit 4a1b8e5
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 7 deletions.
diff --git a/examples/by_runtime_api_module/stream_management.cu b/examples/by_runtime_api_module/stream_management.cu
@@ -76,6 +76,20 @@ __global__ void increment(char* data, size_t length)
 	data[global_index]++;
 }
 
+#if CUDA_VERSION >= 11000
+const char* get_policy_name(cuda::stream::synchronization_policy_t policy)
+{
+	switch(policy) {
+		case cuda::stream::automatic: return "automatic";
+		case cuda::stream::spin: return "spin";
+		case cuda::stream::yield: return "yield";
+		case cuda::stream::block: return "block";
+		default:
+			return "unknown policy";
+	}
+}
+#endif // CUDA_VERSION >= 11000
+
 int main(int argc, char **argv)
 {
 	constexpr const size_t N = 50;
@@ -122,6 +136,38 @@ int main(int argc, char **argv)
 		cuda::stream::default_priority + 1,
 		cuda::stream::no_implicit_synchronization_with_default_stream);
 
+#if CUDA_VERSION >= 11000
+	// Stream synchronization policy and attribute copying
+
+	auto initial_policy = stream_1.synchronization_policy();
+	std::cout
+		<< "Initial stream synchronization policy is "
+		<< get_policy_name(initial_policy) << " (numeric value: " << (int) initial_policy << ")\n";
+	if (initial_policy != stream_2.synchronization_policy()) {
+		throw std::logic_error("Different synchronization policies for streams created the same way");
+	}
+	cuda::stream::synchronization_policy_t alt_policy =
+		(initial_policy == cuda::stream::yield) ? cuda::stream::block : cuda::stream::yield;
+	stream_2.set_synchronization_policy(alt_policy);
+	auto new_s2_policy = stream_2.synchronization_policy();
+	if (alt_policy != new_s2_policy) {
+		std::stringstream ss;
+		ss
+			<< "Got a different synchronization policy (" << get_policy_name(new_s2_policy) << ")"
+			<< " than the one we set the stream to (" << get_policy_name(alt_policy) << ")\n";
+		throw std::logic_error(ss.str());
+	}
+	std::cout << "Overwriting all attributes of stream 1 with those of stream 2.\n";
+	cuda::copy_attributes(stream_1, stream_2);
+	auto s1_policy_after_copy = stream_1.synchronization_policy();
+	if (alt_policy != s1_policy_after_copy) {
+		std::stringstream ss;
+		ss
+			<< "Got a different synchronization policy (" << get_policy_name(s1_policy_after_copy) << ")"
+			<< " than the one we expected after attribute-copying (" << get_policy_name(alt_policy) << ")\n";
+		throw std::logic_error(ss.str());
+	}
+#endif
 
 	constexpr auto buffer_size = 12345678;
 	auto buffer = cuda::memory::managed::make_unique<char[]>(

diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
@@ -38,6 +38,48 @@ enum : bool {
 	async = no_implicit_synchronization_with_default_stream,
 };
 
+#if CUDA_VERSION >= 11000
+/**
+ * Possible synchronization behavior of a host thread when performing a synchronous action
+ * on a stream (in particular, synchronizing with a stream).
+ */
+enum synchronization_policy_t : typename std::underlying_type<cudaSynchronizationPolicy>::type {
+	/**
+	 * @todo Figure out what this default actually is!
+	 */
+	automatic = cudaSyncPolicyAuto,
+
+	/**
+	 * @brief Keep control and spin-check for result availability
+	 *
+	 * Instruct CUDA to actively spin when waiting for the stream to
+	 * complete pending actions. This can decrease latency when waiting
+	 * for the device, but may lower the performance of other CPU threads
+	 * working in parallel.
+	 */
+	spin      = cudaSyncPolicySpin,
+
+	/**
+	 * @brief Yield control while waiting for results.
+	 *
+	 * Instruct CUDA to yield its thread when waiting for the stream
+	 * to complete pending actions. This can increase latency when
+	 * waiting for the device, but can increase the performance of other
+	 * CPU threads performing work in parallel.
+	 *
+	 */
+	yield     = cudaSyncPolicyYield,
+
+	/**
+	 * @brief Block the thread until the stream has concluded pending actions.
+	 *
+	 * Instruct CUDA to block the CPU thread on a synchronization
+	 * primitive when waiting for the stream to finish work.
+	 */
+	block  = cudaSyncPolicyBlockingSync
+};
+#endif // CUDA_VERSION >= 11000
+
 namespace detail_ {
 
 inline id_t create_on_current_device(
@@ -556,6 +598,26 @@ class stream_t {
 		cuda::synchronize(*this);
 	}
 
+#if CUDA_VERSION >= 11000
+	stream::synchronization_policy_t synchronization_policy()
+	{
+		device::current::detail_::scoped_override_t set_device_for_this_scope(device_id_);
+		cudaStreamAttrValue wrapped_result{};
+		auto status = cudaStreamGetAttribute(id_, cudaStreamAttributeSynchronizationPolicy, &wrapped_result);
+		throw_if_error(status);
+		return static_cast<stream::synchronization_policy_t>(wrapped_result.syncPolicy);
+	}
+
+	void set_synchronization_policy(stream::synchronization_policy_t policy)
+	{
+		device::current::detail_::scoped_override_t set_device_for_this_scope(device_id_);
+		cudaStreamAttrValue wrapped_value{};
+		wrapped_value.syncPolicy = static_cast<cudaSynchronizationPolicy>(policy);
+		auto status = cudaStreamSetAttribute(id_, cudaStreamAttributeSynchronizationPolicy, &wrapped_value);
+		throw_if_error(status);
+	}
+#endif
+
 protected: // constructor
 
 	stream_t(device::id_t device_id, stream::id_t stream_id, bool take_ownership = false) noexcept
@@ -679,6 +741,30 @@ inline void synchronize(const stream_t& stream)
 		+ " on CUDA device " + ::std::to_string(stream.device().id()));
 }
 
+#if CUDA_VERSION >= 11000
+/**
+ * Overwrite all "attributes" of one stream with those of another
+ *
+ * @param dest The stream whose attributes will be overwritten
+ * @param src The stream whose attributes are to be copied
+ *
+ * @note As of CUDA 11.5, the "attributes" are the thread
+ * synchronization policy and the various L2 access policy window
+ * settings; see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#L2_access_policy
+ * for details.
+ */
+inline void copy_attributes(const stream_t& dest, const stream_t& src)
+{
+#ifndef NDEBUG
+	if (dest.device() != src.device()) {
+		throw std::invalid_argument("Attempt to copy attributes between streams on different devices");
+	}
+#endif
+	device::current::scoped_override_t set_device_for_this_scope(dest.device());
+	auto status = cudaStreamCopyAttributes(dest.id(), src.id());
+	throw_if_error(status);
+}
+#endif // CUDA_VERSION >= 11000
 
 } // namespace cuda
 

diff --git a/src/cuda/common/types.hpp b/src/cuda/common/types.hpp
@@ -538,6 +538,7 @@ using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;
  * device
  */
 enum host_thread_synch_scheduling_policy_t : unsigned int {
+
 	/**
 	 * @brief Default behavior; yield or spin based on a heuristic.
 	 *
@@ -549,6 +550,7 @@ enum host_thread_synch_scheduling_policy_t : unsigned int {
 	 * actively spin on the processor.
 	 */
 	heuristic = cudaDeviceScheduleAuto,
+
 	/**
 	 * @brief Keep control and spin-check for result availability
 	 *
@@ -559,6 +561,15 @@ enum host_thread_synch_scheduling_policy_t : unsigned int {
 	 *
 	 */
 	spin      = cudaDeviceScheduleSpin,
+
+	/**
+	 * @brief Block the thread until results are available.
+	 *
+	 * Instruct CUDA to block the CPU thread on a synchronization
+	 * primitive when waiting for the device to finish work.
+	 */
+	block     = cudaDeviceScheduleBlockingSync,
+
 	/**
 	 * @brief Yield control while waiting for results.
 	 *
@@ -568,14 +579,8 @@ enum host_thread_synch_scheduling_policy_t : unsigned int {
 	 * performing work in parallel with the device.
 	 *
 	 */
-	block     = cudaDeviceScheduleBlockingSync,
-	/**
-	 * @brief Block the thread until results are available.
-	 *
-	 * Instruct CUDA to block the CPU thread on a synchronization
-	 * primitive when waiting for the device to finish work.
-	 */
 	yield     = cudaDeviceScheduleYield,
+
 	/** see @ref heuristic */
 	automatic = heuristic,
 };