diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 6976d359..e6beb8db 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -725,14 +725,30 @@ class stream_t {
 	 */
 	void flush_remote_writes() const
 	{
-		CUstreamBatchMemOpParams flush_op;
-		flush_op.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+		CUstreamBatchMemOpParams op_params;
+		op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+		op_params.flushRemoteWrites.flags = 0;
 		unsigned count = 1;
 		unsigned flags = 0;
 		// Let's cross our fingers and assume nothing else needs to be set here...
-		cuStreamBatchMemOp(associated_stream.handle_, count, &flush_op, flags);
+		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
+		throw_if_error_lazy(status, "scheduling a flush-remote-writes memory operation as a 1-op batch");
 	}
 
+#if CUDA_VERSION >= 11070
+	void memory_barrier(memory::barrier_scope_t scope) const
+	{
+		CUstreamBatchMemOpParams op_params;
+		op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
+		op_params.memoryBarrier.flags = static_cast<unsigned>(scope);
+		unsigned count = 1;
+		unsigned flags = 0;
+		// Let's cross our fingers and assume nothing else needs to be set here...
+		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
+		throw_if_error_lazy(status, "scheduling a memory barrier operation as a 1-op batch");
+	}
+#endif
+
 	/**
 	 * Enqueue multiple single-value write, wait and flush operations to the device
 	 * (avoiding the overhead of multiple enqueue calls).
diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp
index 9bcfc894..ae3dcbd1 100644
--- a/src/cuda/api/types.hpp
+++ b/src/cuda/api/types.hpp
@@ -668,6 +668,14 @@ using range_attribute_t = CUmem_range_attribute;
 
 } // namespace managed
 
+#if CUDA_VERSION >= 11070
+enum class barrier_scope_t : typename std::underlying_type<CUstreamMemoryBarrier_flags>::type {
+	device = CU_STREAM_MEMORY_BARRIER_TYPE_GPU,
+	system = CU_STREAM_MEMORY_BARRIER_TYPE_SYS
+};
+#endif // CUDA_VERSION >= 11070
+
+
 } // namespace memory
 
 /**
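For reference, a minimal host-side usage sketch of the new enqueue operation. It assumes the library's existing conventions — the <cuda/api.hpp> umbrella header, cuda::device::current::get(), device_t::create_stream(), cuda::stream::async and the public stream_t::enqueue member — none of which appear in the hunks above, so treat the surrounding calls as illustrative rather than exact.

#include <cuda/api.hpp>

void barrier_demo()
{
	// Illustrative only: device and stream creation follow the wrappers' usual pattern
	auto device = cuda::device::current::get();
	auto stream = device.create_stream(cuda::stream::async);
#if CUDA_VERSION >= 11070
	// Enqueue a barrier ordering this stream's preceding memory operations
	// before subsequent ones, as observed by the whole device...
	stream.enqueue.memory_barrier(cuda::memory::barrier_scope_t::device);
	// ...or by the whole system (host and peer devices included)
	stream.enqueue.memory_barrier(cuda::memory::barrier_scope_t::system);
#endif
	// Flush outstanding remote writes (e.g. over GPUDirect RDMA), as before
	stream.enqueue.flush_remote_writes();
	stream.synchronize();
}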