From d047bc676268ca67bc61e4109b6e4af25b1ee7da Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Tue, 10 Sep 2024 23:12:37 +0300 Subject: [PATCH] Fixes #678: Reworked unique_span: * Dropped the deleter template parameter * Added a deleter member - always of the same type * The deleter now takes the span * Added a unique_span generator for non-default-constructible elements * Dropped the per-memory-space unique_span types - they're all the same type now * Made the default constructor explicitly zero thing out to avoid spurious deletions of uninitialized / partially uninitialized spans --- .../p2pBandwidthLatencyTest.cu | 12 +- .../other/io_compute_overlap_with_streams.cu | 12 +- examples/other/new_cpp_standard/main.cpp | 4 +- src/cuda/api/detail/unique_span.hpp | 154 ++++++++++++++---- src/cuda/api/memory.hpp | 129 +++++++++++---- .../api/multi_wrapper_impls/unique_span.hpp | 13 +- 6 files changed, 248 insertions(+), 76 deletions(-) diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index d440b1d7..f7541c36 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -164,8 +164,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p int numElems = 10000000; int repeat = 5; vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -294,8 +294,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) vector streams_0; vector streams_1; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -405,8 +405,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer // vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; diff --git a/examples/other/io_compute_overlap_with_streams.cu b/examples/other/io_compute_overlap_with_streams.cu index f53b4e9f..08b9e185 100644 --- a/examples/other/io_compute_overlap_with_streams.cu +++ b/examples/other/io_compute_overlap_with_streams.cu @@ -46,12 +46,12 @@ constexpr I div_rounding_up(I dividend, const I2 divisor) noexcept } struct buffer_set_t { - cuda::memory::host::unique_span host_lhs; - cuda::memory::host::unique_span host_rhs; - cuda::memory::host::unique_span host_result; - cuda::memory::device::unique_span device_lhs; - cuda::memory::device::unique_span device_rhs; - cuda::memory::device::unique_span device_result; + cuda::unique_span host_lhs; + cuda::unique_span host_rhs; + cuda::unique_span host_result; + cuda::unique_span device_lhs; + cuda::unique_span device_rhs; + cuda::unique_span device_result; }; std::vector generate_buffers( diff --git a/examples/other/new_cpp_standard/main.cpp b/examples/other/new_cpp_standard/main.cpp index 6e0e876c..b3cae564 100644 --- a/examples/other/new_cpp_standard/main.cpp +++ b/examples/other/new_cpp_standard/main.cpp @@ -25,8 +25,8 @@ cuda::device::id_t get_current_device_id() void unique_spans() { - cuda::memory::host::unique_span data1(nullptr, 0); - cuda::memory::host::unique_span data2(nullptr, 0); + cuda::unique_span data1(nullptr, 0, cuda::detail_::default_span_deleter); + cuda::unique_span data2(nullptr, 0, cuda::detail_::default_span_deleter); data1 = std::move(data2); } diff --git a/src/cuda/api/detail/unique_span.hpp b/src/cuda/api/detail/unique_span.hpp index 8bd12fa0..f04b60d1 100644 --- a/src/cuda/api/detail/unique_span.hpp +++ b/src/cuda/api/detail/unique_span.hpp @@ -33,45 +33,46 @@ namespace cuda { * included in, C++14. It can be though of as a variation on std::array, with the the size and capacity * set dynamically, at construction time, rather than statically. * - * @note unique_span = unique_span+typing or span+ownership+non_null + * @note unique_span = unique_span+typing or span+ownership+non_null . Well, sort of, because this + * class supports complex construction-allocation and deletion patterns, through deleter objects. * - * @tparam T the type of individual elements in the unique_span + * @tparam T an individual element in the unique_span */ -template> +template class unique_span : public ::cuda::span { public: // span types using span_type = span; // Exposing some span type definitions, strictly for terseness // (they're all visible on the outside anyway) - using size_type = typename span::size_type; - using pointer = typename span::pointer; - using reference = typename span::reference; - using deleter_type = Deleter; + using size_type = typename span_type::size_type; + using pointer = typename span_type::pointer; + using reference = typename span_type::reference; + using deleter_type = void (*)(span_type); -public: // exposing span data members - using span::data; - using span::size; +public: // exposing span data members & adding our own + using span_type::data; + using span_type::size; + deleter_type deleter_; public: // constructors and destructor - constexpr unique_span() noexcept = default; + // Note: span_type's default ctor will create a {nullptr, 0} empty span. + constexpr unique_span() noexcept : span_type(), deleter_{nullptr} {} // Disable copy construction - as this class never allocates; unique_span(const unique_span&) = delete; // ... and also match other kinds of unique_span's, which may get converted into // a span and thus leak memory on construction! - template - unique_span(const unique_span&) = delete; + template + unique_span(const unique_span&) = delete; // Note: This template provides constructibility of unique_span from unique_span - template - unique_span(unique_span&& other) - : unique_span{ other.release() } + template + unique_span(unique_span&& other) : unique_span{ other.release(), other.deleter_ } { static_assert( - ::std::is_assignable>::value and - ::std::is_assignable::value, + ::std::is_assignable>::value, "Invalid unique_span initializer"); } @@ -81,25 +82,33 @@ class unique_span : public ::cuda::span { /// of a non-owned span when passing to a function, then trying to release that /// memory returning from it. ///@{ - explicit unique_span(span_type span) noexcept : span_type{span} { } - explicit unique_span(pointer data, size_type size) noexcept : unique_span{span_type{data, size}} { } + explicit unique_span(span_type span, deleter_type deleter) noexcept + : span_type{span}, deleter_(deleter) { } + explicit unique_span(pointer data, size_type size, deleter_type deleter) noexcept + : unique_span(span_type{data, size}, deleter) { } + explicit unique_span(memory::region_t region, deleter_type deleter) NOEXCEPT_IF_NDEBUG + : unique_span(span_type{region.start(), region.size() / sizeof(T)}, deleter) + { +#ifndef NDEBUG + if (sizeof(T) * size != region.size()) { + throw ::std::invalid_argument("Attempt to create a unique_span with a memory region which" + "does not comprise an integral number of areas of the element type size"); + } +#endif + } + ///@} - // Note: No constructor which also takes a deleter. We do not hold a deleter - // member - unlike unique_ptr's. Perhaps we should? - /** A move constructor. - * - * @note Moving is the only way a unique_span may have its @ref data_ field become - * null; the user is strongly assumed not to use the `unique_span` after moving from - * it. - */ - unique_span(unique_span&& other) noexcept : unique_span{ other.release() } { } + /// A move constructor. + /// + /// @TODO Can we drop this one in favor of the general move ctor? + unique_span(unique_span&& other) noexcept : unique_span(other.release(), other.deleter_) { } ~unique_span() noexcept { if (data() != nullptr) { - deleter_type{}(data()); + deleter_(*this); } #ifndef NDEBUG span_type::operator=(span_type{static_cast(nullptr), 0}); @@ -142,19 +151,41 @@ class unique_span : public ::cuda::span { * @note This is not marked nodiscard by the same argument as for std::unique_ptr; * see also @url https://stackoverflow.com/q/60535399/1593077 and * @url http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0600r1.pdf + * + * @note it is the caller's responsibility to ensure it has a copy of the deleter + * for the released span. */ span_type release() noexcept { span_type released { data(), size() }; span_type::operator=(span_type{ static_cast(nullptr), 0 }); + // Note that we are _not_ replacing deleter. return released; } }; // class unique_span +namespace detail_ { + +// @note you can't just use this always. Thus, only one of the make_ functions +// below uses it. +// +// @note that if a nullptr happens to be deleted - that's not a problem; +// it is supported by the delete operation(s). +template +inline void default_span_deleter(span sp) +{ + delete[] sp.data(); +} + +} // namespace detail_ + + /** * A parallel of ::std::make_unique_for_overwrite, for @ref unique_span's, i.e. which maintains * the number of elements allocated. * + * @param size the number of elements in the unique_span to be created. It may legitimately be 0. + * * @tparam T the type of elements in the allocated @ref unique_span. * * @param size The number of @tparam T elements to allocate @@ -162,7 +193,68 @@ class unique_span : public ::cuda::span { template unique_span make_unique_span(size_t size) { - return unique_span{ new T[size], size }; + // Note: It _is_ acceptable pass 0 here. + // See https://stackoverflow.com/q/1087042/1593077 + return unique_span(new T[size], size, detail_::default_span_deleter); +} + +namespace detail_ { + +template +inline void elementwise_destruct(span sp) +{ + for (auto& element : sp) { element.~T(); } +} + +// Use this structure to wrap a deleter which takes trivially-destructible/raw memory, +// to then pass on for use with a typed span +// +// Note: Ignores alignment. +template +struct deleter_with_elementwise_destruction { + template + void operator()(span sp) + { + elementwise_destruct(sp); + raw_deleter(static_cast(sp.data())); + } + RawDeleter raw_deleter; +}; + +template +void delete_with_elementwise_destruction(span sp, RawDeleter raw_deleter) +{ + elementwise_destruct(sp); + raw_deleter(static_cast(sp.data())); +} + +} // namespace detail_ + +/** + * The alternative to `std::generate` and similar functions, for the unique_span, seeing + * how its elements must be constructed as it is constructed. + * + * @param size the number of elements in the unique_span to be created. It may legitimately be 0. + * @param gen a function for generating new values for move-construction into the new unique_span + * + * @tparam T the type of elements in the allocated @ref unique_span. + * @tparam Generator A type invokable with the element index, to produce a T-constructor-argument + * + * @param size The number of @tparam T elements to allocate + */ +template +unique_span generate_unique_span(size_t size, Generator generator_by_index) noexcept +{ + // Q: Do I need to check the alignment here? Perhaps allocate more to ensure alignment? + auto result_data = static_cast(::operator new(sizeof(T) * size)); + for (size_t i = 0; i < size; i++) { + new(&result_data[i]) T(generator_by_index(i)); + } + auto deleter = [](span sp) { + auto raw_deleter = [](void* ptr) { ::operator delete(ptr); }; + detail_::delete_with_elementwise_destruction(sp, raw_deleter); + }; + return unique_span(result_data, size, deleter); } } // namespace cuda diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 7711e5f3..b0a73105 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -98,6 +98,18 @@ struct allocation_options { namespace detail_ { +template +inline void check_allocation_type() noexcept +{ + static_assert(::std::is_trivially_constructible::value, + "Attempt to create a typed buffer of a non-trivially-constructive type"); + static_assert(not CheckConstructibility or ::std::is_trivially_destructible::value, + "Attempt to create a typed buffer of a non-trivially-destructible type " + "without allowing for its destruction"); + static_assert(::std::is_trivially_copyable::value, + "Attempt to create a typed buffer of a non-trivially-copyable type"); +} + inline unsigned make_cuda_host_alloc_flags(allocation_options options) { return @@ -325,6 +337,7 @@ namespace detail_ { struct allocator { void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); } }; + struct deleter { void operator()(void* ptr) const { cuda::memory::device::free(ptr); } }; @@ -2446,25 +2459,63 @@ inline bool is_part_of_a_region_pair(const void* ptr) } // namespace mapped +namespace detail_ { +/** + * Create a unique_span without default construction, using raw-memory allocator + * and deleter gadgets. + * + * @note We allow this only for "convenient" types; see @ref detail_check_allocation_type + * + * @tparam T Element of the created unique_span + * @tparam UntypedAllocator can allocate untyped memory given a size + * @tparam UntypedDeleter can delete memory given a pointer (disregarding the type) + * + * @param size number of elements in the unique_span to be created + * @param raw_allocator a gadget for allocating untyped memory + * @param raw_deleter a gadget which can de-allocate/delete allocations by @p raw_allocator + * @return the newly-created unique_span + */ +template +unique_span make_convenient_type_unique_span(size_t size, RegionAllocator allocator) +{ + memory::detail_::check_allocation_type(); + auto deleter = [](span sp) { + return RawDeleter{}(sp.data()); + }; + region_t allocated_region = allocator(size * sizeof(T)); + return unique_span( + allocated_region.as_span(), // no constructor calls - trivial construction + deleter // no destructor calls - trivial destruction + ); +} +} // namespace detail_ namespace device { -/// A unique span of device-global memory -template -using unique_span = cuda::unique_span; - namespace detail_ { template unique_span make_unique_span(const context::handle_t context_handle, size_t size) { + CAW_SET_SCOPE_CONTEXT(context_handle); + return memory::detail_::make_convenient_type_unique_span(size, allocate_in_current_context); +/* + memory::detail_::check_allocation_type(); // Note: _Not_ asserting trivial-copy-constructibility here; so if you want to copy data - // to/from the device using this object - it's your own repsonsibility to ensure that's + // to/from the device using this object - it's your own responsibility to ensure that's // a valid thing to do. CAW_SET_SCOPE_CONTEXT(context_handle); - return unique_span{ allocate_in_current_context(size * sizeof(T)) }; + auto deleter = [](span sp) { + return detail_::deleter{}(sp.data()); + }; + auto allocated = allocate_in_current_context(size * sizeof(T)).as_span(); + return unique_span( + allocated, // no constructor calls - trivial construction + deleter // no destructor calls - trivial destruction + ); +*/ } } // namespace detail_ @@ -2510,24 +2561,20 @@ unique_span make_unique_span(size_t size); /// See @ref `device::make_unique_span(const context_t& context, size_t size)` template -inline device::unique_span make_unique_span(const context_t& context, size_t size) +inline unique_span make_unique_span(const context_t& context, size_t size) { return device::make_unique_span(context, size); } /// See @ref `device::make_unique_span(const context_t& context, size_t num_elements)` template -inline device::unique_span make_unique_span(const device_t& device, size_t size) +inline unique_span make_unique_span(const device_t& device, size_t size) { return device::make_unique_span(device, size); } namespace host { -/// A unique span of CUDA-driver-allocated, pinned host (=system) memory -template -using unique_span = cuda::unique_span; - /** * Allocate memory for a consecutive sequence of typed elements in system * (host-side) memory. @@ -2543,34 +2590,60 @@ using unique_span = cuda::unique_span; * similar to {@ref cuda::device::make_unique_region}, except that the allocation is * conceived as typed elements. * - * @note Typically, this is used for trivially-constructible elements, for which reason the - * non-construction of individual elements should not pose a problem. But - let the user - * beware, especially since this is host-side memory. + * @note We assume this memory is used for copying to or from device-side memory; hence, + * we constrain the type to be trivially constructible, destructible and copyable + * + * @note ignoring alignment */ template unique_span make_unique_span(size_t size) { - return unique_span{ allocate(size * sizeof(T)) }; + // Need this because of allocate takes more arguments and has default ones + auto allocator = [](size_t size) { return allocate(size); }; + return memory::detail_::make_convenient_type_unique_span(size, allocator); +/* + memory::detail_::check_allocation_type(); + // Note: _Not_ asserting trivial-copy-constructibility here; so if you want to copy data + // to/from the device using this object - it's your own responsibility to ensure that's + // a valid thing to do. + auto deleter = [](span sp) { + return detail_::deleter{}(sp.data()); + }; + auto allocated = allocate(size * sizeof(T)).as_span(); + return unique_span( + allocated, // no constructor calls - trivial destruction + deleter // no destructor calls - trivial destruction + ); +*/ } } // namespace host namespace managed { -/// A unique span of CUDA-driver-allocated managed memory -template -using unique_span = cuda::unique_span; - namespace detail_ { -template +template unique_span make_unique_span( const context::handle_t context_handle, - size_t size, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) + size_t size) { CAW_SET_SCOPE_CONTEXT(context_handle); - return unique_span{ allocate_in_current_context(size * sizeof(T), initial_visibility) }; + auto allocator = [](size_t size) { + return allocate_in_current_context(size, InitialVisibility); + }; + return memory::detail_::make_convenient_type_unique_span(size, allocator); +/* + memory::detail_::check_allocation_type(); + auto deleter = [](span sp) { + return detail_::deleter{}(sp.data()); + }; + auto allocated = allocate_in_current_context(size * sizeof(T), initial_visibility).as_span(); + return unique_span( + allocated, // no constructor calls - trivial destruction + deleter // no destructor calls - trivial destruction + ); +*/ } } // namespace detail_ @@ -2601,7 +2674,7 @@ template unique_span make_unique_span( const context_t& context, size_t size, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** * @copydoc make_unique_span(const context_t&, size_t) @@ -2612,7 +2685,7 @@ template unique_span make_unique_span( const device_t& device, size_t size, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** * @copydoc make_unique_span(const context_t&, size_t) @@ -2622,8 +2695,8 @@ unique_span make_unique_span( */ template unique_span make_unique_span( - size_t size, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + size_t size, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); } // namespace managed diff --git a/src/cuda/api/multi_wrapper_impls/unique_span.hpp b/src/cuda/api/multi_wrapper_impls/unique_span.hpp index 0c0ee18b..248f26de 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_span.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_span.hpp @@ -74,17 +74,24 @@ template unique_span make_unique_span( const context_t& context, size_t size, - initial_visibility_t initial_visibility) + initial_visibility_t initial_visibility) { CAW_SET_SCOPE_CONTEXT(context.handle()); - return unique_span{ detail_::allocate_in_current_context(size * sizeof(T), initial_visibility) }; + switch (initial_visibility) { + case initial_visibility_t::to_all_devices: + return detail_::make_unique_span(context.handle(), size); + case initial_visibility_t::to_supporters_of_concurrent_managed_access: + return detail_::make_unique_span(context.handle(), size); + default: + throw ::std::logic_error("Library not yet updated to support additional initial visibility values"); + } } template unique_span make_unique_span( const device_t& device, size_t size, - initial_visibility_t initial_visibility) + initial_visibility_t initial_visibility) { auto pc = device.primary_context(); return make_unique_span(pc, size, initial_visibility);