From 116313825a7845c384e53b509a0535b0dd13d954 Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Thu, 19 Aug 2021 23:16:36 +0300
Subject: [PATCH] Fixes #256: Replaced our library's `detail` namespaces with
 `detail_`, to avoid ambiguity with some of libcu++'s own `detail` namespaces.

---
 .../device_management.cpp | 6 +-
 .../by_runtime_api_module/error_handling.cu | 4 +-
 src/cuda/api/array.hpp | 6 +-
 src/cuda/api/current_device.hpp | 16 +-
 src/cuda/api/detail/device_properties.hpp | 30 ++--
 src/cuda/api/device.hpp | 20 +--
 src/cuda/api/devices.hpp | 8 +-
 src/cuda/api/error.hpp | 4 +-
 src/cuda/api/event.hpp | 30 ++--
 src/cuda/api/ipc.hpp | 10 +-
 src/cuda/api/kernel.hpp | 12 +-
 src/cuda/api/kernel_launch.hpp | 10 +-
 src/cuda/api/memory.hpp | 152 +++++++++---------
 src/cuda/api/multi_wrapper_impls.hpp | 122 +++++++-------
 src/cuda/api/pci_id.hpp | 4 +-
 src/cuda/api/pointer.hpp | 2 +-
 src/cuda/api/stream.hpp | 34 ++--
 src/cuda/api/texture_view.hpp | 10 +-
 src/cuda/api/unique_ptr.hpp | 50 +++--
 src/cuda/common/types.hpp | 4 +-
 src/cuda/nvtx/profiling.cpp | 6 +-
 21 files changed, 270 insertions(+), 270 deletions(-)

diff --git a/examples/by_runtime_api_module/device_management.cpp b/examples/by_runtime_api_module/device_management.cpp
index c11f8556..6ae5fb07 100644
--- a/examples/by_runtime_api_module/device_management.cpp
+++ b/examples/by_runtime_api_module/device_management.cpp
@@ -191,14 +191,14 @@ int main(int argc, char **argv)
 auto device_1 = cuda::device::get(1);
 cuda::device::current::set(device_0);
 assert(cuda::device::current::get() == device_0);
- assert(cuda::device::current::detail::get_id() == device_0.id());
+ assert(cuda::device::current::detail_::get_id() == device_0.id());
 cuda::device::current::set(device_1);
 assert(cuda::device::current::get() == device_1);
- assert(cuda::device::current::detail::get_id() == device_1.id());
+ assert(cuda::device::current::detail_::get_id() == device_1.id());
 }
 
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("Should not have been able to set the current device to "
 + std::to_string(device_count) + " since that's the device count, and "
 + "the maximum valid ID should be " + std::to_string(device_count - 1)
diff --git a/examples/by_runtime_api_module/error_handling.cu b/examples/by_runtime_api_module/error_handling.cu
index c0b04695..85377ac3 100644
--- a/examples/by_runtime_api_module/error_handling.cu
+++ b/examples/by_runtime_api_module/error_handling.cu
@@ -28,7 +28,7 @@ int main(int argc, char **argv)
 }
 
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("An exception should have been thrown");
 }
 catch(cuda::runtime_error& e) {
@@ -51,7 +51,7 @@ int main(int argc, char **argv)
 
 // clearing the error
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("An exception should have been thrown");
 }
 catch(cuda::runtime_error&) { }
diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp
index ae13896b..063a1edd 100644
--- a/src/cuda/api/array.hpp
+++ b/src/cuda/api/array.hpp
@@ -20,7 +20,7 @@ class device_t;
 
 namespace array {
 
-namespace detail {
+namespace detail_ {
 
 template
 cudaArray* allocate_on_current_device(array::dimensions_t<3> dimensions)
@@ -49,7 +49,7 @@ cudaArray* allocate(device_t& device, array::dimensions_t<3> dimensions);
 template
 cudaArray* allocate(device_t& device, array::dimensions_t<2> dimensions);
 
-} // namespace detail
+} //
namespace detail_ } // namespace array @@ -91,7 +91,7 @@ class array_t { * Creates and wraps a new CUDA array. */ array_t(device_t& device, array::dimensions_t dimensions) - : array_t(array::detail::allocate(device, dimensions), dimensions) {} + : array_t(array::detail_::allocate(device, dimensions), dimensions) {} array_t(const array_t& other) = delete; array_t(array_t&& other) noexcept : array_t(other.raw_array_, other.dimensions_) { diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp index e86643b8..00b0dfcb 100644 --- a/src/cuda/api/current_device.hpp +++ b/src/cuda/api/current_device.hpp @@ -37,7 +37,7 @@ namespace device { namespace current { -namespace detail { +namespace detail_ { /** * Obtains the numeric id of the device set as current for the CUDA Runtime API @@ -79,14 +79,14 @@ inline void set(const id_t* device_ids, size_t num_devices) } /** - * @note See the out-of-`detail::` version of this class. + * @note See the out-of-`detail_::` version of this class. */ class scoped_override_t { protected: static id_t replace(id_t new_device_id) { - id_t previous_device_id = device::current::detail::get_id(); - device::current::detail::set(new_device_id); + id_t previous_device_id = device::current::detail_::get_id(); + device::current::detail_::set(new_device_id); return previous_device_id; } @@ -102,12 +102,12 @@ class scoped_override_t { }; -} // namespace detail +} // namespace detail_ /** * Reset the CUDA Runtime API's current device to its default value - the default device */ -inline void set_to_default() { return detail::set(device::default_device_id); } +inline void set_to_default() { return detail_::set(device::default_device_id); } void set(device_t device); @@ -116,9 +116,9 @@ void set(device_t device); * what remains of the current scope, and changing it back to its previous value * when exiting the scope. */ -class scoped_override_t : private detail::scoped_override_t { +class scoped_override_t : private detail_::scoped_override_t { protected: - using parent = detail::scoped_override_t; + using parent = detail_::scoped_override_t; public: scoped_override_t(device_t& device); scoped_override_t(device_t&& device); diff --git a/src/cuda/api/detail/device_properties.hpp b/src/cuda/api/detail/device_properties.hpp index dc40225e..4c847fb9 100644 --- a/src/cuda/api/detail/device_properties.hpp +++ b/src/cuda/api/detail/device_properties.hpp @@ -1,5 +1,5 @@ /** - * @file detail/device_properties.hpp + * @file detail_/device_properties.hpp * * @brief Implementation of methods and helper functions for device-property-related classes. 
* @@ -47,7 +47,7 @@ inline constexpr bool operator >=(const compute_architecture_t& lhs, const compu return lhs.major > rhs.major; } -namespace detail { +namespace detail_ { constexpr const int invalid_architecture_return { 0 }; enum : memory::shared::size_t { KiB = 1024 }; @@ -55,7 +55,7 @@ enum : memory::shared::size_t { KiB = 1024 }; template inline T ensure_arch_property_validity(T v, const compute_architecture_t& arch) { - if (v == detail::invalid_architecture_return) { + if (v == detail_::invalid_architecture_return) { throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); } return v; @@ -147,30 +147,30 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_arch invalid_architecture_return; } -} // namespace detail +} // namespace detail_ inline const char* compute_architecture_t::name() const { - return detail::ensure_arch_property_validity(detail::architecture_name(*this), *this); + return detail_::ensure_arch_property_validity(detail_::architecture_name(*this), *this); } inline unsigned compute_architecture_t::max_in_flight_threads_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_in_flight_threads_per_processor(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_in_flight_threads_per_processor(*this), *this); } inline unsigned compute_architecture_t::max_shared_memory_per_block() const { - return detail::ensure_arch_property_validity(detail::max_shared_memory_per_block(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_shared_memory_per_block(*this), *this); } inline unsigned compute_architecture_t::max_resident_warps_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_resident_warps_per_processor(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_resident_warps_per_processor(*this), *this); } inline unsigned compute_architecture_t::max_warp_schedulings_per_processor_cycle() const { - return detail::ensure_arch_property_validity(detail::max_warp_schedulings_per_processor_cycle(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_warp_schedulings_per_processor_cycle(*this), *this); } // compute_capability_t-related @@ -224,7 +224,7 @@ inline constexpr compute_capability_t make_compute_capability(unsigned major, un return { {major}, minor }; } -namespace detail { +namespace detail_ { inline constexpr unsigned max_in_flight_threads_per_processor(const compute_capability_t& cc) { @@ -262,26 +262,26 @@ inline constexpr unsigned max_resident_warps_per_processor(const compute_capabil max_resident_warps_per_processor(cc.architecture); } -} // namespace detail +} // namespace detail_ inline unsigned compute_capability_t::max_in_flight_threads_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_in_flight_threads_per_processor(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_in_flight_threads_per_processor(*this), architecture); } inline unsigned compute_capability_t::max_warp_schedulings_per_processor_cycle() const { - return detail::ensure_arch_property_validity(detail::max_warp_schedulings_per_processor_cycle(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_warp_schedulings_per_processor_cycle(*this), architecture); } inline unsigned compute_capability_t::max_shared_memory_per_block() const { - return 
detail::ensure_arch_property_validity(detail::max_shared_memory_per_block(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_shared_memory_per_block(*this), architecture); } inline unsigned compute_capability_t::max_resident_warps_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_resident_warps_per_processor(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_resident_warps_per_processor(*this), architecture); } // properties_t-related diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 422fcc0e..22af5fa7 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -155,7 +155,7 @@ class device_t { using resource_id_t = cudaLimit; protected: // types - using scoped_setter_t = device::current::detail::scoped_override_t; + using scoped_setter_t = device::current::detail_::scoped_override_t; using flags_t = unsigned; ///@cond @@ -180,8 +180,8 @@ class device_t { protected: const device::id_t device_id_; - using deleter = memory::device::detail::deleter; - using allocator = memory::device::detail::allocator; + using deleter = memory::device::detail_::deleter; + using allocator = memory::device::detail_::allocator; public: ///@cond @@ -199,7 +199,7 @@ class device_t { memory::region_t allocate(size_t size_in_bytes) { scoped_setter_t set_device_for_this_scope(device_id_); - return memory::device::detail::allocate(size_in_bytes); + return memory::device::detail_::allocate(size_in_bytes); } // Perhaps drop this? it should really go into a managed namespace @@ -231,7 +231,7 @@ class device_t { initial_visibility_t::to_supporters_of_concurrent_managed_access) { scoped_setter_t set_device_for_this_scope(device_id_); - return cuda::memory::managed::detail::allocate(size_in_bytes, initial_visibility); + return cuda::memory::managed::detail_::allocate(size_in_bytes, initial_visibility); } /** @@ -688,7 +688,7 @@ class device_t { */ device_t& make_current() { - device::current::detail::set(id()); + device::current::detail_::set(id()); return *this; } @@ -754,12 +754,12 @@ namespace current { /** * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. */ -inline device_t get() { return device::get(detail::get_id()); } +inline device_t get() { return device::get(detail_::get_id()); } /** * Tells the CUDA runtime API to consider the specified device as the current one. 
*/ -inline void set(device_t device) { detail::set(device.id()); } +inline void set(device_t device) { detail_::set(device.id()); } } // namespace current @@ -771,7 +771,7 @@ inline void set(device_t device) { detail::set(device.id()); } */ inline device_t get(pci_location_t pci_id) { - auto resolved_id = device::detail::resolve_id(pci_id); + auto resolved_id = device::detail_::resolve_id(pci_id); return get(resolved_id); } @@ -797,7 +797,7 @@ inline device_t get(const ::std::string& pci_id_str) inline void synchronize(device_t& device) { auto device_id = device.id(); - device::current::detail::scoped_override_t set_device_for_this_scope(device_id); + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); auto status = cudaDeviceSynchronize(); throw_if_error(status, "Failed synchronizing " + ::std::to_string(device_id)); } diff --git a/src/cuda/api/devices.hpp b/src/cuda/api/devices.hpp index a767c032..d3dccef9 100644 --- a/src/cuda/api/devices.hpp +++ b/src/cuda/api/devices.hpp @@ -11,7 +11,7 @@ namespace cuda { -namespace detail { +namespace detail_ { // Note that while nothing constrains you from instantiating // this class many times, all instances are the same (as CUDA @@ -203,11 +203,11 @@ inline bool operator!= ( return not (lhs == rhs); } -} // namespace detail +} // namespace detail_ -inline detail::all_devices devices() +inline detail_::all_devices devices() { - return detail::all_devices(); + return detail_::all_devices(); } } // namespace cuda diff --git a/src/cuda/api/error.hpp b/src/cuda/api/error.hpp index b4f132fc..0986e44d 100644 --- a/src/cuda/api/error.hpp +++ b/src/cuda/api/error.hpp @@ -134,7 +134,7 @@ constexpr inline bool is_failure(status_t status) { return status != (status_t) */ inline ::std::string describe(status_t status) { return cudaGetErrorString(status); } -namespace detail { +namespace detail_ { template ::std::string as_hex(I x) @@ -165,7 +165,7 @@ inline ::std::string ptr_as_hex(const I* ptr) return as_hex((size_t) ptr); } -} // namespace detail +} // namespace detail_ /** * A (base?) class for exceptions raised by CUDA code; these errors are thrown by diff --git a/src/cuda/api/event.hpp b/src/cuda/api/event.hpp index d9b743fa..20ec3df4 100644 --- a/src/cuda/api/event.hpp +++ b/src/cuda/api/event.hpp @@ -28,7 +28,7 @@ class stream_t; namespace event { -namespace detail { +namespace detail_ { /** * Schedule a specified event to occur (= to fire) when all activities @@ -40,8 +40,8 @@ namespace detail { inline void enqueue(stream::id_t stream_id, id_t event_id) { auto status = cudaEventRecord(event_id, stream_id); cuda::throw_if_error(status, - "Failed recording event " + cuda::detail::ptr_as_hex(event_id) - + " on stream " + cuda::detail::ptr_as_hex(stream_id)); + "Failed recording event " + cuda::detail_::ptr_as_hex(event_id) + + " on stream " + cuda::detail_::ptr_as_hex(stream_id)); } constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timing, bool interprocess) @@ -52,7 +52,7 @@ constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timin | ( interprocess ? cudaEventInterprocess : 0 ); } -} // namespace detail +} // namespace detail_ } // namespace event @@ -62,7 +62,7 @@ class event_t; namespace event { -namespace detail { +namespace detail_ { /** * @brief Wrap an existing CUDA event in a @ref event_t instance. 
* @@ -81,7 +81,7 @@ event_t wrap( id_t event_id, bool take_ownership = false) noexcept; -} // namespace detail +} // namespace detail_ } // namespace event @@ -138,7 +138,7 @@ class event_t { if (status == cuda::status::success) return true; if (status == cuda::status::not_ready) return false; throw cuda::runtime_error(status, - "Could not determine whether event " + detail::ptr_as_hex(id_) + "Could not determine whether event " + detail_::ptr_as_hex(id_) + "has already occurred or not."); } @@ -160,7 +160,7 @@ class event_t { */ void record() { - event::detail::enqueue(stream::default_stream_id, id_); + event::detail_::enqueue(stream::default_stream_id, id_); } /** @@ -195,7 +195,7 @@ class event_t { public: // friendship - friend event_t event::detail::wrap(device::id_t device_id, event::id_t event_id, bool take_ownership) noexcept; + friend event_t event::detail_::wrap(device::id_t device_id, event::id_t event_id, bool take_ownership) noexcept; public: // constructors and destructor @@ -251,7 +251,7 @@ inline duration_t time_elapsed_between(const event_t& start, const event_t& end) return duration_t { elapsed_milliseconds }; } -namespace detail { +namespace detail_ { /** * Obtain a proxy object for an already-existing CUDA event @@ -304,12 +304,12 @@ inline event_t create( bool records_timing, bool interprocess) { - device::current::detail::scoped_override_t + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); + return detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); } -} // namespace detail +} // namespace detail_ /** * @brief creates a new execution stream on a device. 
@@ -344,10 +344,10 @@ inline void synchronize(const event_t& event)
 {
 auto device_id = event.device_id();
 auto event_id = event.id();
- device::current::detail::scoped_override_t device_for_this_scope(device_id);
+ device::current::detail_::scoped_override_t device_for_this_scope(device_id);
 auto status = cudaEventSynchronize(event_id);
 throw_if_error(status, "Failed synchronizing the event with id "
- + cuda::detail::ptr_as_hex(event_id) + " on " + ::std::to_string(device_id));
+ + cuda::detail_::ptr_as_hex(event_id) + " on " + ::std::to_string(device_id));
 }
 
 } // namespace cuda
diff --git a/src/cuda/api/ipc.hpp b/src/cuda/api/ipc.hpp
index b9ef90eb..f3bfe8c5 100644
--- a/src/cuda/api/ipc.hpp
+++ b/src/cuda/api/ipc.hpp
@@ -61,7 +61,7 @@ inline handle_t export_(void* device_ptr)
 {
 handle_t handle;
 auto status = cudaIpcGetMemHandle(&handle, device_ptr);
 cuda::throw_if_error(status,
- "Failed producing an IPC memory handle for device pointer " + cuda::detail::ptr_as_hex(device_ptr));
+ "Failed producing an IPC memory handle for device pointer " + cuda::detail_::ptr_as_hex(device_ptr));
 return handle;
 }
 
@@ -95,7 +95,7 @@ inline void unmap(void* ipc_mapped_ptr)
 {
 auto status = cudaIpcCloseMemHandle(ipc_mapped_ptr);
 cuda::throw_if_error(status,
 "Failed unmapping IPC memory mapped to " +
- cuda::detail::ptr_as_hex(ipc_mapped_ptr));
+ cuda::detail_::ptr_as_hex(ipc_mapped_ptr));
 }
 
 /**
@@ -158,14 +158,14 @@ namespace ipc {
 */
 using handle_t = cudaIpcEventHandle_t;
 
-namespace detail {
+namespace detail_ {
 
 inline handle_t export_(id_t event_id)
 {
 handle_t ipc_handle;
 auto status = cudaIpcGetEventHandle(&ipc_handle, event_id);
 cuda::throw_if_error(status,
- "Failed obtaining an IPC event handle for event " + cuda::detail::ptr_as_hex(event_id));
+ "Failed obtaining an IPC event handle for event " + cuda::detail_::ptr_as_hex(event_id));
 return ipc_handle;
 }
 
@@ -178,7 +178,7 @@ inline event::id_t import(const handle_t& handle)
 return event_id;
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * Enable use of an event which this process created by other processes
diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp
index 606e978e..a8440774 100644
--- a/src/cuda/api/kernel.hpp
+++ b/src/cuda/api/kernel.hpp
@@ -211,7 +211,7 @@ class kernel_t {
 
 namespace kernel {
 
-namespace detail {
+namespace detail_ {
 
 template struct bool_pack;
 template
 using all_true = ::std::is_same, bool_pack>;
 
 template
 struct raw_kernel_typegen {
 static_assert(all_true<::std::is_same>::value...>::value,
 "Invalid kernel parameter types" );
 using type = void(*)(KernelParameters...);
 // Why no decay? After all, CUDA kernels only take parameters by value, right?
- // Well, we're inside `detail::`. You should be careful to only instantiate this class with
+ // Well, we're inside `detail_::`. You should be careful to only instantiate this class with
 // nice simple types we can pass to CUDA kernels.
}; @@ -247,7 +247,7 @@ Kernel unwrap_inner(::std::false_type, Kernel raw_function) return raw_function; } -} // namespace detail +} // namespace detail_ /** * Obtain the raw function pointer of any type acceptable as a launchable kernel @@ -256,12 +256,12 @@ Kernel unwrap_inner(::std::false_type, Kernel raw_function) template auto unwrap(Kernel f) -> typename ::std::conditional< ::std::is_same::type, kernel_t>::value, - typename detail::raw_kernel_typegen::type, + typename detail_::raw_kernel_typegen::type, Kernel>::type { using got_a_kernel_t = ::std::integral_constant::type, kernel_t>::value>; - return detail::unwrap_inner(got_a_kernel_t{}, f); + return detail_::unwrap_inner(got_a_kernel_t{}, f); } } // namespace kernel diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index 44ac1a62..bd4aabdd 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -69,7 +69,7 @@ constexpr grid::dimensions_t single_block() { return 1; } */ constexpr grid::block_dimensions_t single_thread_per_block() { return 1; } -namespace detail { +namespace detail_ { template bool intrinsic_block_cooperation_value(const Kernel&) @@ -96,7 +96,7 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar collect_argument_addresses(collected_addresses + 1, ::std::forward(args)...); } -// Note: Unlike the non-detail functions - this one +// Note: Unlike the non-detail_ functions - this one // cannot handle type-erased kernel_t's. template inline void enqueue_launch( @@ -143,7 +143,7 @@ inline void enqueue_launch( // fill the argument array with our parameters. Yes, the use // of the two terms is confusing here and depends on how you // look at things. - detail::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); + detail_::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); auto status = cudaLaunchCooperativeKernel( (const void*) kernel_function, launch_configuration.grid_dimensions, @@ -162,7 +162,7 @@ inline void enqueue_launch( #endif -} // namespace detail +} // namespace detail_ /** * @brief Enqueues a kernel on a stream (=queue) on the current CUDA device. @@ -220,7 +220,7 @@ inline void enqueue_launch( KernelParameters&&... 
parameters) { enqueue_launch( - detail::intrinsic_block_cooperation_value(kernel_function), + detail_::intrinsic_block_cooperation_value(kernel_function), kernel_function, stream, launch_configuration, diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index cde74dce..73512bd7 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -51,7 +51,7 @@ class stream_t; */ namespace memory { -namespace detail { +namespace detail_ { template class base_region_t { @@ -72,13 +72,13 @@ template T* get() const { return start(); } }; -} // namespace detail +} // namespace detail_ -struct region_t : public detail::base_region_t { +struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; }; -struct const_region_t : public detail::base_region_t { +struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} }; @@ -122,7 +122,7 @@ struct allocation_options { cpu_write_combining write_combining; }; -namespace detail { +namespace detail_ { inline unsigned make_cuda_host_alloc_flags(allocation_options options) { @@ -131,7 +131,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) (options.write_combining == cpu_write_combining::with_wc ? cudaHostAllocWriteCombined : 0); } -} // namespace detail +} // namespace detail_ /** * @namespace mapped @@ -171,7 +171,7 @@ namespace memory { */ namespace device { -namespace detail { +namespace detail_ { /** * Allocate memory on current device @@ -191,22 +191,22 @@ inline region_t allocate(size_t num_bytes) throw_if_error(status, "Failed allocating " + ::std::to_string(num_bytes) + " bytes of global memory on CUDA device " + - ::std::to_string(cuda::device::current::detail::get_id())); + ::std::to_string(cuda::device::current::detail_::get_id())); return {allocated, num_bytes}; } inline region_t allocate(cuda::device::id_t device_id, size_t size_in_bytes) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id); - return memory::device::detail::allocate(size_in_bytes); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + return memory::device::detail_::allocate(size_in_bytes); } -} // namespace detail +} // namespace detail_ namespace async { -namespace detail { +namespace detail_ { /** * Allocate memory asynchronously on a specified stream. @@ -228,7 +228,7 @@ inline region_t allocate( throw_if_error(status, "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) + " bytes of global memory " - + " on stream " + cuda::detail::ptr_as_hex(stream_id) + + " on stream " + cuda::detail_::ptr_as_hex(stream_id) + " on CUDA device " + ::std::to_string(device_id)); return {allocated, num_bytes}; #else @@ -239,7 +239,7 @@ inline region_t allocate( #endif } -} // namespace detail +} // namespace detail_ /** * Schedule an allocation of device-side memory on a CUDA stream. 
@@ -267,7 +267,7 @@ inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); inline void free(void* ptr) { auto result = cudaFree(ptr); - throw_if_error(result, "Freeing device memory at 0x" + cuda::detail::ptr_as_hex(ptr)); + throw_if_error(result, "Freeing device memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { free(region.start()); } ///@} @@ -286,15 +286,15 @@ inline void free(region_t region) { free(region.start()); } */ inline region_t allocate(cuda::device_t device, size_t size_in_bytes); -namespace detail { +namespace detail_ { struct allocator { // Allocates on the current device! - void* operator()(size_t num_bytes) const { return detail::allocate(num_bytes).start(); } + void* operator()(size_t num_bytes) const { return detail_::allocate(num_bytes).start(); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::device::free(ptr); } }; -} // namespace detail +} // namespace detail_ /** * @brief Sets all bytes in a region of memory to a fixed value @@ -437,7 +437,7 @@ inline void set(region_t region, int byte_value) default: throw runtime_error( cuda::status::invalid_value, - "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail::ptr_as_hex(region.start()) + "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail_::ptr_as_hex(region.start()) ); } } @@ -466,7 +466,7 @@ inline void zero(T* ptr) zero(ptr, sizeof(T)); } -namespace detail { +namespace detail_ { /** * @note When constructing this class - destination first, source second @@ -532,7 +532,7 @@ inline void copy(array_t& destination, const T *source) template inline void copy(array_t& destination, const T *source) { - const auto copy_params = detail::copy_params_t(destination, source); + const auto copy_params = detail_::copy_params_t(destination, source); auto result = cudaMemcpy3D(©_params); throw_if_error(result, "Synchronously copying into a 3-dimensional CUDA array"); } @@ -559,12 +559,12 @@ inline void copy(T *destination, const array_t& source) template inline void copy(T* destination, const array_t& source) { - const auto copy_params = detail::copy_params_t(destination, source); + const auto copy_params = detail_::copy_params_t(destination, source); auto result = cudaMemcpy3D(©_params); throw_if_error(result, "Synchronously copying from a 3-dimensional CUDA array"); } -} // namespace detail +} // namespace detail_ /** @@ -580,7 +580,7 @@ inline void copy(T* destination, const array_t& source) template inline void copy(array_t& destination, const T* source) { - detail::copy(destination, source); + detail_::copy(destination, source); } /** @@ -596,7 +596,7 @@ inline void copy(array_t& destination, const T* source) template inline void copy(T* destination, const array_t& source) { - detail::copy(destination, source); + detail_::copy(destination, source); } /** @@ -615,7 +615,7 @@ inline void copy_single(T* destination, const T* source) namespace async { -namespace detail { +namespace detail_ { /** * Asynchronously copies data between memory spaces or within a memory space. 
@@ -643,23 +643,23 @@ inline void copy(void* destination, const void* source, size_t num_bytes, stream // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string - throw_if_error(result, "Scheduling a memory copy on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template void copy(array_t& destination, const T* source, stream::id_t stream_id) { - const auto copy_params = memory::detail::copy_params_t(destination, source); + const auto copy_params = memory::detail_::copy_params_t(destination, source); auto result = cudaMemcpy3DAsync(©_params, stream_id); - throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template void copy(T* destination, const array_t& source, stream::id_t stream_id) { - const auto copy_params = memory::detail::copy_params_t(destination, source); + const auto copy_params = memory::detail_::copy_params_t(destination, source); auto result = cudaMemcpy3DAsync(©_params, stream_id); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template @@ -679,7 +679,7 @@ void copy(array_t& destination, const T* source, stream::id_t stream_id) dimensions.height, cudaMemcpyDefault, stream_id); - throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template @@ -699,7 +699,7 @@ void copy(T* destination, const array_t& source, cuda::stream::id_t stream dimensions.height, cudaMemcpyDefault, stream_id); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } /** @@ -719,7 +719,7 @@ inline void copy_single(T& destination, const T& source, stream::id_t stream_id) copy(&destination, &source, sizeof(T), stream_id); } -} // namespace detail +} // namespace detail_ /** * Asynchronously copies data between memory spaces or within a memory space. @@ -841,7 +841,7 @@ namespace device { namespace async { -namespace detail { +namespace detail_ { inline void set(void* start, int byte_value, size_t num_bytes, stream::id_t stream_id) { @@ -866,7 +866,7 @@ inline void zero(region_t region, stream::id_t stream_id) zero(region.start(), region.size(), stream_id); } -} // namespace detail +} // namespace detail_ /** * Asynchronously sets all bytes in a stretch of memory to a single value @@ -937,7 +937,7 @@ inline void* allocate( allocation_options options) { void* allocated = nullptr; - auto flags = cuda::memory::detail::make_cuda_host_alloc_flags(options); + auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options); auto result = cudaHostAlloc(&allocated, size_in_bytes, flags); if (is_success(result) && allocated == nullptr) { // Can this even happen? 
hopefully not @@ -966,10 +966,10 @@ inline void* allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) inline void free(void* host_ptr) { auto result = cudaFreeHost(host_ptr); - throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail::ptr_as_hex(host_ptr)); + throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail_::ptr_as_hex(host_ptr)); } -namespace detail { +namespace detail_ { struct allocator { void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes); } @@ -995,7 +995,7 @@ inline void register_(const void *ptr, size_t size, unsigned flags) auto result = cudaHostRegister(const_cast(ptr), size, flags); throw_if_error(result, "Could not register and page-lock the region of " + ::std::to_string(size) + - " bytes of host memory at " + cuda::detail::ptr_as_hex(ptr)); + " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr)); } inline void register_(const_region_t region, unsigned flags) @@ -1003,7 +1003,7 @@ inline void register_(const_region_t region, unsigned flags) register_(region.start(), region.size(), flags); } -} // namespace detail +} // namespace detail_ /** * Whether or not the registration of the host-side pointer should map @@ -1042,7 +1042,7 @@ inline void register_(const void *ptr, size_t size, bool map_into_device_space, bool make_device_side_accesible_to_all) { - detail::register_( + detail_::register_( ptr, size, (register_mapped_io_space ? cudaHostRegisterIoMemory : 0) | (map_into_device_space ? cudaHostRegisterMapped : 0) @@ -1067,7 +1067,7 @@ inline void register_( inline void register_(void const *ptr, size_t size) { - detail::register_(ptr, size, cudaHostRegisterDefault); + detail_::register_(ptr, size, cudaHostRegisterDefault); } inline void register_(const_region_t region) @@ -1140,7 +1140,7 @@ namespace managed { class const_region_t; -namespace detail { +namespace detail_ { template inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute); @@ -1149,8 +1149,8 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute); template -struct base_region_t : public memory::detail::base_region_t { - using parent = memory::detail::base_region_t; +struct base_region_t : public memory::detail_::base_region_t { + using parent = memory::detail_::base_region_t; using parent::parent; bool is_read_mostly() const @@ -1165,7 +1165,7 @@ struct base_region_t : public memory::detail::base_region_t { void undesignate_read_mostly() const { - detail::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); + detail_::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); } device_t preferred_location() const; @@ -1175,16 +1175,16 @@ struct base_region_t : public memory::detail::base_region_t { // TODO: Consider using a field proxy }; -} // namespace detail +} // namespace detail_ -struct region_t : public detail::base_region_t { +struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; operator memory::region_t() { return memory::region_t{ start(), size() }; } }; -struct const_region_t : public detail::base_region_t { +struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; - const_region_t(const region_t& r) : detail::base_region_t(r.start(), r.size()) {} + const_region_t(const region_t& r) : detail_::base_region_t(r.start(), r.size()) {} }; void 
advise_expected_access_by(managed::const_region_t region, device_t& device); @@ -1193,7 +1193,7 @@ void advise_no_access_expected_by(managed::const_region_t region, device_t& devi template > typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); -namespace detail { +namespace detail_ { template inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute) @@ -1202,7 +1202,7 @@ inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRange auto result = cudaMemRangeGetAttribute( &attribute_value, sizeof(attribute_value), attribute, region.start(), region.size()); throw_if_error(result, - "Obtaining an attribute for a managed memory range at " + cuda::detail::ptr_as_hex(region.start())); + "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); return static_cast(attribute_value); } @@ -1210,7 +1210,7 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor { auto result = cudaMemAdvise(region.start(), region.size(), advice, device_id); throw_if_error(result, - "Setting an attribute for a managed memory range at " + cuda::detail::ptr_as_hex(region.start())); + "Setting an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); } inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute) @@ -1219,7 +1219,7 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor set_scalar_range_attribute(region, attribute, ignored_device_index); } -} // namespace detail +} // namespace detail_ enum class initial_visibility_t { @@ -1235,7 +1235,7 @@ enum class attachment_t { }; -namespace detail { +namespace detail_ { inline region_t allocate( size_t num_bytes, @@ -1263,7 +1263,7 @@ inline region_t allocate( inline void free(void* ptr) { auto result = cudaFree(ptr); - throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail::ptr_as_hex(ptr)); + throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { @@ -1276,7 +1276,7 @@ struct allocator { // Allocates on the current device! 
void* operator()(size_t num_bytes) const
 {
- return detail::allocate(num_bytes, InitialVisibility).start();
+ return detail_::allocate(num_bytes, InitialVisibility).start();
 }
 };
 struct deleter {
@@ -1288,11 +1288,11 @@ inline region_t allocate(
 size_t num_bytes,
 initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
- cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
- return detail::allocate(num_bytes, initial_visibility);
+ cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+ return detail_::allocate(num_bytes, initial_visibility);
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * @brief Allocate a region of managed memory, accessible with the same
@@ -1322,7 +1322,7 @@ inline void free(void* managed_ptr)
 {
 auto result = cudaFree(managed_ptr);
 throw_if_error(result,
 "Freeing managed memory (host and device regions) at address 0x"
- + cuda::detail::ptr_as_hex(managed_ptr));
+ + cuda::detail_::ptr_as_hex(managed_ptr));
 }
 
 inline void free(region_t region)
@@ -1346,14 +1346,14 @@ inline void set(const_region_t region, device_inspecific_kind_t advice)
 cuda::device::id_t ignored_device_index{};
 auto result = cudaMemAdvise(region.start(), region.size(), (cudaMemoryAdvise) advice, ignored_device_index);
 throw_if_error(result,
- "Setting advice on a (managed) memory region at" + cuda::detail::ptr_as_hex(region.start()));
+ "Setting advice on a (managed) memory region at" + cuda::detail_::ptr_as_hex(region.start()));
 }
 
 } // namespace advice
 
 namespace async {
 
-namespace detail {
+namespace detail_ {
 
 inline void prefetch(
 const_region_t region,
 cuda::device::id_t destination,
 stream::id_t stream_id)
 {
 auto result = cudaMemPrefetchAsync(region.start(), region.size(), destination, stream_id);
 throw_if_error(result,
 "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
- + cuda::detail::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination));
+ + cuda::detail_::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination));
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * @brief Prefetches a region of managed memory to a specific device, so
@@ -1393,7 +1393,7 @@ inline void prefetch_to_host(const_region_t managed_region)
 // device indicator is used.
throw_if_error(result, "Prefetching " + ::std::to_string(managed_region.size()) + " bytes of managed memory at address " - + cuda::detail::ptr_as_hex(managed_region.start()) + " into host memory"); + + cuda::detail_::ptr_as_hex(managed_region.start()) + " into host memory"); } } // namespace async @@ -1417,11 +1417,11 @@ inline T* device_side_pointer_for(T* host_memory_ptr) get_device_pointer_flags); throw_if_error(status, "Failed obtaining the device-side pointer for host-memory pointer " - + cuda::detail::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); + + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); return device_side_ptr; } -namespace detail { +namespace detail_ { /** * Allocates a mapped pair of memory regions - on the current device @@ -1439,7 +1439,7 @@ inline region_pair allocate( region_pair allocated; allocated.size_in_bytes = size_in_bytes; auto flags = cudaHostAllocMapped & - cuda::memory::detail::make_cuda_host_alloc_flags(options); + cuda::memory::detail_::make_cuda_host_alloc_flags(options); // Note: the typed cudaHostAlloc also takes its size in bytes, apparently, // not in number of elements auto status = cudaHostAlloc(&allocated.host_side, size_in_bytes, flags); @@ -1449,7 +1449,7 @@ inline region_pair allocate( } throw_if_error(status, "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes) - + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail::get_id())); + + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail_::get_id())); allocated.device_side = device_side_pointer_for(allocated.host_side); return allocated; } @@ -1469,11 +1469,11 @@ inline region_pair allocate( size_t size_in_bytes, allocation_options options) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id); - return detail::allocate(size_in_bytes, options); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + return detail_::allocate(size_in_bytes, options); } -} // namespace detail +} // namespace detail_ /** * Allocate a pair of memory regions, on the host and on the device, mapped to each other so @@ -1564,9 +1564,9 @@ inline region_t locate(symbol_t symbol) void* start; size_t symbol_size; auto api_call_result = cudaGetSymbolAddress(&start, symbol.handle); - throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail::ptr_as_hex(symbol.handle)); + throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); api_call_result = cudaGetSymbolSize(&symbol_size, symbol.handle); - throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail::ptr_as_hex(symbol.handle)); + throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); return {start, symbol_size}; } diff --git a/src/cuda/api/multi_wrapper_impls.hpp b/src/cuda/api/multi_wrapper_impls.hpp index 6e2a28fb..5a82265c 100644 --- a/src/cuda/api/multi_wrapper_impls.hpp +++ b/src/cuda/api/multi_wrapper_impls.hpp @@ -26,23 +26,23 @@ namespace cuda { namespace array { -namespace detail { +namespace detail_ { template inline cudaArray* allocate(device_t& device, array::dimensions_t<3> dimensions) { - device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); + 
device::current::detail::scoped_override_t set_device_for_this_scope(device.id());
+
device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); return allocate_on_current_device(dimensions); } template inline cudaArray* allocate(device_t& device, array::dimensions_t<2> dimensions) { - device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); + device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); return allocate_on_current_device(dimensions); } -} // namespace detail +} // namespace detail_ } // namespace array @@ -57,19 +57,19 @@ inline event_t create( auto device_id = device.id(); // Yes, we need the ID explicitly even on the current device, // because event_t's don't have an implicit device ID. - return event::detail::create(device_id , uses_blocking_sync, records_timing, interprocess); + return event::detail_::create(device_id , uses_blocking_sync, records_timing, interprocess); } namespace ipc { inline handle_t export_(event_t& event) { - return detail::export_(event.id()); + return detail_::export_(event.id()); } inline event_t import(device_t& device, const handle_t& handle) { - return event::detail::wrap(device.id(), detail::import(handle), do_not_take_ownership); + return event::detail_::wrap(device.id(), detail_::import(handle), do_not_take_ownership); } } // namespace ipc @@ -81,7 +81,7 @@ inline event_t import(device_t& device, const handle_t& handle) inline stream_t device_t::default_stream() const noexcept { - return stream::detail::wrap(id(), stream::default_stream_id); + return stream::detail_::wrap(id(), stream::default_stream_id); } inline stream_t @@ -89,8 +89,8 @@ device_t::create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority) { - device::current::detail::scoped_override_t set_device_for_this_scope(id_); - return stream::detail::wrap(id(), stream::detail::create_on_current_device( + device::current::detail_::scoped_override_t set_device_for_this_scope(id_); + return stream::detail_::wrap(id(), stream::detail_::create_on_current_device( will_synchronize_with_default_stream, priority), do_take_ownership); } @@ -104,9 +104,9 @@ inline scoped_override_t::scoped_override_t(device_t&& device) : parent(device.i } // namespace device -namespace detail { +namespace detail_ { -} // namespace detail +} // namespace detail_ template void device_t::launch( @@ -140,7 +140,7 @@ inline void event_t::record(const stream_t& stream) // Note: // TODO: Perhaps check the device ID here, rather than // have the Runtime API call fail? 
- event::detail::enqueue(stream.id(), id_); + event::detail_::enqueue(stream.id(), id_); } inline void event_t::fire(const stream_t& stream) @@ -160,15 +160,15 @@ inline device_t stream_t::device() const noexcept inline void stream_t::enqueue_t::wait(const event_t& event_) { auto device_id = associated_stream.device_id_; - device::current::detail::scoped_override_t set_device_for_this_context(device_id); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id); // Required by the CUDA runtime API; the flags value is currently unused constexpr const unsigned int flags = 0; auto status = cudaStreamWaitEvent(associated_stream.id_, event_.id(), flags); throw_if_error(status, - ::std::string("Failed scheduling a wait for event ") + cuda::detail::ptr_as_hex(event_.id()) - + " on stream " + cuda::detail::ptr_as_hex(associated_stream.id_) + ::std::string("Failed scheduling a wait for event ") + cuda::detail_::ptr_as_hex(event_.id()) + + " on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_) + " on CUDA device " + ::std::to_string(device_id)); } @@ -181,8 +181,8 @@ inline event_t& stream_t::enqueue_t::event(event_t& existing_event) + ::std::to_string(existing_event.device_id()) + " to be triggered by a stream on CUDA device " + ::std::to_string(device_id ) ); } - device::current::detail::scoped_override_t set_device_for_this_context(device_id); - stream::detail::record_event_on_current_device(device_id, associated_stream.id_, existing_event.id()); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id); + stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, existing_event.id()); return existing_event; } @@ -192,11 +192,11 @@ inline event_t stream_t::enqueue_t::event( bool interprocess) { auto device_id = associated_stream.device_id_; - device::current::detail::scoped_override_t set_device_for_this_scope(device_id); + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - event_t ev { event::detail::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; + event_t ev { event::detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; // Note that, at this point, the event is not associated with this enqueue object's stream. 
- stream::detail::record_event_on_current_device(device_id, associated_stream.id_, ev.id()); + stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, ev.id()); return ev; } @@ -212,25 +212,25 @@ namespace async { inline void copy(void *destination, const void *source, size_t num_bytes, const stream_t& stream) { - detail::copy(destination, source, num_bytes, stream.id()); + detail_::copy(destination, source, num_bytes, stream.id()); } template inline void copy(array_t& destination, const T* source, const stream_t& stream) { - detail::copy(destination, source, stream.id()); + detail_::copy(destination, source, stream.id()); } template inline void copy(T* destination, const array_t& source, const stream_t& stream) { - detail::copy(destination, source, stream.id()); + detail_::copy(destination, source, stream.id()); } template inline void copy_single(T& destination, const T& source, const stream_t& stream) { - detail::copy_single(&destination, &source, sizeof(T), stream.id()); + detail_::copy_single(&destination, &source, sizeof(T), stream.id()); } } // namespace async @@ -239,24 +239,24 @@ namespace device { inline region_t allocate(cuda::device_t device, size_t size_in_bytes) { - return detail::allocate(device.id(), size_in_bytes); + return detail_::allocate(device.id(), size_in_bytes); } namespace async { inline region_t allocate(const stream_t& stream, size_t size_in_bytes) { - return detail::allocate(stream.device().id(), stream.id(), size_in_bytes); + return detail_::allocate(stream.device().id(), stream.id(), size_in_bytes); } inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) { - detail::set(start, byte_value, num_bytes, stream.id()); + detail_::set(start, byte_value, num_bytes, stream.id()); } inline void zero(void* start, size_t num_bytes, const stream_t& stream) { - detail::zero(start, num_bytes, stream.id()); + detail_::zero(start, num_bytes, stream.id()); } } // namespace async @@ -275,8 +275,8 @@ template inline unique_ptr make_unique(device_t device, size_t num_elements) { static_assert(::std::is_array::value, "make_unique(device, num_elements) can only be invoked for T being an array type, T = U[]"); - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); - return cuda::memory::detail::make_unique(num_elements); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); + return cuda::memory::detail_::make_unique(num_elements); } /** @@ -291,46 +291,46 @@ inline unique_ptr make_unique(device_t device, size_t num_elements) template inline unique_ptr make_unique(device_t device) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); - return cuda::memory::detail::make_unique(); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); + return cuda::memory::detail_::make_unique(); } } // namespace device namespace managed { -namespace detail { +namespace detail_ { template inline device_t base_region_t::preferred_location() const { - auto device_id = detail::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); + auto device_id = detail_::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); return cuda::device::get(device_id); } template inline void base_region_t::set_preferred_location(device_t& device) const { - detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id()); + 
detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id());
+
detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id());
 }
 
 template
 inline void base_region_t::clear_preferred_location() const
 {
- detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation);
+ detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation);
 }
 
-} // namespace detail
+} // namespace detail_
 
 inline void advise_expected_access_by(const_region_t region, device_t& device)
 {
- detail::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id());
+ detail_::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id());
 }
 
 inline void advise_no_access_expected_by(const_region_t region, device_t& device)
 {
- detail::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id());
+ detail_::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id());
 }
 
 template
@@ -346,7 +346,7 @@ ::std::vector accessors(const_region_t region, const Alloca
 auto status = cudaMemRangeGetAttribute(
 device_ids, sizeof(device_t) * devices.size(),
 cudaMemRangeAttributeAccessedBy, region.start(), region.size());
- throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail::ptr_as_hex(region.start()));
+ throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
 auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId);
 // We may have gotten fewer results than the set of all devices, so let's whittle that down
 
@@ -364,7 +364,7 @@ inline void prefetch(
 cuda::device_t destination,
 const stream_t& stream)
 {
- detail::prefetch(region, destination.id(), stream.id());
+ detail_::prefetch(region, destination.id(), stream.id());
 }
 
 } // namespace async
 
@@ -375,7 +375,7 @@ inline region_t allocate(
 size_t num_bytes,
 initial_visibility_t initial_visibility)
 {
- return detail::allocate(device.id(), num_bytes, initial_visibility);
+ return detail_::allocate(device.id(), num_bytes, initial_visibility);
 }
 
 } // namespace managed
 
@@ -387,7 +387,7 @@ inline region_pair allocate(
 size_t size_in_bytes,
 allocation_options options)
 {
- return cuda::memory::mapped::detail::allocate(device.id(), size_in_bytes, options);
+ return cuda::memory::mapped::detail_::allocate(device.id(), size_in_bytes, options);
 }
 
 } // namespace mapped
 
@@ -398,14 +398,14 @@ inline region_pair allocate(
 
 inline void kernel_t::set_attribute(cudaFuncAttribute attribute, int value)
 {
- device::current::detail::scoped_override_t set_device_for_this_context(device_id_);
+ device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
 auto result = cudaFuncSetAttribute(ptr_, attribute, value);
 throw_if_error(result,
 "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value));
 }
 
 inline void kernel_t::opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel)
 {
- device::current::detail::scoped_override_t set_device_for_this_context(device_id_);
+ device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
 #if CUDART_VERSION >= 9000
 auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributeMaxDynamicSharedMemorySize, amount_required_by_kernel);
 throw_if_error(result,
@@ -439,7 +439,7 @@ kernel_t::min_grid_params_for_max_occupancy(
 disable_caching_override ?
cudaOccupancyDisableCachingOverride : cudaOccupancyDefault ); throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail::ptr_as_hex(ptr_) + + "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) + " on device " + ::std::to_string(device_id_) + "."); return { min_grid_size_in_blocks, block_size }; #endif @@ -464,7 +464,7 @@ kernel_t::min_grid_params_for_max_occupancy( disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault ); throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail::ptr_as_hex(ptr_) + + "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) + " on device " + ::std::to_string(device_id_) + "."); return { min_grid_size_in_blocks, block_size }; #endif @@ -473,7 +473,7 @@ kernel_t::min_grid_params_for_max_occupancy( inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_percentage) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); if (shared_mem_percentage > 100) { throw ::std::invalid_argument("Percentage value can't exceed 100"); } @@ -487,7 +487,7 @@ inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_perc inline kernel::attributes_t kernel_t::attributes() const { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); kernel::attributes_t function_attributes; auto status = cudaFuncGetAttributes(&function_attributes, ptr_); throw_if_error(status, "Failed obtaining attributes for a CUDA device function"); @@ -496,7 +496,7 @@ inline kernel::attributes_t kernel_t::attributes() const inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference); throw_if_error(result, "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " @@ -507,7 +507,7 @@ inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t pr inline void kernel_t::set_shared_memory_bank_size( multiprocessor_shared_memory_bank_size_option_t config) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config); throw_if_error(result); } @@ -517,7 +517,7 @@ inline grid::dimension_t kernel_t::maximum_active_blocks_per_multiprocessor( memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); int result; unsigned int flags = disable_caching_override ? 
 		cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
@@ -541,20 +541,20 @@ inline stream_t create(
 	bool synchronizes_with_default_stream,
 	priority_t priority)
 {
-	return detail::create(device.id(), synchronizes_with_default_stream, priority);
+	return detail_::create(device.id(), synchronizes_with_default_stream, priority);
 }

-namespace detail {
+namespace detail_ {

 inline void record_event_on_current_device(device::id_t device_id, stream::id_t stream_id, event::id_t event_id)
 {
 	auto status = cudaEventRecord(event_id, stream_id);
 	throw_if_error(status,
-		"Failed scheduling event " + cuda::detail::ptr_as_hex(event_id) + " to occur"
-		+ " on stream " + cuda::detail::ptr_as_hex(stream_id)
+		"Failed scheduling event " + cuda::detail_::ptr_as_hex(event_id) + " to occur"
+		+ " on stream " + cuda::detail_::ptr_as_hex(stream_id)
 		+ " on CUDA device " + ::std::to_string(device_id));
 }

-} // namespace detail
+} // namespace detail_

 } // namespace stream

@@ -569,7 +569,7 @@ inline void enqueue_launch(
 	auto unwrapped_kernel_function =
 		kernel::unwrap<
 			Kernel,
-			detail::kernel_parameter_decay_t<KernelParameters>...
+			detail_::kernel_parameter_decay_t<KernelParameters>...
 		>(kernel_function);
 	// Note: This helper function is necessary since we may have gotten a
 	// kernel_t as Kernel, which is type-erased - in
 	// we massage those a bit.

 #ifdef DEBUG
-	assert(thread_block_cooperation == detail::intrinsic_block_cooperation_value,
+	assert(thread_block_cooperation == detail_::intrinsic_block_cooperation_value,
 		"mismatched indications of whether thread block should be able to cooperate for a kernel");
 #endif
-	detail::enqueue_launch(
+	detail_::enqueue_launch(
 		thread_block_cooperation,
 		unwrapped_kernel_function,
 		stream.id(),
diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp
index 79d1f72a..8a88a266 100644
--- a/src/cuda/api/pci_id.hpp
+++ b/src/cuda/api/pci_id.hpp
@@ -40,7 +40,7 @@ struct pci_location_t {
 	// In lieu of making this class a variant with 3 type combinations.
 };

-namespace detail {
+namespace detail_ {

 /**
  * Obtain a CUDA device id for a PCIe bus device
@@ -58,7 +58,7 @@ inline id_t resolve_id(pci_location_t pci_id)
 	return cuda_device_id;
 }

-} // namespace detail
+} // namespace detail_

 } // namespace device

diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp
index 1bfa5367..1e8fb1f6 100644
--- a/src/cuda/api/pointer.hpp
+++ b/src/cuda/api/pointer.hpp
@@ -110,7 +110,7 @@ class pointer_t {
 	{
 		pointer::attributes_t the_attributes;
 		auto status = cudaPointerGetAttributes (&the_attributes, ptr_);
-		throw_if_error(status, "Failed obtaining attributes of pointer " + cuda::detail::ptr_as_hex(ptr_));
+		throw_if_error(status, "Failed obtaining attributes of pointer " + cuda::detail_::ptr_as_hex(ptr_));
 		return the_attributes;
 	}
 	device_t device() const noexcept;

diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 510ab554..2dc7fdfe 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -38,7 +38,7 @@ enum : bool {
 	async = no_implicit_synchronization_with_default_stream,
 };

-namespace detail {
+namespace detail_ {

 inline id_t create_on_current_device(
 	bool synchronizes_with_default_stream,
@@ -51,7 +51,7 @@ inline id_t create_on_current_device(
 	auto status = cudaStreamCreateWithPriority(&new_stream_id, flags, priority);
 	cuda::throw_if_error(status,
 		::std::string("Failed creating a new stream on CUDA device ")
-		+ ::std::to_string(device::current::detail::get_id()));
+		+ ::std::to_string(device::current::detail_::get_id()));
 	return new_stream_id;
 }
@@ -71,7 +71,7 @@ inline id_t create_on_current_device(
  */
 inline bool is_associated_with(stream::id_t stream_id, device::id_t device_id)
 {
-	device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
+	device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
 	auto status = cudaStreamQuery(stream_id);
 	switch(status) {
 	case cudaSuccess:
@@ -103,7 +103,7 @@ inline device::id_t associated_device(stream::id_t stream_id)
 		if (is_associated_with(stream_id, device_index)) { return device_index; }
 	}
 	throw ::std::runtime_error(
-		"Could not find any device associated with stream " + cuda::detail::ptr_as_hex(stream_id));
+		"Could not find any device associated with stream " + cuda::detail_::ptr_as_hex(stream_id));
 }

 inline void record_event_on_current_device(device::id_t current_device_id, stream::id_t stream_id, event::id_t event_id);
@@ -120,7 +120,7 @@ stream_t wrap(
 	id_t stream_id,
 	bool take_ownership = false) noexcept;

-} // namespace detail
+} // namespace detail_

 } // namespace stream
@@ -146,7 +146,7 @@ class stream_t {
 	};

 protected: // type definitions
-	using device_setter_type = device::current::detail::scoped_override_t;
+	using device_setter_type = device::current::detail_::scoped_override_t;

 public: // const getters
@@ -203,7 +203,7 @@ class stream_t {
 		default:
 			throw(cuda::runtime_error(status,
 				"unexpected status returned from cudaStreamQuery() for stream "
-				+ detail::ptr_as_hex(id_)));
+				+ detail_::ptr_as_hex(id_)));
 		}
 	}
@@ -350,7 +350,7 @@ class stream_t {
 	{
 		// It is not necessary to make the device current, according to:
 		// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-and-event-behavior
-		memory::async::detail::copy(destination, source, num_bytes, associated_stream.id_);
+		memory::async::detail_::copy(destination, source, num_bytes, associated_stream.id_);
 	}

 	void copy(void* destination, memory::const_region_t source, size_t num_bytes)
@@ -386,7 +386,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		device_setter_type set_device_for_this_scope(associated_stream.device_id_);
-		memory::device::async::detail::set(destination, byte_value, num_bytes, associated_stream.id_);
+		memory::device::async::detail_::set(destination, byte_value, num_bytes, associated_stream.id_);
 	}

 	/**
@@ -404,7 +404,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		device_setter_type set_device_for_this_scope(associated_stream.device_id_);
-		memory::device::async::detail::zero(destination, num_bytes, associated_stream.id_);
+		memory::device::async::detail_::zero(destination, num_bytes, associated_stream.id_);
 	}

 	/**
@@ -480,7 +480,7 @@ class stream_t {
 		throw_if_error(status,
 			::std::string("Failed scheduling a callback to be launched")
-			+ " on stream " + cuda::detail::ptr_as_hex(associated_stream.id_)
+			+ " on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_)
 			+ " on CUDA device " + ::std::to_string(associated_stream.device_id_));
 	}
@@ -534,7 +534,7 @@ class stream_t {
 			associated_stream.id_, managed_region_start, length, flags);
 		throw_if_error(status,
 			"Failed scheduling an attachment of a managed memory region"
-			" on stream " + cuda::detail::ptr_as_hex(associated_stream.id_)
+			" on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_)
 			+ " on CUDA device " + ::std::to_string(associated_stream.device_id_));
 	}
@@ -609,7 +609,7 @@ class stream_t {

 public: // friendship
-	friend stream_t stream::detail::wrap(device::id_t device_id, stream::id_t stream_id, bool take_ownership) noexcept;
+	friend stream_t stream::detail_::wrap(device::id_t device_id, stream::id_t stream_id, bool take_ownership) noexcept;

 	friend inline bool operator==(const stream_t& lhs, const stream_t& rhs) noexcept
 	{
@@ -635,7 +635,7 @@ inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept

 namespace stream {

-namespace detail {
+namespace detail_ {

 /**
  * @brief Wrap an existing stream in a @ref stream_t instance.
  *
@@ -663,13 +663,13 @@ inline stream_t create(
 	bool synchronizes_with_default_stream,
 	priority_t priority = stream::default_priority)
 {
-	device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
-	auto new_stream_id = cuda::stream::detail::create_on_current_device(
+	device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+	auto new_stream_id = cuda::stream::detail_::create_on_current_device(
 		synchronizes_with_default_stream, priority);
 	return wrap(device_id, new_stream_id, do_take_ownership);
 }

-} // namespace detail
+} // namespace detail_

 /**
  * @brief Create a new stream (= queue) on a CUDA device.
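The rename is invisible to user code: only internal namespaces change, and callers keep going through the public wrappers, which delegate into `detail_` on their behalf. A minimal sketch of unaffected caller code - not part of the patch, and assuming the headers changed above plus a public `cuda::stream::create()` overload with a defaulted priority parameter:

	#include <cuda/api/device.hpp>
	#include <cuda/api/stream.hpp>

	void user_code_unaffected_by_the_rename()
	{
		auto device = cuda::device::get(0);
		// Public wrapper call; under the hood it now goes through
		// stream::detail_::create(device.id(), ...), which applies
		// device::current::detail_::scoped_override_t and then calls
		// detail_::create_on_current_device(). The caller never names detail_.
		auto stream = cuda::stream::create(device, cuda::stream::async);
		(void) stream;
	}
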
diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp
index da042ffa..8acd7b54 100644
--- a/src/cuda/api/texture_view.hpp
+++ b/src/cuda/api/texture_view.hpp
@@ -42,11 +42,11 @@ struct descriptor_t : public cudaTextureDesc {
 	}
 };

-namespace detail {
+namespace detail_ {

 inline texture_view wrap(texture::raw_handle_t handle, bool take_ownership) noexcept;

-} // namespace detail
+} // namespace detail_

 } // namespace texture
@@ -122,7 +122,7 @@ class texture_view {

 public: // friendship
-	friend texture_view texture::detail::wrap(raw_handle_type handle, bool take_ownersip) noexcept;
+	friend texture_view texture::detail_::wrap(raw_handle_type handle, bool take_ownership) noexcept;

 protected:
 	raw_handle_type raw_view_handle { } ;
@@ -141,14 +141,14 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep
 }

 namespace texture {
-namespace detail {
+namespace detail_ {

 inline texture_view wrap(texture::raw_handle_t handle, bool take_ownership) noexcept
 {
 	return texture_view(handle, take_ownership);
 }

-} // namespace detail
+} // namespace detail_
 } // namespace texture
 } // namespace cuda

diff --git a/src/cuda/api/unique_ptr.hpp b/src/cuda/api/unique_ptr.hpp
index a5579e9b..0c0ffffb 100644
--- a/src/cuda/api/unique_ptr.hpp
+++ b/src/cuda/api/unique_ptr.hpp
@@ -13,7 +13,7 @@ namespace cuda {
 namespace memory {

-namespace detail {
+namespace detail_ {

 template <typename T>
@@ -29,15 +29,15 @@ template struct make_unique_selector
-inline typename detail::make_unique_selector<T>::non_array make_unique()
+inline typename detail_::make_unique_selector<T>::non_array make_unique()
 {
 	static_assert(::std::is_trivially_constructible<T>::value,
 		"Allocating with non-trivial construction on the device is not supported.");
 	auto space_ptr = Allocator()(sizeof(T));
-	return typename detail::make_unique_selector<T>::non_array(static_cast<T*>(space_ptr));
+	return typename detail_::make_unique_selector<T>::non_array(static_cast<T*>(space_ptr));
 }

 template <typename T, typename Allocator>
-inline typename detail::make_unique_selector<T>::unbounded_array make_unique(size_t num_elements)
+inline typename detail_::make_unique_selector<T>::unbounded_array make_unique(size_t num_elements)
 {
 	// If this function is instantiated, T is of the form "element_type[]"
 	using element_type = typename ::std::remove_extent<T>::type;
@@ -46,19 +46,19 @@ inline typename detail::make_unique_selector<T>::unbounded_array make_u
 	static_assert(::std::is_trivially_constructible<element_type>::value,
 		"Allocating with non-trivial construction on the device is not supported.");
 	void* space_ptr = Allocator()(sizeof(element_type) * num_elements);
-	return typename detail::make_unique_selector<T>::unbounded_array(static_cast<element_type*>(space_ptr));
+	return typename detail_::make_unique_selector<T>::unbounded_array(static_cast<element_type*>(space_ptr));
 }

 template <typename T, typename Allocator, typename... Args>
-inline typename detail::make_unique_selector<T>::bounded_array make_unique(Args&&...) = delete;
+inline typename detail_::make_unique_selector<T>::bounded_array make_unique(Args&&...) = delete;

-using deleter = device::detail::deleter;
+using deleter = device::detail_::deleter;

 template <typename T>
 inline ::std::unique_ptr<T, deleter> make_unique(cuda::device::id_t device_id, size_t n)
 {
-	cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
-	return memory::detail::make_unique(n);
+	cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+	return memory::detail_::make_unique(n);
 }

 template <typename T>
@@ -66,17 +66,17 @@ inline ::std::unique_ptr<T, deleter> make_unique(cuda::device::id_t device_id)
 {
-	cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
+	cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);

-	return memory::detail::make_unique();
+	return memory::detail_::make_unique();
 }

-} // namespace detail
+} // namespace detail_

 namespace device {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(cuda::device_t device, size_t n);
@@ -96,18 +96,18 @@ inline unique_ptr<T> make_unique(T* raw_ptr)

 namespace host {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(size_t n)
 {
-	return cuda::memory::detail::make_unique(n);
+	return cuda::memory::detail_::make_unique(n);
 }

 template <typename T>
 inline unique_ptr<T> make_unique()
 {
-	return cuda::memory::detail::make_unique();
+	return cuda::memory::detail_::make_unique();
 }

 } // namespace host
@@ -115,7 +115,7 @@ inline unique_ptr<T> make_unique()

 namespace managed {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(
@@ -123,11 +123,11 @@ inline unique_ptr<T> make_unique(
 	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
 	return (initial_visibility == initial_visibility_t::to_all_devices) ?
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_all_devices>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_all_devices>, detail_::deleter
 		>(n) :
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail_::deleter
 		>(n);
 }

 template <typename T>
 inline unique_ptr<T> make_unique(
 	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
 	return (initial_visibility == initial_visibility_t::to_all_devices) ?
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_all_devices>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_all_devices>, detail_::deleter
 		>() :
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail_::deleter
 		>();
 }

diff --git a/src/cuda/common/types.hpp b/src/cuda/common/types.hpp
index 0453867e..b55d4092 100644
--- a/src/cuda/common/types.hpp
+++ b/src/cuda/common/types.hpp
@@ -485,7 +485,7 @@ using pair_attribute_t = cudaDeviceP2PAttr;

 } // namespace device

-namespace detail {
+namespace detail_ {

 /**
  * @brief adapt a type to be usable as a kernel parameter.
@@ -514,7 +514,7 @@ struct kernel_parameter_decay {
 template <typename P>
 using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;

-} // namespace detail
+} // namespace detail_

 /**
  * Scheduling policies the Runtime API may use when the host-side

diff --git a/src/cuda/nvtx/profiling.cpp b/src/cuda/nvtx/profiling.cpp
index 02eb6ddf..73fe35aa 100644
--- a/src/cuda/nvtx/profiling.cpp
+++ b/src/cuda/nvtx/profiling.cpp
@@ -25,13 +25,13 @@ namespace profiling {

 namespace mark {

-namespace detail {
+namespace detail_ {
 	static ::std::mutex profiler_mutex; // To prevent multiple threads from accessing the profiler simultaneously
 }

 void point(const ::std::string& description, color_t color)
 {
-	::std::lock_guard<::std::mutex> { detail::profiler_mutex };
+	::std::lock_guard<::std::mutex> lock { detail_::profiler_mutex };
 	// logging?
 	nvtxEventAttributes_t eventAttrib = {0};
 	eventAttrib.version = NVTX_VERSION;
@@ -47,7 +47,7 @@ range::handle_t range_start(
 	const ::std::string& description, ::cuda::profiling::range::type_t type, color_t color)
 {
 	(void) type; // Currently not doing anything with the type; maybe in the future
-	::std::lock_guard<::std::mutex> { detail::profiler_mutex };
+	::std::lock_guard<::std::mutex> lock { detail_::profiler_mutex };
 	nvtxEventAttributes_t range_attributes;
 	range_attributes.version = NVTX_VERSION;
 	range_attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;