From 116313825a7845c384e53b509a0535b0dd13d954 Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Thu, 19 Aug 2021 23:16:36 +0300
Subject: [PATCH] Fixes #256: Replaced our library's `detail` namespaces with
 `detail_`, to avoid ambiguity with some of libcu++'s own `detail` namespaces.

---
 .../device_management.cpp | 6 +-
 .../by_runtime_api_module/error_handling.cu | 4 +-
 src/cuda/api/array.hpp | 6 +-
 src/cuda/api/current_device.hpp | 16 +-
 src/cuda/api/detail/device_properties.hpp | 30 ++--
 src/cuda/api/device.hpp | 20 +--
 src/cuda/api/devices.hpp | 8 +-
 src/cuda/api/error.hpp | 4 +-
 src/cuda/api/event.hpp | 30 ++--
 src/cuda/api/ipc.hpp | 10 +-
 src/cuda/api/kernel.hpp | 12 +-
 src/cuda/api/kernel_launch.hpp | 10 +-
 src/cuda/api/memory.hpp | 152 +++++++++---------
 src/cuda/api/multi_wrapper_impls.hpp | 122 +++++++-------
 src/cuda/api/pci_id.hpp | 4 +-
 src/cuda/api/pointer.hpp | 2 +-
 src/cuda/api/stream.hpp | 34 ++--
 src/cuda/api/texture_view.hpp | 10 +-
 src/cuda/api/unique_ptr.hpp | 50 +++--
 src/cuda/common/types.hpp | 4 +-
 src/cuda/nvtx/profiling.cpp | 6 +-
 21 files changed, 270 insertions(+), 270 deletions(-)

diff --git a/examples/by_runtime_api_module/device_management.cpp b/examples/by_runtime_api_module/device_management.cpp
index c11f8556..6ae5fb07 100644
--- a/examples/by_runtime_api_module/device_management.cpp
+++ b/examples/by_runtime_api_module/device_management.cpp
@@ -191,14 +191,14 @@ int main(int argc, char **argv)
 auto device_1 = cuda::device::get(1);
 cuda::device::current::set(device_0);
 assert(cuda::device::current::get() == device_0);
- assert(cuda::device::current::detail::get_id() == device_0.id());
+ assert(cuda::device::current::detail_::get_id() == device_0.id());
 cuda::device::current::set(device_1);
 assert(cuda::device::current::get() == device_1);
- assert(cuda::device::current::detail::get_id() == device_1.id());
+ assert(cuda::device::current::detail_::get_id() == device_1.id());
 }
 
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("Should not have been able to set the current device to "
 + std::to_string(device_count) + " since that's the device count, and "
 + "the maximum valid ID should be " + std::to_string(device_count - 1)
diff --git a/examples/by_runtime_api_module/error_handling.cu b/examples/by_runtime_api_module/error_handling.cu
index c0b04695..85377ac3 100644
--- a/examples/by_runtime_api_module/error_handling.cu
+++ b/examples/by_runtime_api_module/error_handling.cu
@@ -28,7 +28,7 @@ int main(int argc, char **argv)
 }
 
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("An exception should have been thrown");
 }
 catch(cuda::runtime_error& e) {
@@ -51,7 +51,7 @@ int main(int argc, char **argv)
 
 // clearing the error
 try {
- cuda::device::current::detail::set(device_count);
+ cuda::device::current::detail_::set(device_count);
 die_("An exception should have been thrown");
 }
 catch(cuda::runtime_error&) { }
diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp
index ae13896b..063a1edd 100644
--- a/src/cuda/api/array.hpp
+++ b/src/cuda/api/array.hpp
@@ -20,7 +20,7 @@ class device_t;
 
 namespace array {
 
-namespace detail {
+namespace detail_ {
 
 template
 cudaArray* allocate_on_current_device(array::dimensions_t<3> dimensions)
@@ -49,7 +49,7 @@ cudaArray* allocate(device_t& device, array::dimensions_t<3> dimensions);
 template
 cudaArray* allocate(device_t& device, array::dimensions_t<2> dimensions);
 
-} // namespace detail
+} //
namespace detail_ } // namespace array @@ -91,7 +91,7 @@ class array_t { * Creates and wraps a new CUDA array. */ array_t(device_t& device, array::dimensions_t dimensions) - : array_t(array::detail::allocate(device, dimensions), dimensions) {} + : array_t(array::detail_::allocate(device, dimensions), dimensions) {} array_t(const array_t& other) = delete; array_t(array_t&& other) noexcept : array_t(other.raw_array_, other.dimensions_) { diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp index e86643b8..00b0dfcb 100644 --- a/src/cuda/api/current_device.hpp +++ b/src/cuda/api/current_device.hpp @@ -37,7 +37,7 @@ namespace device { namespace current { -namespace detail { +namespace detail_ { /** * Obtains the numeric id of the device set as current for the CUDA Runtime API @@ -79,14 +79,14 @@ inline void set(const id_t* device_ids, size_t num_devices) } /** - * @note See the out-of-`detail::` version of this class. + * @note See the out-of-`detail_::` version of this class. */ class scoped_override_t { protected: static id_t replace(id_t new_device_id) { - id_t previous_device_id = device::current::detail::get_id(); - device::current::detail::set(new_device_id); + id_t previous_device_id = device::current::detail_::get_id(); + device::current::detail_::set(new_device_id); return previous_device_id; } @@ -102,12 +102,12 @@ class scoped_override_t { }; -} // namespace detail +} // namespace detail_ /** * Reset the CUDA Runtime API's current device to its default value - the default device */ -inline void set_to_default() { return detail::set(device::default_device_id); } +inline void set_to_default() { return detail_::set(device::default_device_id); } void set(device_t device); @@ -116,9 +116,9 @@ void set(device_t device); * what remains of the current scope, and changing it back to its previous value * when exiting the scope. */ -class scoped_override_t : private detail::scoped_override_t { +class scoped_override_t : private detail_::scoped_override_t { protected: - using parent = detail::scoped_override_t; + using parent = detail_::scoped_override_t; public: scoped_override_t(device_t& device); scoped_override_t(device_t&& device); diff --git a/src/cuda/api/detail/device_properties.hpp b/src/cuda/api/detail/device_properties.hpp index dc40225e..4c847fb9 100644 --- a/src/cuda/api/detail/device_properties.hpp +++ b/src/cuda/api/detail/device_properties.hpp @@ -1,5 +1,5 @@ /** - * @file detail/device_properties.hpp + * @file detail_/device_properties.hpp * * @brief Implementation of methods and helper functions for device-property-related classes. 
* @@ -47,7 +47,7 @@ inline constexpr bool operator >=(const compute_architecture_t& lhs, const compu return lhs.major > rhs.major; } -namespace detail { +namespace detail_ { constexpr const int invalid_architecture_return { 0 }; enum : memory::shared::size_t { KiB = 1024 }; @@ -55,7 +55,7 @@ enum : memory::shared::size_t { KiB = 1024 }; template inline T ensure_arch_property_validity(T v, const compute_architecture_t& arch) { - if (v == detail::invalid_architecture_return) { + if (v == detail_::invalid_architecture_return) { throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); } return v; @@ -147,30 +147,30 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_arch invalid_architecture_return; } -} // namespace detail +} // namespace detail_ inline const char* compute_architecture_t::name() const { - return detail::ensure_arch_property_validity(detail::architecture_name(*this), *this); + return detail_::ensure_arch_property_validity(detail_::architecture_name(*this), *this); } inline unsigned compute_architecture_t::max_in_flight_threads_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_in_flight_threads_per_processor(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_in_flight_threads_per_processor(*this), *this); } inline unsigned compute_architecture_t::max_shared_memory_per_block() const { - return detail::ensure_arch_property_validity(detail::max_shared_memory_per_block(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_shared_memory_per_block(*this), *this); } inline unsigned compute_architecture_t::max_resident_warps_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_resident_warps_per_processor(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_resident_warps_per_processor(*this), *this); } inline unsigned compute_architecture_t::max_warp_schedulings_per_processor_cycle() const { - return detail::ensure_arch_property_validity(detail::max_warp_schedulings_per_processor_cycle(*this), *this); + return detail_::ensure_arch_property_validity(detail_::max_warp_schedulings_per_processor_cycle(*this), *this); } // compute_capability_t-related @@ -224,7 +224,7 @@ inline constexpr compute_capability_t make_compute_capability(unsigned major, un return { {major}, minor }; } -namespace detail { +namespace detail_ { inline constexpr unsigned max_in_flight_threads_per_processor(const compute_capability_t& cc) { @@ -262,26 +262,26 @@ inline constexpr unsigned max_resident_warps_per_processor(const compute_capabil max_resident_warps_per_processor(cc.architecture); } -} // namespace detail +} // namespace detail_ inline unsigned compute_capability_t::max_in_flight_threads_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_in_flight_threads_per_processor(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_in_flight_threads_per_processor(*this), architecture); } inline unsigned compute_capability_t::max_warp_schedulings_per_processor_cycle() const { - return detail::ensure_arch_property_validity(detail::max_warp_schedulings_per_processor_cycle(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_warp_schedulings_per_processor_cycle(*this), architecture); } inline unsigned compute_capability_t::max_shared_memory_per_block() const { - return 
detail::ensure_arch_property_validity(detail::max_shared_memory_per_block(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_shared_memory_per_block(*this), architecture); } inline unsigned compute_capability_t::max_resident_warps_per_processor() const { - return detail::ensure_arch_property_validity(detail::max_resident_warps_per_processor(*this), architecture); + return detail_::ensure_arch_property_validity(detail_::max_resident_warps_per_processor(*this), architecture); } // properties_t-related diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 422fcc0e..22af5fa7 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -155,7 +155,7 @@ class device_t { using resource_id_t = cudaLimit; protected: // types - using scoped_setter_t = device::current::detail::scoped_override_t; + using scoped_setter_t = device::current::detail_::scoped_override_t; using flags_t = unsigned; ///@cond @@ -180,8 +180,8 @@ class device_t { protected: const device::id_t device_id_; - using deleter = memory::device::detail::deleter; - using allocator = memory::device::detail::allocator; + using deleter = memory::device::detail_::deleter; + using allocator = memory::device::detail_::allocator; public: ///@cond @@ -199,7 +199,7 @@ class device_t { memory::region_t allocate(size_t size_in_bytes) { scoped_setter_t set_device_for_this_scope(device_id_); - return memory::device::detail::allocate(size_in_bytes); + return memory::device::detail_::allocate(size_in_bytes); } // Perhaps drop this? it should really go into a managed namespace @@ -231,7 +231,7 @@ class device_t { initial_visibility_t::to_supporters_of_concurrent_managed_access) { scoped_setter_t set_device_for_this_scope(device_id_); - return cuda::memory::managed::detail::allocate(size_in_bytes, initial_visibility); + return cuda::memory::managed::detail_::allocate(size_in_bytes, initial_visibility); } /** @@ -688,7 +688,7 @@ class device_t { */ device_t& make_current() { - device::current::detail::set(id()); + device::current::detail_::set(id()); return *this; } @@ -754,12 +754,12 @@ namespace current { /** * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. */ -inline device_t get() { return device::get(detail::get_id()); } +inline device_t get() { return device::get(detail_::get_id()); } /** * Tells the CUDA runtime API to consider the specified device as the current one. 
*/ -inline void set(device_t device) { detail::set(device.id()); } +inline void set(device_t device) { detail_::set(device.id()); } } // namespace current @@ -771,7 +771,7 @@ inline void set(device_t device) { detail::set(device.id()); } */ inline device_t get(pci_location_t pci_id) { - auto resolved_id = device::detail::resolve_id(pci_id); + auto resolved_id = device::detail_::resolve_id(pci_id); return get(resolved_id); } @@ -797,7 +797,7 @@ inline device_t get(const ::std::string& pci_id_str) inline void synchronize(device_t& device) { auto device_id = device.id(); - device::current::detail::scoped_override_t set_device_for_this_scope(device_id); + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); auto status = cudaDeviceSynchronize(); throw_if_error(status, "Failed synchronizing " + ::std::to_string(device_id)); } diff --git a/src/cuda/api/devices.hpp b/src/cuda/api/devices.hpp index a767c032..d3dccef9 100644 --- a/src/cuda/api/devices.hpp +++ b/src/cuda/api/devices.hpp @@ -11,7 +11,7 @@ namespace cuda { -namespace detail { +namespace detail_ { // Note that while nothing constrains you from instantiating // this class many times, all instances are the same (as CUDA @@ -203,11 +203,11 @@ inline bool operator!= ( return not (lhs == rhs); } -} // namespace detail +} // namespace detail_ -inline detail::all_devices devices() +inline detail_::all_devices devices() { - return detail::all_devices(); + return detail_::all_devices(); } } // namespace cuda diff --git a/src/cuda/api/error.hpp b/src/cuda/api/error.hpp index b4f132fc..0986e44d 100644 --- a/src/cuda/api/error.hpp +++ b/src/cuda/api/error.hpp @@ -134,7 +134,7 @@ constexpr inline bool is_failure(status_t status) { return status != (status_t) */ inline ::std::string describe(status_t status) { return cudaGetErrorString(status); } -namespace detail { +namespace detail_ { template ::std::string as_hex(I x) @@ -165,7 +165,7 @@ inline ::std::string ptr_as_hex(const I* ptr) return as_hex((size_t) ptr); } -} // namespace detail +} // namespace detail_ /** * A (base?) class for exceptions raised by CUDA code; these errors are thrown by diff --git a/src/cuda/api/event.hpp b/src/cuda/api/event.hpp index d9b743fa..20ec3df4 100644 --- a/src/cuda/api/event.hpp +++ b/src/cuda/api/event.hpp @@ -28,7 +28,7 @@ class stream_t; namespace event { -namespace detail { +namespace detail_ { /** * Schedule a specified event to occur (= to fire) when all activities @@ -40,8 +40,8 @@ namespace detail { inline void enqueue(stream::id_t stream_id, id_t event_id) { auto status = cudaEventRecord(event_id, stream_id); cuda::throw_if_error(status, - "Failed recording event " + cuda::detail::ptr_as_hex(event_id) - + " on stream " + cuda::detail::ptr_as_hex(stream_id)); + "Failed recording event " + cuda::detail_::ptr_as_hex(event_id) + + " on stream " + cuda::detail_::ptr_as_hex(stream_id)); } constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timing, bool interprocess) @@ -52,7 +52,7 @@ constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timin | ( interprocess ? cudaEventInterprocess : 0 ); } -} // namespace detail +} // namespace detail_ } // namespace event @@ -62,7 +62,7 @@ class event_t; namespace event { -namespace detail { +namespace detail_ { /** * @brief Wrap an existing CUDA event in a @ref event_t instance. 
* @@ -81,7 +81,7 @@ event_t wrap( id_t event_id, bool take_ownership = false) noexcept; -} // namespace detail +} // namespace detail_ } // namespace event @@ -138,7 +138,7 @@ class event_t { if (status == cuda::status::success) return true; if (status == cuda::status::not_ready) return false; throw cuda::runtime_error(status, - "Could not determine whether event " + detail::ptr_as_hex(id_) + "Could not determine whether event " + detail_::ptr_as_hex(id_) + "has already occurred or not."); } @@ -160,7 +160,7 @@ class event_t { */ void record() { - event::detail::enqueue(stream::default_stream_id, id_); + event::detail_::enqueue(stream::default_stream_id, id_); } /** @@ -195,7 +195,7 @@ class event_t { public: // friendship - friend event_t event::detail::wrap(device::id_t device_id, event::id_t event_id, bool take_ownership) noexcept; + friend event_t event::detail_::wrap(device::id_t device_id, event::id_t event_id, bool take_ownership) noexcept; public: // constructors and destructor @@ -251,7 +251,7 @@ inline duration_t time_elapsed_between(const event_t& start, const event_t& end) return duration_t { elapsed_milliseconds }; } -namespace detail { +namespace detail_ { /** * Obtain a proxy object for an already-existing CUDA event @@ -304,12 +304,12 @@ inline event_t create( bool records_timing, bool interprocess) { - device::current::detail::scoped_override_t + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); + return detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); } -} // namespace detail +} // namespace detail_ /** * @brief creates a new execution stream on a device. 
@@ -344,10 +344,10 @@ inline void synchronize(const event_t& event)
 {
 auto device_id = event.device_id();
 auto event_id = event.id();
- device::current::detail::scoped_override_t device_for_this_scope(device_id);
+ device::current::detail_::scoped_override_t device_for_this_scope(device_id);
 auto status = cudaEventSynchronize(event_id);
 throw_if_error(status, "Failed synchronizing the event with id "
- + cuda::detail::ptr_as_hex(event_id) + " on " + ::std::to_string(device_id));
+ + cuda::detail_::ptr_as_hex(event_id) + " on " + ::std::to_string(device_id));
 }
 
 } // namespace cuda
diff --git a/src/cuda/api/ipc.hpp b/src/cuda/api/ipc.hpp
index b9ef90eb..f3bfe8c5 100644
--- a/src/cuda/api/ipc.hpp
+++ b/src/cuda/api/ipc.hpp
@@ -61,7 +61,7 @@ inline handle_t export_(void* device_ptr)
 {
 handle_t handle;
 auto status = cudaIpcGetMemHandle(&handle, device_ptr);
 cuda::throw_if_error(status,
- "Failed producing an IPC memory handle for device pointer " + cuda::detail::ptr_as_hex(device_ptr));
+ "Failed producing an IPC memory handle for device pointer " + cuda::detail_::ptr_as_hex(device_ptr));
 return handle;
 }
 
@@ -95,7 +95,7 @@ inline void unmap(void* ipc_mapped_ptr)
 {
 auto status = cudaIpcCloseMemHandle(ipc_mapped_ptr);
 cuda::throw_if_error(status,
 "Failed unmapping IPC memory mapped to " +
- cuda::detail::ptr_as_hex(ipc_mapped_ptr));
+ cuda::detail_::ptr_as_hex(ipc_mapped_ptr));
 }
 
 /**
@@ -158,14 +158,14 @@ namespace ipc {
 */
 using handle_t = cudaIpcEventHandle_t;
 
-namespace detail {
+namespace detail_ {
 
 inline handle_t export_(id_t event_id)
 {
 handle_t ipc_handle;
 auto status = cudaIpcGetEventHandle(&ipc_handle, event_id);
 cuda::throw_if_error(status,
- "Failed obtaining an IPC event handle for event " + cuda::detail::ptr_as_hex(event_id));
+ "Failed obtaining an IPC event handle for event " + cuda::detail_::ptr_as_hex(event_id));
 return ipc_handle;
 }
 
@@ -178,7 +178,7 @@ inline event::id_t import(const handle_t& handle)
 return event_id;
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * Enable use of an event which this process created by other processes
diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp
index 606e978e..a8440774 100644
--- a/src/cuda/api/kernel.hpp
+++ b/src/cuda/api/kernel.hpp
@@ -211,7 +211,7 @@ class kernel_t {
 
 namespace kernel {
 
-namespace detail {
+namespace detail_ {
 
 template struct bool_pack;
 template
 using all_true = ::std::is_same, bool_pack>;
 
 template
 struct raw_kernel_typegen {
 static_assert(all_true<::std::is_same>::value...>::value,
 "Invalid kernel parameter types" );
 using type = void(*)(KernelParameters...);
 // Why no decay? After all, CUDA kernels only take parameters by value, right?
- // Well, we're inside `detail::`. You should be careful to only instantiate this class with
+ // Well, we're inside `detail_::`. You should be careful to only instantiate this class with
 // nice simple types we can pass to CUDA kernels.
}; @@ -247,7 +247,7 @@ Kernel unwrap_inner(::std::false_type, Kernel raw_function) return raw_function; } -} // namespace detail +} // namespace detail_ /** * Obtain the raw function pointer of any type acceptable as a launchable kernel @@ -256,12 +256,12 @@ Kernel unwrap_inner(::std::false_type, Kernel raw_function) template auto unwrap(Kernel f) -> typename ::std::conditional< ::std::is_same::type, kernel_t>::value, - typename detail::raw_kernel_typegen::type, + typename detail_::raw_kernel_typegen::type, Kernel>::type { using got_a_kernel_t = ::std::integral_constant::type, kernel_t>::value>; - return detail::unwrap_inner(got_a_kernel_t{}, f); + return detail_::unwrap_inner(got_a_kernel_t{}, f); } } // namespace kernel diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index 44ac1a62..bd4aabdd 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -69,7 +69,7 @@ constexpr grid::dimensions_t single_block() { return 1; } */ constexpr grid::block_dimensions_t single_thread_per_block() { return 1; } -namespace detail { +namespace detail_ { template bool intrinsic_block_cooperation_value(const Kernel&) @@ -96,7 +96,7 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar collect_argument_addresses(collected_addresses + 1, ::std::forward(args)...); } -// Note: Unlike the non-detail functions - this one +// Note: Unlike the non-detail_ functions - this one // cannot handle type-erased kernel_t's. template inline void enqueue_launch( @@ -143,7 +143,7 @@ inline void enqueue_launch( // fill the argument array with our parameters. Yes, the use // of the two terms is confusing here and depends on how you // look at things. - detail::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); + detail_::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); auto status = cudaLaunchCooperativeKernel( (const void*) kernel_function, launch_configuration.grid_dimensions, @@ -162,7 +162,7 @@ inline void enqueue_launch( #endif -} // namespace detail +} // namespace detail_ /** * @brief Enqueues a kernel on a stream (=queue) on the current CUDA device. @@ -220,7 +220,7 @@ inline void enqueue_launch( KernelParameters&&... 
parameters) { enqueue_launch( - detail::intrinsic_block_cooperation_value(kernel_function), + detail_::intrinsic_block_cooperation_value(kernel_function), kernel_function, stream, launch_configuration, diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index cde74dce..73512bd7 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -51,7 +51,7 @@ class stream_t; */ namespace memory { -namespace detail { +namespace detail_ { template class base_region_t { @@ -72,13 +72,13 @@ template T* get() const { return start(); } }; -} // namespace detail +} // namespace detail_ -struct region_t : public detail::base_region_t { +struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; }; -struct const_region_t : public detail::base_region_t { +struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} }; @@ -122,7 +122,7 @@ struct allocation_options { cpu_write_combining write_combining; }; -namespace detail { +namespace detail_ { inline unsigned make_cuda_host_alloc_flags(allocation_options options) { @@ -131,7 +131,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) (options.write_combining == cpu_write_combining::with_wc ? cudaHostAllocWriteCombined : 0); } -} // namespace detail +} // namespace detail_ /** * @namespace mapped @@ -171,7 +171,7 @@ namespace memory { */ namespace device { -namespace detail { +namespace detail_ { /** * Allocate memory on current device @@ -191,22 +191,22 @@ inline region_t allocate(size_t num_bytes) throw_if_error(status, "Failed allocating " + ::std::to_string(num_bytes) + " bytes of global memory on CUDA device " + - ::std::to_string(cuda::device::current::detail::get_id())); + ::std::to_string(cuda::device::current::detail_::get_id())); return {allocated, num_bytes}; } inline region_t allocate(cuda::device::id_t device_id, size_t size_in_bytes) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id); - return memory::device::detail::allocate(size_in_bytes); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + return memory::device::detail_::allocate(size_in_bytes); } -} // namespace detail +} // namespace detail_ namespace async { -namespace detail { +namespace detail_ { /** * Allocate memory asynchronously on a specified stream. @@ -228,7 +228,7 @@ inline region_t allocate( throw_if_error(status, "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) + " bytes of global memory " - + " on stream " + cuda::detail::ptr_as_hex(stream_id) + + " on stream " + cuda::detail_::ptr_as_hex(stream_id) + " on CUDA device " + ::std::to_string(device_id)); return {allocated, num_bytes}; #else @@ -239,7 +239,7 @@ inline region_t allocate( #endif } -} // namespace detail +} // namespace detail_ /** * Schedule an allocation of device-side memory on a CUDA stream. 
@@ -267,7 +267,7 @@ inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); inline void free(void* ptr) { auto result = cudaFree(ptr); - throw_if_error(result, "Freeing device memory at 0x" + cuda::detail::ptr_as_hex(ptr)); + throw_if_error(result, "Freeing device memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { free(region.start()); } ///@} @@ -286,15 +286,15 @@ inline void free(region_t region) { free(region.start()); } */ inline region_t allocate(cuda::device_t device, size_t size_in_bytes); -namespace detail { +namespace detail_ { struct allocator { // Allocates on the current device! - void* operator()(size_t num_bytes) const { return detail::allocate(num_bytes).start(); } + void* operator()(size_t num_bytes) const { return detail_::allocate(num_bytes).start(); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::device::free(ptr); } }; -} // namespace detail +} // namespace detail_ /** * @brief Sets all bytes in a region of memory to a fixed value @@ -437,7 +437,7 @@ inline void set(region_t region, int byte_value) default: throw runtime_error( cuda::status::invalid_value, - "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail::ptr_as_hex(region.start()) + "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail_::ptr_as_hex(region.start()) ); } } @@ -466,7 +466,7 @@ inline void zero(T* ptr) zero(ptr, sizeof(T)); } -namespace detail { +namespace detail_ { /** * @note When constructing this class - destination first, source second @@ -532,7 +532,7 @@ inline void copy(array_t& destination, const T *source) template inline void copy(array_t& destination, const T *source) { - const auto copy_params = detail::copy_params_t(destination, source); + const auto copy_params = detail_::copy_params_t(destination, source); auto result = cudaMemcpy3D(©_params); throw_if_error(result, "Synchronously copying into a 3-dimensional CUDA array"); } @@ -559,12 +559,12 @@ inline void copy(T *destination, const array_t& source) template inline void copy(T* destination, const array_t& source) { - const auto copy_params = detail::copy_params_t(destination, source); + const auto copy_params = detail_::copy_params_t(destination, source); auto result = cudaMemcpy3D(©_params); throw_if_error(result, "Synchronously copying from a 3-dimensional CUDA array"); } -} // namespace detail +} // namespace detail_ /** @@ -580,7 +580,7 @@ inline void copy(T* destination, const array_t& source) template inline void copy(array_t& destination, const T* source) { - detail::copy(destination, source); + detail_::copy(destination, source); } /** @@ -596,7 +596,7 @@ inline void copy(array_t& destination, const T* source) template inline void copy(T* destination, const array_t& source) { - detail::copy(destination, source); + detail_::copy(destination, source); } /** @@ -615,7 +615,7 @@ inline void copy_single(T* destination, const T* source) namespace async { -namespace detail { +namespace detail_ { /** * Asynchronously copies data between memory spaces or within a memory space. 
@@ -643,23 +643,23 @@ inline void copy(void* destination, const void* source, size_t num_bytes, stream // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string - throw_if_error(result, "Scheduling a memory copy on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template void copy(array_t& destination, const T* source, stream::id_t stream_id) { - const auto copy_params = memory::detail::copy_params_t(destination, source); + const auto copy_params = memory::detail_::copy_params_t(destination, source); auto result = cudaMemcpy3DAsync(©_params, stream_id); - throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template void copy(T* destination, const array_t& source, stream::id_t stream_id) { - const auto copy_params = memory::detail::copy_params_t(destination, source); + const auto copy_params = memory::detail_::copy_params_t(destination, source); auto result = cudaMemcpy3DAsync(©_params, stream_id); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template @@ -679,7 +679,7 @@ void copy(array_t& destination, const T* source, stream::id_t stream_id) dimensions.height, cudaMemcpyDefault, stream_id); - throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } template @@ -699,7 +699,7 @@ void copy(T* destination, const array_t& source, cuda::stream::id_t stream dimensions.height, cudaMemcpyDefault, stream_id); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail::ptr_as_hex(stream_id)); + throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on stream " + cuda::detail_::ptr_as_hex(stream_id)); } /** @@ -719,7 +719,7 @@ inline void copy_single(T& destination, const T& source, stream::id_t stream_id) copy(&destination, &source, sizeof(T), stream_id); } -} // namespace detail +} // namespace detail_ /** * Asynchronously copies data between memory spaces or within a memory space. @@ -841,7 +841,7 @@ namespace device { namespace async { -namespace detail { +namespace detail_ { inline void set(void* start, int byte_value, size_t num_bytes, stream::id_t stream_id) { @@ -866,7 +866,7 @@ inline void zero(region_t region, stream::id_t stream_id) zero(region.start(), region.size(), stream_id); } -} // namespace detail +} // namespace detail_ /** * Asynchronously sets all bytes in a stretch of memory to a single value @@ -937,7 +937,7 @@ inline void* allocate( allocation_options options) { void* allocated = nullptr; - auto flags = cuda::memory::detail::make_cuda_host_alloc_flags(options); + auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options); auto result = cudaHostAlloc(&allocated, size_in_bytes, flags); if (is_success(result) && allocated == nullptr) { // Can this even happen? 
hopefully not @@ -966,10 +966,10 @@ inline void* allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) inline void free(void* host_ptr) { auto result = cudaFreeHost(host_ptr); - throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail::ptr_as_hex(host_ptr)); + throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail_::ptr_as_hex(host_ptr)); } -namespace detail { +namespace detail_ { struct allocator { void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes); } @@ -995,7 +995,7 @@ inline void register_(const void *ptr, size_t size, unsigned flags) auto result = cudaHostRegister(const_cast(ptr), size, flags); throw_if_error(result, "Could not register and page-lock the region of " + ::std::to_string(size) + - " bytes of host memory at " + cuda::detail::ptr_as_hex(ptr)); + " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr)); } inline void register_(const_region_t region, unsigned flags) @@ -1003,7 +1003,7 @@ inline void register_(const_region_t region, unsigned flags) register_(region.start(), region.size(), flags); } -} // namespace detail +} // namespace detail_ /** * Whether or not the registration of the host-side pointer should map @@ -1042,7 +1042,7 @@ inline void register_(const void *ptr, size_t size, bool map_into_device_space, bool make_device_side_accesible_to_all) { - detail::register_( + detail_::register_( ptr, size, (register_mapped_io_space ? cudaHostRegisterIoMemory : 0) | (map_into_device_space ? cudaHostRegisterMapped : 0) @@ -1067,7 +1067,7 @@ inline void register_( inline void register_(void const *ptr, size_t size) { - detail::register_(ptr, size, cudaHostRegisterDefault); + detail_::register_(ptr, size, cudaHostRegisterDefault); } inline void register_(const_region_t region) @@ -1140,7 +1140,7 @@ namespace managed { class const_region_t; -namespace detail { +namespace detail_ { template inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute); @@ -1149,8 +1149,8 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute); template -struct base_region_t : public memory::detail::base_region_t { - using parent = memory::detail::base_region_t; +struct base_region_t : public memory::detail_::base_region_t { + using parent = memory::detail_::base_region_t; using parent::parent; bool is_read_mostly() const @@ -1165,7 +1165,7 @@ struct base_region_t : public memory::detail::base_region_t { void undesignate_read_mostly() const { - detail::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); + detail_::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); } device_t preferred_location() const; @@ -1175,16 +1175,16 @@ struct base_region_t : public memory::detail::base_region_t { // TODO: Consider using a field proxy }; -} // namespace detail +} // namespace detail_ -struct region_t : public detail::base_region_t { +struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; operator memory::region_t() { return memory::region_t{ start(), size() }; } }; -struct const_region_t : public detail::base_region_t { +struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; - const_region_t(const region_t& r) : detail::base_region_t(r.start(), r.size()) {} + const_region_t(const region_t& r) : detail_::base_region_t(r.start(), r.size()) {} }; void 
advise_expected_access_by(managed::const_region_t region, device_t& device); @@ -1193,7 +1193,7 @@ void advise_no_access_expected_by(managed::const_region_t region, device_t& devi template > typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); -namespace detail { +namespace detail_ { template inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute) @@ -1202,7 +1202,7 @@ inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRange auto result = cudaMemRangeGetAttribute( &attribute_value, sizeof(attribute_value), attribute, region.start(), region.size()); throw_if_error(result, - "Obtaining an attribute for a managed memory range at " + cuda::detail::ptr_as_hex(region.start())); + "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); return static_cast(attribute_value); } @@ -1210,7 +1210,7 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor { auto result = cudaMemAdvise(region.start(), region.size(), advice, device_id); throw_if_error(result, - "Setting an attribute for a managed memory range at " + cuda::detail::ptr_as_hex(region.start())); + "Setting an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); } inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute) @@ -1219,7 +1219,7 @@ inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemor set_scalar_range_attribute(region, attribute, ignored_device_index); } -} // namespace detail +} // namespace detail_ enum class initial_visibility_t { @@ -1235,7 +1235,7 @@ enum class attachment_t { }; -namespace detail { +namespace detail_ { inline region_t allocate( size_t num_bytes, @@ -1263,7 +1263,7 @@ inline region_t allocate( inline void free(void* ptr) { auto result = cudaFree(ptr); - throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail::ptr_as_hex(ptr)); + throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { @@ -1276,7 +1276,7 @@ struct allocator { // Allocates on the current device! 
void* operator()(size_t num_bytes) const
 {
- return detail::allocate(num_bytes, InitialVisibility).start();
+ return detail_::allocate(num_bytes, InitialVisibility).start();
 }
 };
 struct deleter {
@@ -1288,11 +1288,11 @@ inline region_t allocate(
 size_t num_bytes,
 initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
- cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
- return detail::allocate(num_bytes, initial_visibility);
+ cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+ return detail_::allocate(num_bytes, initial_visibility);
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * @brief Allocate a region of managed memory, accessible with the same
@@ -1322,7 +1322,7 @@ inline void free(void* managed_ptr)
 {
 auto result = cudaFree(managed_ptr);
 throw_if_error(result,
 "Freeing managed memory (host and device regions) at address 0x"
- + cuda::detail::ptr_as_hex(managed_ptr));
+ + cuda::detail_::ptr_as_hex(managed_ptr));
 }
 
 inline void free(region_t region)
@@ -1346,14 +1346,14 @@ inline void set(const_region_t region, device_inspecific_kind_t advice)
 cuda::device::id_t ignored_device_index{};
 auto result = cudaMemAdvise(region.start(), region.size(), (cudaMemoryAdvise) advice, ignored_device_index);
 throw_if_error(result,
- "Setting advice on a (managed) memory region at" + cuda::detail::ptr_as_hex(region.start()));
+ "Setting advice on a (managed) memory region at" + cuda::detail_::ptr_as_hex(region.start()));
 }
 
 } // namespace advice
 
 namespace async {
 
-namespace detail {
+namespace detail_ {
 
 inline void prefetch(
 const_region_t region,
 cuda::device::id_t destination,
 stream::id_t stream_id)
 {
 auto result = cudaMemPrefetchAsync(region.start(), region.size(), destination, stream_id);
 throw_if_error(result,
 "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
- + cuda::detail::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination));
+ + cuda::detail_::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination));
 }
 
-} // namespace detail
+} // namespace detail_
 
 /**
 * @brief Prefetches a region of managed memory to a specific device, so
@@ -1393,7 +1393,7 @@ inline void prefetch_to_host(const_region_t managed_region)
 // device indicator is used.
throw_if_error(result, "Prefetching " + ::std::to_string(managed_region.size()) + " bytes of managed memory at address " - + cuda::detail::ptr_as_hex(managed_region.start()) + " into host memory"); + + cuda::detail_::ptr_as_hex(managed_region.start()) + " into host memory"); } } // namespace async @@ -1417,11 +1417,11 @@ inline T* device_side_pointer_for(T* host_memory_ptr) get_device_pointer_flags); throw_if_error(status, "Failed obtaining the device-side pointer for host-memory pointer " - + cuda::detail::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); + + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); return device_side_ptr; } -namespace detail { +namespace detail_ { /** * Allocates a mapped pair of memory regions - on the current device @@ -1439,7 +1439,7 @@ inline region_pair allocate( region_pair allocated; allocated.size_in_bytes = size_in_bytes; auto flags = cudaHostAllocMapped & - cuda::memory::detail::make_cuda_host_alloc_flags(options); + cuda::memory::detail_::make_cuda_host_alloc_flags(options); // Note: the typed cudaHostAlloc also takes its size in bytes, apparently, // not in number of elements auto status = cudaHostAlloc(&allocated.host_side, size_in_bytes, flags); @@ -1449,7 +1449,7 @@ inline region_pair allocate( } throw_if_error(status, "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes) - + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail::get_id())); + + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail_::get_id())); allocated.device_side = device_side_pointer_for(allocated.host_side); return allocated; } @@ -1469,11 +1469,11 @@ inline region_pair allocate( size_t size_in_bytes, allocation_options options) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id); - return detail::allocate(size_in_bytes, options); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + return detail_::allocate(size_in_bytes, options); } -} // namespace detail +} // namespace detail_ /** * Allocate a pair of memory regions, on the host and on the device, mapped to each other so @@ -1564,9 +1564,9 @@ inline region_t locate(symbol_t symbol) void* start; size_t symbol_size; auto api_call_result = cudaGetSymbolAddress(&start, symbol.handle); - throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail::ptr_as_hex(symbol.handle)); + throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); api_call_result = cudaGetSymbolSize(&symbol_size, symbol.handle); - throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail::ptr_as_hex(symbol.handle)); + throw_if_error(api_call_result, "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); return {start, symbol_size}; } diff --git a/src/cuda/api/multi_wrapper_impls.hpp b/src/cuda/api/multi_wrapper_impls.hpp index 6e2a28fb..5a82265c 100644 --- a/src/cuda/api/multi_wrapper_impls.hpp +++ b/src/cuda/api/multi_wrapper_impls.hpp @@ -26,23 +26,23 @@ namespace cuda { namespace array { -namespace detail { +namespace detail_ { template inline cudaArray* allocate(device_t& device, array::dimensions_t<3> dimensions) { - device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); + 
device::current::detail::scoped_override_t set_device_for_this_scope(device.id());
+
device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); return allocate_on_current_device(dimensions); } template inline cudaArray* allocate(device_t& device, array::dimensions_t<2> dimensions) { - device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); + device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); return allocate_on_current_device(dimensions); } -} // namespace detail +} // namespace detail_ } // namespace array @@ -57,19 +57,19 @@ inline event_t create( auto device_id = device.id(); // Yes, we need the ID explicitly even on the current device, // because event_t's don't have an implicit device ID. - return event::detail::create(device_id , uses_blocking_sync, records_timing, interprocess); + return event::detail_::create(device_id , uses_blocking_sync, records_timing, interprocess); } namespace ipc { inline handle_t export_(event_t& event) { - return detail::export_(event.id()); + return detail_::export_(event.id()); } inline event_t import(device_t& device, const handle_t& handle) { - return event::detail::wrap(device.id(), detail::import(handle), do_not_take_ownership); + return event::detail_::wrap(device.id(), detail_::import(handle), do_not_take_ownership); } } // namespace ipc @@ -81,7 +81,7 @@ inline event_t import(device_t& device, const handle_t& handle) inline stream_t device_t::default_stream() const noexcept { - return stream::detail::wrap(id(), stream::default_stream_id); + return stream::detail_::wrap(id(), stream::default_stream_id); } inline stream_t @@ -89,8 +89,8 @@ device_t::create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority) { - device::current::detail::scoped_override_t set_device_for_this_scope(id_); - return stream::detail::wrap(id(), stream::detail::create_on_current_device( + device::current::detail_::scoped_override_t set_device_for_this_scope(id_); + return stream::detail_::wrap(id(), stream::detail_::create_on_current_device( will_synchronize_with_default_stream, priority), do_take_ownership); } @@ -104,9 +104,9 @@ inline scoped_override_t::scoped_override_t(device_t&& device) : parent(device.i } // namespace device -namespace detail { +namespace detail_ { -} // namespace detail +} // namespace detail_ template void device_t::launch( @@ -140,7 +140,7 @@ inline void event_t::record(const stream_t& stream) // Note: // TODO: Perhaps check the device ID here, rather than // have the Runtime API call fail? 
- event::detail::enqueue(stream.id(), id_); + event::detail_::enqueue(stream.id(), id_); } inline void event_t::fire(const stream_t& stream) @@ -160,15 +160,15 @@ inline device_t stream_t::device() const noexcept inline void stream_t::enqueue_t::wait(const event_t& event_) { auto device_id = associated_stream.device_id_; - device::current::detail::scoped_override_t set_device_for_this_context(device_id); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id); // Required by the CUDA runtime API; the flags value is currently unused constexpr const unsigned int flags = 0; auto status = cudaStreamWaitEvent(associated_stream.id_, event_.id(), flags); throw_if_error(status, - ::std::string("Failed scheduling a wait for event ") + cuda::detail::ptr_as_hex(event_.id()) - + " on stream " + cuda::detail::ptr_as_hex(associated_stream.id_) + ::std::string("Failed scheduling a wait for event ") + cuda::detail_::ptr_as_hex(event_.id()) + + " on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_) + " on CUDA device " + ::std::to_string(device_id)); } @@ -181,8 +181,8 @@ inline event_t& stream_t::enqueue_t::event(event_t& existing_event) + ::std::to_string(existing_event.device_id()) + " to be triggered by a stream on CUDA device " + ::std::to_string(device_id ) ); } - device::current::detail::scoped_override_t set_device_for_this_context(device_id); - stream::detail::record_event_on_current_device(device_id, associated_stream.id_, existing_event.id()); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id); + stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, existing_event.id()); return existing_event; } @@ -192,11 +192,11 @@ inline event_t stream_t::enqueue_t::event( bool interprocess) { auto device_id = associated_stream.device_id_; - device::current::detail::scoped_override_t set_device_for_this_scope(device_id); + device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - event_t ev { event::detail::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; + event_t ev { event::detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; // Note that, at this point, the event is not associated with this enqueue object's stream. 
- stream::detail::record_event_on_current_device(device_id, associated_stream.id_, ev.id()); + stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, ev.id()); return ev; } @@ -212,25 +212,25 @@ namespace async { inline void copy(void *destination, const void *source, size_t num_bytes, const stream_t& stream) { - detail::copy(destination, source, num_bytes, stream.id()); + detail_::copy(destination, source, num_bytes, stream.id()); } template inline void copy(array_t& destination, const T* source, const stream_t& stream) { - detail::copy(destination, source, stream.id()); + detail_::copy(destination, source, stream.id()); } template inline void copy(T* destination, const array_t& source, const stream_t& stream) { - detail::copy(destination, source, stream.id()); + detail_::copy(destination, source, stream.id()); } template inline void copy_single(T& destination, const T& source, const stream_t& stream) { - detail::copy_single(&destination, &source, sizeof(T), stream.id()); + detail_::copy_single(&destination, &source, sizeof(T), stream.id()); } } // namespace async @@ -239,24 +239,24 @@ namespace device { inline region_t allocate(cuda::device_t device, size_t size_in_bytes) { - return detail::allocate(device.id(), size_in_bytes); + return detail_::allocate(device.id(), size_in_bytes); } namespace async { inline region_t allocate(const stream_t& stream, size_t size_in_bytes) { - return detail::allocate(stream.device().id(), stream.id(), size_in_bytes); + return detail_::allocate(stream.device().id(), stream.id(), size_in_bytes); } inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) { - detail::set(start, byte_value, num_bytes, stream.id()); + detail_::set(start, byte_value, num_bytes, stream.id()); } inline void zero(void* start, size_t num_bytes, const stream_t& stream) { - detail::zero(start, num_bytes, stream.id()); + detail_::zero(start, num_bytes, stream.id()); } } // namespace async @@ -275,8 +275,8 @@ template inline unique_ptr make_unique(device_t device, size_t num_elements) { static_assert(::std::is_array::value, "make_unique(device, num_elements) can only be invoked for T being an array type, T = U[]"); - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); - return cuda::memory::detail::make_unique(num_elements); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); + return cuda::memory::detail_::make_unique(num_elements); } /** @@ -291,46 +291,46 @@ inline unique_ptr make_unique(device_t device, size_t num_elements) template inline unique_ptr make_unique(device_t device) { - cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device.id()); - return cuda::memory::detail::make_unique(); + cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); + return cuda::memory::detail_::make_unique(); } } // namespace device namespace managed { -namespace detail { +namespace detail_ { template inline device_t base_region_t::preferred_location() const { - auto device_id = detail::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); + auto device_id = detail_::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); return cuda::device::get(device_id); } template inline void base_region_t::set_preferred_location(device_t& device) const { - detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id()); + 
detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id());
+
detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id());
 }
 
 template
 inline void base_region_t::clear_preferred_location() const
 {
- detail::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation);
+ detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation);
 }
 
-} // namespace detail
+} // namespace detail_
 
 inline void advise_expected_access_by(const_region_t region, device_t& device)
 {
- detail::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id());
+ detail_::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id());
 }
 
 inline void advise_no_access_expected_by(const_region_t region, device_t& device)
 {
- detail::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id());
+ detail_::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id());
 }
 
 template
@@ -346,7 +346,7 @@ ::std::vector accessors(const_region_t region, const Alloca
 auto status = cudaMemRangeGetAttribute(
 device_ids, sizeof(device_t) * devices.size(),
 cudaMemRangeAttributeAccessedBy, region.start(), region.size());
- throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail::ptr_as_hex(region.start()));
+ throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
 auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId);
 // We may have gotten fewer results than the set of all devices, so let's whittle that down
 
@@ -364,7 +364,7 @@ inline void prefetch(
 cuda::device_t destination,
 const stream_t& stream)
 {
- detail::prefetch(region, destination.id(), stream.id());
+ detail_::prefetch(region, destination.id(), stream.id());
 }
 
 } // namespace async
 
@@ -375,7 +375,7 @@ inline region_t allocate(
 size_t num_bytes,
 initial_visibility_t initial_visibility)
 {
- return detail::allocate(device.id(), num_bytes, initial_visibility);
+ return detail_::allocate(device.id(), num_bytes, initial_visibility);
 }
 
 } // namespace managed
 
@@ -387,7 +387,7 @@ inline region_pair allocate(
 size_t size_in_bytes,
 allocation_options options)
 {
- return cuda::memory::mapped::detail::allocate(device.id(), size_in_bytes, options);
+ return cuda::memory::mapped::detail_::allocate(device.id(), size_in_bytes, options);
 }
 
 } // namespace mapped
 
@@ -398,14 +398,14 @@ inline region_pair allocate(
 
 inline void kernel_t::set_attribute(cudaFuncAttribute attribute, int value)
 {
- device::current::detail::scoped_override_t set_device_for_this_context(device_id_);
+ device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
 auto result = cudaFuncSetAttribute(ptr_, attribute, value);
 throw_if_error(result,
 "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value));
 }
 
 inline void kernel_t::opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel)
 {
- device::current::detail::scoped_override_t set_device_for_this_context(device_id_);
+ device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
 #if CUDART_VERSION >= 9000
 auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributeMaxDynamicSharedMemorySize, amount_required_by_kernel);
 throw_if_error(result,
@@ -439,7 +439,7 @@ kernel_t::min_grid_params_for_max_occupancy(
 disable_caching_override ?
cudaOccupancyDisableCachingOverride : cudaOccupancyDefault ); throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail::ptr_as_hex(ptr_) + + "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) + " on device " + ::std::to_string(device_id_) + "."); return { min_grid_size_in_blocks, block_size }; #endif @@ -464,7 +464,7 @@ kernel_t::min_grid_params_for_max_occupancy( disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault ); throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail::ptr_as_hex(ptr_) + + "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) + " on device " + ::std::to_string(device_id_) + "."); return { min_grid_size_in_blocks, block_size }; #endif @@ -473,7 +473,7 @@ kernel_t::min_grid_params_for_max_occupancy( inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_percentage) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); if (shared_mem_percentage > 100) { throw ::std::invalid_argument("Percentage value can't exceed 100"); } @@ -487,7 +487,7 @@ inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_perc inline kernel::attributes_t kernel_t::attributes() const { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); kernel::attributes_t function_attributes; auto status = cudaFuncGetAttributes(&function_attributes, ptr_); throw_if_error(status, "Failed obtaining attributes for a CUDA device function"); @@ -496,7 +496,7 @@ inline kernel::attributes_t kernel_t::attributes() const inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference); throw_if_error(result, "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " @@ -507,7 +507,7 @@ inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t pr inline void kernel_t::set_shared_memory_bank_size( multiprocessor_shared_memory_bank_size_option_t config) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config); throw_if_error(result); } @@ -517,7 +517,7 @@ inline grid::dimension_t kernel_t::maximum_active_blocks_per_multiprocessor( memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override) { - device::current::detail::scoped_override_t set_device_for_this_context(device_id_); + device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); int result; unsigned int flags = disable_caching_override ? 
 		cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
@@ -541,20 +541,20 @@ inline stream_t create(
 	bool synchronizes_with_default_stream,
 	priority_t priority)
 {
-	return detail::create(device.id(), synchronizes_with_default_stream, priority);
+	return detail_::create(device.id(), synchronizes_with_default_stream, priority);
 }

-namespace detail {
+namespace detail_ {

 inline void record_event_on_current_device(device::id_t device_id, stream::id_t stream_id, event::id_t event_id)
 {
 	auto status = cudaEventRecord(event_id, stream_id);
 	throw_if_error(status,
-		"Failed scheduling event " + cuda::detail::ptr_as_hex(event_id) + " to occur"
-		+ " on stream " + cuda::detail::ptr_as_hex(stream_id)
+		"Failed scheduling event " + cuda::detail_::ptr_as_hex(event_id) + " to occur"
+		+ " on stream " + cuda::detail_::ptr_as_hex(stream_id)
 		+ " on CUDA device " + ::std::to_string(device_id));
 }

-} // namespace detail
+} // namespace detail_

 } // namespace stream

@@ -569,7 +569,7 @@ inline void enqueue_launch(
 	auto unwrapped_kernel_function =
 		kernel::unwrap<
 			Kernel,
-			detail::kernel_parameter_decay_t<KernelParameters>...
+			detail_::kernel_parameter_decay_t<KernelParameters>...
 		>(kernel_function);
 	// Note: This helper function is necessary since we may have gotten a
 	// kernel_t as Kernel, which is type-erased - in
 	// we massage those a bit.

 #ifdef DEBUG
-	assert(thread_block_cooperation == detail::intrinsic_block_cooperation_value,
+	assert(thread_block_cooperation == detail_::intrinsic_block_cooperation_value,
 		"mismatched indications of whether thread block should be able to cooperate for a kernel");
 #endif
-	detail::enqueue_launch(
+	detail_::enqueue_launch(
 		thread_block_cooperation,
 		unwrapped_kernel_function,
 		stream.id(),
diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp
index 79d1f72a..8a88a266 100644
--- a/src/cuda/api/pci_id.hpp
+++ b/src/cuda/api/pci_id.hpp
@@ -40,7 +40,7 @@ struct pci_location_t {
 	// In lieu of making this class a variant with 3 type combinations.
 };

-namespace detail {
+namespace detail_ {

 /**
  * Obtain a CUDA device id for a PCIe bus device
@@ -58,7 +58,7 @@ inline id_t resolve_id(pci_location_t pci_id)
 	return cuda_device_id;
 }

-} // namespace detail
+} // namespace detail_

 } // namespace device

diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp
index 1bfa5367..1e8fb1f6 100644
--- a/src/cuda/api/pointer.hpp
+++ b/src/cuda/api/pointer.hpp
@@ -110,7 +110,7 @@ class pointer_t {
 	{
 		pointer::attributes_t the_attributes;
 		auto status = cudaPointerGetAttributes (&the_attributes, ptr_);
-		throw_if_error(status, "Failed obtaining attributes of pointer " + cuda::detail::ptr_as_hex(ptr_));
+		throw_if_error(status, "Failed obtaining attributes of pointer " + cuda::detail_::ptr_as_hex(ptr_));
 		return the_attributes;
 	}
 	device_t device() const noexcept;

diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 510ab554..2dc7fdfe 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -38,7 +38,7 @@ enum : bool {
 	async = no_implicit_synchronization_with_default_stream,
 };

-namespace detail {
+namespace detail_ {

 inline id_t create_on_current_device(
 	bool synchronizes_with_default_stream,
@@ -51,7 +51,7 @@ inline id_t create_on_current_device(
 	auto status = cudaStreamCreateWithPriority(&new_stream_id, flags, priority);
 	cuda::throw_if_error(status,
 		::std::string("Failed creating a new stream on CUDA device ")
-		+ ::std::to_string(device::current::detail::get_id()));
+		+ ::std::to_string(device::current::detail_::get_id()));
 	return new_stream_id;
 }
@@ -71,7 +71,7 @@ inline id_t create_on_current_device(
  */
 inline bool is_associated_with(stream::id_t stream_id, device::id_t device_id)
 {
-	device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
+	device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
 	auto status = cudaStreamQuery(stream_id);
 	switch(status) {
 	case cudaSuccess:
@@ -103,7 +103,7 @@ inline device::id_t associated_device(stream::id_t stream_id)
 		if (is_associated_with(stream_id, device_index)) { return device_index; }
 	}
 	throw ::std::runtime_error(
-		"Could not find any device associated with stream " + cuda::detail::ptr_as_hex(stream_id));
+		"Could not find any device associated with stream " + cuda::detail_::ptr_as_hex(stream_id));
 }

 inline void record_event_on_current_device(device::id_t current_device_id, stream::id_t stream_id, event::id_t event_id);
@@ -120,7 +120,7 @@ stream_t wrap(
 	id_t stream_id,
 	bool take_ownership = false) noexcept;

-} // namespace detail
+} // namespace detail_

 } // namespace stream
@@ -146,7 +146,7 @@ class stream_t {
 	};

 protected: // type definitions
-	using device_setter_type = device::current::detail::scoped_override_t;
+	using device_setter_type = device::current::detail_::scoped_override_t;

 public: // const getters
@@ -203,7 +203,7 @@ class stream_t {
 		default:
 			throw(cuda::runtime_error(status,
 				"unexpected status returned from cudaStreamQuery() for stream "
-				+ detail::ptr_as_hex(id_)));
+				+ detail_::ptr_as_hex(id_)));
 		}
 	}
@@ -350,7 +350,7 @@ class stream_t {
 	{
 		// It is not necessary to make the device current, according to:
 		// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-and-event-behavior
-		memory::async::detail::copy(destination, source, num_bytes, associated_stream.id_);
+		memory::async::detail_::copy(destination, source, num_bytes, associated_stream.id_);
 	}

 	void copy(void* destination, memory::const_region_t source, size_t num_bytes)
@@ -386,7 +386,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		device_setter_type set_device_for_this_scope(associated_stream.device_id_);
-		memory::device::async::detail::set(destination, byte_value, num_bytes, associated_stream.id_);
+		memory::device::async::detail_::set(destination, byte_value, num_bytes, associated_stream.id_);
 	}

 	/**
@@ -404,7 +404,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		device_setter_type set_device_for_this_scope(associated_stream.device_id_);
-		memory::device::async::detail::zero(destination, num_bytes, associated_stream.id_);
+		memory::device::async::detail_::zero(destination, num_bytes, associated_stream.id_);
 	}

 	/**
@@ -480,7 +480,7 @@ class stream_t {
 		throw_if_error(status,
 			::std::string("Failed scheduling a callback to be launched")
-			+ " on stream " + cuda::detail::ptr_as_hex(associated_stream.id_)
+			+ " on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_)
 			+ " on CUDA device " + ::std::to_string(associated_stream.device_id_));
 	}
@@ -534,7 +534,7 @@ class stream_t {
 			associated_stream.id_, managed_region_start, length, flags);
 		throw_if_error(status,
 			"Failed scheduling an attachment of a managed memory region"
-			" on stream " + cuda::detail::ptr_as_hex(associated_stream.id_)
+			" on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_)
 			+ " on CUDA device " + ::std::to_string(associated_stream.device_id_));
 	}
@@ -609,7 +609,7 @@ class stream_t {

 public: // friendship
-	friend stream_t stream::detail::wrap(device::id_t device_id, stream::id_t stream_id, bool take_ownership) noexcept;
+	friend stream_t stream::detail_::wrap(device::id_t device_id, stream::id_t stream_id, bool take_ownership) noexcept;

 	friend inline bool operator==(const stream_t& lhs, const stream_t& rhs) noexcept
 	{
@@ -635,7 +635,7 @@ inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept

 namespace stream {

-namespace detail {
+namespace detail_ {

 /**
  * @brief Wrap an existing stream in a @ref stream_t instance.
  *
@@ -663,13 +663,13 @@ inline stream_t create(
 	bool synchronizes_with_default_stream,
 	priority_t priority = stream::default_priority)
 {
-	device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
-	auto new_stream_id = cuda::stream::detail::create_on_current_device(
+	device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+	auto new_stream_id = cuda::stream::detail_::create_on_current_device(
 		synchronizes_with_default_stream, priority);
 	return wrap(device_id, new_stream_id, do_take_ownership);
 }

-} // namespace detail
+} // namespace detail_

 /**
  * @brief Create a new stream (= queue) on a CUDA device.
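The rename is invisible to user code: only internal namespaces change, and callers keep going through the public wrappers, which delegate into `detail_` on their behalf. A minimal sketch of unaffected caller code - not part of the patch, and assuming the headers changed above plus a public `cuda::stream::create()` overload with a defaulted priority parameter:

	#include <cuda/api/device.hpp>
	#include <cuda/api/stream.hpp>

	void user_code_unaffected_by_the_rename()
	{
		auto device = cuda::device::get(0);
		// Public wrapper call; under the hood it now goes through
		// stream::detail_::create(device.id(), ...), which applies
		// device::current::detail_::scoped_override_t and then calls
		// detail_::create_on_current_device(). The caller never names detail_.
		auto stream = cuda::stream::create(device, cuda::stream::async);
		(void) stream;
	}
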
diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp
index da042ffa..8acd7b54 100644
--- a/src/cuda/api/texture_view.hpp
+++ b/src/cuda/api/texture_view.hpp
@@ -42,11 +42,11 @@ struct descriptor_t : public cudaTextureDesc {
 	}
 };

-namespace detail {
+namespace detail_ {

 inline texture_view wrap(texture::raw_handle_t handle, bool take_ownership) noexcept;

-} // namespace detail
+} // namespace detail_

 } // namespace texture
@@ -122,7 +122,7 @@ class texture_view {

 public: // friendship
-	friend texture_view texture::detail::wrap(raw_handle_type handle, bool take_ownersip) noexcept;
+	friend texture_view texture::detail_::wrap(raw_handle_type handle, bool take_ownership) noexcept;

 protected:
 	raw_handle_type raw_view_handle { } ;
@@ -141,14 +141,14 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep
 }

 namespace texture {
-namespace detail {
+namespace detail_ {

 inline texture_view wrap(texture::raw_handle_t handle, bool take_ownership) noexcept
 {
 	return texture_view(handle, take_ownership);
 }

-} // namespace detail
+} // namespace detail_
 } // namespace texture
 } // namespace cuda

diff --git a/src/cuda/api/unique_ptr.hpp b/src/cuda/api/unique_ptr.hpp
index a5579e9b..0c0ffffb 100644
--- a/src/cuda/api/unique_ptr.hpp
+++ b/src/cuda/api/unique_ptr.hpp
@@ -13,7 +13,7 @@ namespace cuda {
 namespace memory {

-namespace detail {
+namespace detail_ {

 template <typename T>
@@ -29,15 +29,15 @@ template struct make_unique_selector
-inline typename detail::make_unique_selector<T>::non_array make_unique()
+inline typename detail_::make_unique_selector<T>::non_array make_unique()
 {
 	static_assert(::std::is_trivially_constructible<T>::value,
 		"Allocating with non-trivial construction on the device is not supported.");
 	auto space_ptr = Allocator()(sizeof(T));
-	return typename detail::make_unique_selector<T>::non_array(static_cast<T*>(space_ptr));
+	return typename detail_::make_unique_selector<T>::non_array(static_cast<T*>(space_ptr));
 }

 template <typename T, typename Allocator>
-inline typename detail::make_unique_selector<T>::unbounded_array make_unique(size_t num_elements)
+inline typename detail_::make_unique_selector<T>::unbounded_array make_unique(size_t num_elements)
 {
 	// If this function is instantiated, T is of the form "element_type[]"
 	using element_type = typename ::std::remove_extent<T>::type;
@@ -46,19 +46,19 @@ inline typename detail::make_unique_selector<T>::unbounded_array make_u
 	static_assert(::std::is_trivially_constructible<element_type>::value,
 		"Allocating with non-trivial construction on the device is not supported.");
 	void* space_ptr = Allocator()(sizeof(element_type) * num_elements);
-	return typename detail::make_unique_selector<T>::unbounded_array(static_cast<element_type*>(space_ptr));
+	return typename detail_::make_unique_selector<T>::unbounded_array(static_cast<element_type*>(space_ptr));
 }

 template <typename T, typename Allocator, typename... Args>
-inline typename detail::make_unique_selector<T>::bounded_array make_unique(Args&&...) = delete;
+inline typename detail_::make_unique_selector<T>::bounded_array make_unique(Args&&...) = delete;

-using deleter = device::detail::deleter;
+using deleter = device::detail_::deleter;

 template <typename T>
 inline ::std::unique_ptr<T, deleter> make_unique(cuda::device::id_t device_id, size_t n)
 {
-	cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
-	return memory::detail::make_unique(n);
+	cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);
+	return memory::detail_::make_unique(n);
 }

 template <typename T>
@@ -66,17 +66,17 @@ inline ::std::unique_ptr<T, deleter> make_unique(cuda::device::id_t device_id)
 {
-	cuda::device::current::detail::scoped_override_t set_device_for_this_scope(device_id);
+	cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);

-	return memory::detail::make_unique();
+	return memory::detail_::make_unique();
 }

-} // namespace detail
+} // namespace detail_

 namespace device {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(cuda::device_t device, size_t n);
@@ -96,18 +96,18 @@ inline unique_ptr<T> make_unique(T* raw_ptr)

 namespace host {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(size_t n)
 {
-	return cuda::memory::detail::make_unique(n);
+	return cuda::memory::detail_::make_unique(n);
 }

 template <typename T>
 inline unique_ptr<T> make_unique()
 {
-	return cuda::memory::detail::make_unique();
+	return cuda::memory::detail_::make_unique();
 }

 } // namespace host
@@ -115,7 +115,7 @@ inline unique_ptr<T> make_unique()

 namespace managed {

 template <typename T>
-using unique_ptr = ::std::unique_ptr<T, detail::deleter>;
+using unique_ptr = ::std::unique_ptr<T, detail_::deleter>;

 template <typename T>
 inline unique_ptr<T> make_unique(
@@ -123,11 +123,11 @@ inline unique_ptr<T> make_unique(
 	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
 	return (initial_visibility == initial_visibility_t::to_all_devices) ?
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_all_devices>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_all_devices>, detail_::deleter
 		>(n) :
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail_::deleter
 		>(n);
 }

 template <typename T>
 inline unique_ptr<T> make_unique(
 	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
 {
 	return (initial_visibility == initial_visibility_t::to_all_devices) ?
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_all_devices>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_all_devices>, detail_::deleter
 		>() :
-		cuda::memory::detail::make_unique<T, detail::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail::deleter
+		cuda::memory::detail_::make_unique<T, detail_::allocator<initial_visibility_t::to_supporters_of_concurrent_managed_access>, detail_::deleter
 		>();
 }

diff --git a/src/cuda/common/types.hpp b/src/cuda/common/types.hpp
index 0453867e..b55d4092 100644
--- a/src/cuda/common/types.hpp
+++ b/src/cuda/common/types.hpp
@@ -485,7 +485,7 @@ using pair_attribute_t = cudaDeviceP2PAttr;

 } // namespace device

-namespace detail {
+namespace detail_ {

 /**
  * @brief adapt a type to be usable as a kernel parameter.
@@ -514,7 +514,7 @@ struct kernel_parameter_decay {
 template <typename P>
 using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;

-} // namespace detail
+} // namespace detail_

 /**
  * Scheduling policies the Runtime API may use when the host-side

diff --git a/src/cuda/nvtx/profiling.cpp b/src/cuda/nvtx/profiling.cpp
index 02eb6ddf..73fe35aa 100644
--- a/src/cuda/nvtx/profiling.cpp
+++ b/src/cuda/nvtx/profiling.cpp
@@ -25,13 +25,13 @@ namespace profiling {

 namespace mark {

-namespace detail {
+namespace detail_ {
 	static ::std::mutex profiler_mutex; // To prevent multiple threads from accessing the profiler simultaneously
 }

 void point(const ::std::string& description, color_t color)
 {
-	::std::lock_guard<::std::mutex> { detail::profiler_mutex };
+	::std::lock_guard<::std::mutex> lock { detail_::profiler_mutex };
 	// logging?
 	nvtxEventAttributes_t eventAttrib = {0};
 	eventAttrib.version = NVTX_VERSION;
@@ -47,7 +47,7 @@ range::handle_t range_start(
 	const ::std::string& description, ::cuda::profiling::range::type_t type, color_t color)
 {
 	(void) type; // Currently not doing anything with the type; maybe in the future
-	::std::lock_guard<::std::mutex> { detail::profiler_mutex };
+	::std::lock_guard<::std::mutex> lock { detail_::profiler_mutex };
 	nvtxEventAttributes_t range_attributes;
 	range_attributes.version = NVTX_VERSION;
 	range_attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;