From bbfd6ea155a26b3bdb07863dd0d19223bd7ffc30 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 6 Jul 2020 11:37:55 +0300 Subject: [PATCH] Fixes #9: A near-complete revamp of the APIs, now taking the driver API into account and exposing most of its functionality. --- .github/workflows/cmake-build-linux.yml | 30 +- .github/workflows/cmake-build-windows.yml | 24 +- CMakeLists.txt | 12 +- examples/CMakeLists.txt | 5 +- .../context_management.cpp | 191 +++ .../device_management.cpp | 43 +- .../by_runtime_api_module/error_handling.cu | 34 +- .../by_runtime_api_module/event_management.cu | 4 +- .../execution_control.cu | 13 +- examples/by_runtime_api_module/ipc.cpp | 7 +- .../stream_management.cu | 2 +- .../unified_addressing.cpp | 135 +- .../version_management.cpp | 6 +- examples/common.hpp | 109 +- .../binaryPartitionCG/binaryPartitionCG.cu | 2 +- .../modified_cuda_samples/helper_cuda.hpp | 1 - .../inlinePTX/inlinePTX.cu | 20 +- .../p2pBandwidthLatencyTest.cu | 4 +- .../simpleDrvRuntimePTX.cpp | 175 ++ .../simpleIPC/simpleIPC.cu | 9 +- .../simpleStreams/simpleStreams.cu | 18 +- .../vectorAdd/vectorAdd.cu | 4 +- .../vectorAddManaged/vectorAddManaged.cu | 3 +- .../vectorAddMapped/vectorAddMapped.cu | 2 +- examples/other/array_management.cu | 2 +- .../main.cpp | 2 +- .../second_tu.cpp | 2 +- .../other/io_compute_overlap_with_streams.cu | 197 +-- examples/other/manipulate_current_device.cu | 59 + src/cuda/api.hpp | 50 + src/cuda/api/apriori_compiled_kernel.hpp | 145 ++ src/cuda/api/array.hpp | 120 +- src/cuda/api/constants.hpp | 12 +- src/cuda/api/context.hpp | 850 +++++++++ src/cuda/api/current_context.hpp | 241 +++ src/cuda/api/current_device.hpp | 149 +- src/cuda/api/detail/device_properties.hpp | 74 +- src/cuda/api/device.hpp | 672 ++++---- src/cuda/api/device_properties.hpp | 10 +- src/cuda/api/devices.hpp | 7 +- src/cuda/api/error.hpp | 281 ++- src/cuda/api/event.hpp | 175 +- src/cuda/api/ipc.hpp | 61 +- src/cuda/api/kernel.hpp | 446 +++-- src/cuda/api/kernel_launch.hpp | 184 +- src/cuda/api/link.hpp | 215 +++ src/cuda/api/link_options.hpp | 339 ++++ src/cuda/api/memory.hpp | 1346 ++++++++++----- src/cuda/api/miscellany.hpp | 51 +- src/cuda/api/module.hpp | 355 ++++ src/cuda/api/multi_wrapper_impls.hpp | 1524 +++++++++++++---- src/cuda/api/pci_id.hpp | 6 +- src/cuda/api/pci_id_impl.hpp | 1 + src/cuda/api/peer_to_peer.hpp | 187 +- src/cuda/api/pointer.hpp | 155 +- src/cuda/api/primary_context.hpp | 328 ++++ src/cuda/api/stream.hpp | 537 ++++-- src/cuda/api/texture_view.hpp | 72 +- src/cuda/{common => api}/types.hpp | 413 +++-- src/cuda/api/unique_ptr.hpp | 106 +- src/cuda/api/versions.hpp | 14 +- src/cuda/api/virtual_memory.hpp | 524 ++++++ src/cuda/nvtx/profiling.hpp | 2 +- src/cuda/runtime_api.hpp | 7 +- 64 files changed, 8434 insertions(+), 2340 deletions(-) create mode 100644 examples/by_driver_api_module/context_management.cpp create mode 100644 examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp create mode 100644 examples/other/manipulate_current_device.cu create mode 100644 src/cuda/api.hpp create mode 100644 src/cuda/api/apriori_compiled_kernel.hpp create mode 100644 src/cuda/api/context.hpp create mode 100644 src/cuda/api/current_context.hpp create mode 100644 src/cuda/api/link.hpp create mode 100644 src/cuda/api/link_options.hpp create mode 100644 src/cuda/api/module.hpp create mode 100644 src/cuda/api/primary_context.hpp rename src/cuda/{common => api}/types.hpp (65%) create mode 100644 src/cuda/api/virtual_memory.hpp diff --git 
a/.github/workflows/cmake-build-linux.yml b/.github/workflows/cmake-build-linux.yml index 922f37c2..d4e8956e 100644 --- a/.github/workflows/cmake-build-linux.yml +++ b/.github/workflows/cmake-build-linux.yml @@ -63,21 +63,21 @@ jobs: gcc: 9 # may fail with gcc-10 due to an internal compiler error shell: "bash" cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.2" - gcc: 8 - shell: "bash" - cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.1" - gcc: 8 - shell: "bash" - cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.0" - gcc: 7 # fails with GCC 8 - no supported in CUDA 10.0 - shell: "bash" - cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.2" +# gcc: 8 +# shell: "bash" +# cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.1" +# gcc: 8 +# shell: "bash" +# cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.0" +# gcc: 7 # fails with GCC 8 - no supported in CUDA 10.0 +# shell: "bash" +# cmake-generator: "Unix Makefiles" # GitHub has remoted ubuntu-16.04 runnings, # so we're not testing builds with older CUDA versions # - os: ubuntu-16.04 diff --git a/.github/workflows/cmake-build-windows.yml b/.github/workflows/cmake-build-windows.yml index c24f684f..c9cc16c2 100644 --- a/.github/workflows/cmake-build-windows.yml +++ b/.github/workflows/cmake-build-windows.yml @@ -66,18 +66,18 @@ jobs: shell: "powershell" os-type: "windows" cmake-platform-flag: "-A x64" - - os: windows-2019 - cuda: "10.2.89" - visual-studio: "Visual Studio 16 2019" - shell: "powershell" - os-type: "windows" - cmake-platform-flag: "-A x64" - - os: windows-2019 - cuda: "10.1.243" - visual-studio: "Visual Studio 16 2019" - shell: "powershell" - os-type: "windows" - cmake-platform-flag: "-A x64" +# - os: windows-2019 +# cuda: "10.2.89" +# visual-studio: "Visual Studio 16 2019" +# shell: "powershell" +# os-type: "windows" +# cmake-platform-flag: "-A x64" +# - os: windows-2019 +# cuda: "10.1.243" +# visual-studio: "Visual Studio 16 2019" +# shell: "powershell" +# os-type: "windows" +# cmake-platform-flag: "-A x64" # Windows2016 & VS 2017 supports 10.0+ # - os: windows-2016 diff --git a/CMakeLists.txt b/CMakeLists.txt index f07388c7..d4fcbfb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,14 +17,14 @@ if(WIN32 AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") endif() PROJECT(cuda-api-wrappers - VERSION 0.4.4 + VERSION 0.5.0 DESCRIPTION "Thin C++-flavored wrappers for the CUDA Runtime API" HOMEPAGE_URL https://github.com/eyalroz/cuda-api-wrappers LANGUAGES CUDA CXX) include(GNUInstallDirs) -find_package(CUDAToolkit REQUIRED) +find_package(CUDAToolkit 11.0 REQUIRED) find_package(Threads REQUIRED) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) @@ -35,9 +35,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "lib/") # Our library targets # ------------------- -add_library(runtime-api INTERFACE) # A header-only library! +add_library(runtime-and-driver INTERFACE) # A header-only library! 
add_library(nvtx) -set(wrapper-libraries runtime-api nvtx) +set(wrapper-libraries runtime-and-driver nvtx) foreach(WRAPPER_LIB ${wrapper-libraries}) target_compile_features(${WRAPPER_LIB} INTERFACE cxx_std_11) # This means _at least_ C++11 @@ -47,11 +47,11 @@ foreach(WRAPPER_LIB ${wrapper-libraries}) "$" "$" ) - target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart) # CUDA::cuda_driver) + target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver) endforeach() set_target_properties(nvtx PROPERTIES OUTPUT_NAME "cuda-nvtx-wrappers") -target_link_libraries(nvtx PUBLIC runtime-api) +target_link_libraries(nvtx PUBLIC runtime-and-driver) target_link_libraries(nvtx PRIVATE Threads::Threads CUDA::nvToolsExt) set_property(TARGET nvtx PROPERTY CXX_STANDARD 11) set_property(TARGET nvtx PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b98970e7..b928cdf0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -28,12 +28,13 @@ set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_EXTENSIONS OFF) -link_libraries(runtime-api) +link_libraries(runtime-and-driver) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin") add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu) add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu) add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu) +add_executable(simpleDrvRuntimePTX modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp) add_executable(inlinePTX modified_cuda_samples/inlinePTX/inlinePTX.cu) add_executable(simpleStreams modified_cuda_samples/simpleStreams/simpleStreams.cu) add_executable(simpleIPC modified_cuda_samples/simpleIPC/simpleIPC.cu) @@ -50,12 +51,14 @@ add_dependencies(modified_cuda_samples vectorAdd inlinePTX simpleStreams simpleI add_executable(version_management by_runtime_api_module/version_management.cpp) add_executable(error_handling by_runtime_api_module/error_handling.cu) add_executable(device_management by_runtime_api_module/device_management.cpp) +add_executable(context_management by_driver_api_module/context_management.cpp) add_executable(execution_control by_runtime_api_module/execution_control.cu) add_executable(stream_management by_runtime_api_module/stream_management.cu) add_executable(event_management by_runtime_api_module/event_management.cu) add_executable(unified_addressing by_runtime_api_module/unified_addressing.cpp) add_executable(io_compute_overlap_with_streams other/io_compute_overlap_with_streams.cu) +add_executable(manipulate_current_device other/manipulate_current_device.cu) add_executable(inclusion_in_two_translation_units other/inclusion_in_two_translation_units/main.cpp other/inclusion_in_two_translation_units/second_tu.cpp ) if(NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang") diff --git a/examples/by_driver_api_module/context_management.cpp b/examples/by_driver_api_module/context_management.cpp new file mode 100644 index 00000000..53fe6f31 --- /dev/null +++ b/examples/by_driver_api_module/context_management.cpp @@ -0,0 +1,191 @@ +/** + * An example program utilizing most/all calls from the CUDA + * Driver API module: + * + * Device Management + */ +#include "../common.hpp" + +void current_context_manipulation(const cuda::device_t &device, const cuda::device::primary_context_t &pc, + const cuda::context_t &created_context); + +void test_context( + const cuda::context_t& context, + bool is_primary, + 
cuda::device::id_t device_id) +{ + std::cout << "Testing " << (is_primary ? "" : "non-") << "primary context " << context << '\n'; + if (context.device_id() != device_id) { + die_("The device's primary context's reported ID and the device wrapper's ID differ: " + + std::to_string(context.device_id()) + " !=" + std::to_string(device_id)); + } + + if (context.device().id() != device_id) { + die_("The context's associated device's ID is not the same as that of the device for which we obtained the context: " + + std::to_string(context.device().id()) + " !=" + std::to_string(device_id) ); + } + + if (context.is_primary() != is_primary) { + die_(std::string("The ") + (is_primary ? "" : "non-") + "primary context " + std::to_string(context) + + " \"believes\" it is " + (is_primary ? "not " : "") + "primary."); + } + + // Specific attributes and properties with their own API calls: + // L1/shared mem (CacheConfig), shared memory bank size (SharedMemConfig) + // and stream priority range + // ---------------------------------------------------------------- + + auto cache_preference = context.cache_preference(); + std::cout << "The cache preference for context " << context << " is: " << cache_preference << ".\n"; + + auto new_cache_preference = + cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ? + cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 : + cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory; + context.set_cache_preference(new_cache_preference); + cache_preference = context.cache_preference(); + assert_(cache_preference == new_cache_preference); + std::cout << "The cache preference for context " << context << " has now been set to: " << new_cache_preference << ".\n"; + + auto shared_mem_bank_size = context.shared_memory_bank_size(); + shared_mem_bank_size = + (shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ? + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; + context.set_shared_memory_bank_size(shared_mem_bank_size); + auto stream_priority_range = context.stream_priority_range(); + if (stream_priority_range.is_trivial()) { + std::cout << "Context " << context << " does not support stream priorities. " + "All streams will have the same (default) priority.\n"; + } + else { + std::cout << "Streams in context " << context << " have priorities between " + << stream_priority_range.least << " (highest numeric value, least prioritized) and " + << std::to_string(stream_priority_range.greatest) << "(lowest numeric values, most prioritized).\n"; + assert(stream_priority_range.least > stream_priority_range.greatest); + } + + // Resource limits + // -------------------- + + auto printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + std::cout << "The printf FIFO size for context " << context << " is " << printf_fifo_size << ".\n"; + decltype(printf_fifo_size) new_printf_fifo_size = + (printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512; + context.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size); + printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + assert_(printf_fifo_size == new_printf_fifo_size); + + // Flags - yes, yet another kind of attribute/property + // ---------------------------------------------------- + + std::cout << "Context " << context << " uses a" + << (context.synch_scheduling_policy() ? 
" synchronous" : "n asynchronous") + << " scheduling policy.\n"; + std::cout << "Context " << context << " is set to " + << (context.keeping_larger_local_mem_after_resize() ? "keep" : "discard") + << " shared memory allocation after launch.\n"; + // TODO: Change the settings as well obtaining them + +} + +void current_context_manipulation( + cuda::device_t &device, + cuda::device::primary_context_t &pc, + cuda::context_t &created_context) +{ + cuda::context_t context_0 = pc; + cuda::context_t context_1 = created_context; + cuda::context::current::set(context_0); + assert_(cuda::context::current::get() == context_0); + assert_(cuda::context::current::detail_::get_handle() == context_0.handle()); + cuda::context::current::set(context_1); + assert_(cuda::context::current::get() == context_1); + assert_(cuda::context::current::detail_::get_handle() == context_1.handle()); + + + auto context_2 = cuda::context::create(device); + { + cuda::context::current::scoped_override_t context_for_this_block { context_2 }; + assert_(context_2.handle() == cuda::context::current::get().handle()); + assert_(context_2 == cuda::context::current::get()); + } + auto gotten = cuda::context::current::get(); + assert_(gotten == context_1); + + auto context_3 = cuda::context::create_and_push(device); + +// std::cout << "Contexts:\n"; +// std::cout << "context_0: " << context_0 << '\n'; +// std::cout << "context_1: " << context_1 << '\n'; +// std::cout << "context_2: " << context_2 << '\n'; +// std::cout << "context_3: " << context_3 << '\n'; + + { + cuda::context::current::scoped_override_t context_for_this_block { context_3 }; + assert_(context_3.handle() == cuda::context::current::get().handle()); + assert_(context_3 == cuda::context::current::get()); + } + + { + auto popped = cuda::context::current::pop(); + assert_(popped == context_3); + } + gotten = cuda::context::current::get(); + assert_(gotten == context_1); +} + + +int main(int argc, char **argv) +{ + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + + if (cuda::device::count() <= device_id) { + die_("No CUDA device with ID " + std::to_string(device_id)); + } + + auto device = cuda::device::get(device_id); + + std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n"; + +// report_context_stack("Before anything is done"); + auto pc = device.primary_context(); +// report_context_stack("After getting the primary context"); + + + cuda::context::current::push(pc); + constexpr const bool is_primary = true; + constexpr const bool isnt_primary = false; + test_context(pc, is_primary, device_id); + + { + auto popped = cuda::context::current::pop(); + if (popped != pc) { + die_("After pushing context " + std::to_string(pc) + " and popping it - the pop result is a different context, " + std::to_string(popped)); + } + } + + auto created_context = cuda::context::create(device); + test_context(created_context, isnt_primary, device_id); + current_context_manipulation(device, pc, created_context); + + std::cout << std::endl; +// report_context_stack("After current_context_manipulation"); + cuda::context::current::push(created_context); + cuda::context::current::push(created_context); + // We should have 3 copies of created_context on the stack at this point, and nothing else + cudaSetDevice(device_id); +// report_context_stack("After cudaSetDevice " + std::to_string(device_id)); + // We should have the primary context of the device + + + device.synchronize(); + device.reset(); + + std::cout << "\nSUCCESS\n"; +} diff --git a/examples/by_runtime_api_module/device_management.cpp b/examples/by_runtime_api_module/device_management.cpp index d5fb6d4b..8aa7de52 100644 --- a/examples/by_runtime_api_module/device_management.cpp +++ b/examples/by_runtime_api_module/device_management.cpp @@ -41,7 +41,7 @@ void attributes_and_properties() { auto device = cuda::device::current::get(); - auto max_registers_per_block = device.get_attribute(cudaDevAttrMaxRegistersPerBlock); + auto max_registers_per_block = device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); std::cout << "Maximum number of registers per block on this device: " << max_registers_per_block << "\n"; @@ -55,7 +55,7 @@ void pci_bus_id() auto pci_id = device.pci_id(); std::string pci_id_str(pci_id); - cuda::outstanding_error::ensure_none(cuda::do_clear_errors); + cuda::outstanding_error::ensure_none(); auto re_obtained_device = cuda::device::get(pci_id_str); assert_(re_obtained_device == device); @@ -67,21 +67,19 @@ void global_memory() auto device = cuda::device::current::get(); auto device_global_mem = device.memory(); + auto total_memory = device_global_mem.amount_total(); + auto free_memory = device_global_mem.amount_total(); - assert_(device_global_mem.associated_device() == device); + std::cout + << "Device " << std::to_string(device.id()) << " reports it has " + << free_memory << " bytes free out of " << total_memory << " bytes total global memory " + << "(" << (total_memory - free_memory) << " bytes used).\n"; - if (device.id() != device.memory().associated_device().id()) { - die_("The device's reported ID and the device's memory object's reported device ID differ: " + if (device != device.memory().associated_device()) { + die_("The device's reported ID and the device's memory object's reported devices differ: " + std::to_string(device.id()) + " !=" + std::to_string(device.memory().associated_device().id())); } - auto total_memory = device_global_mem.amount_total(); - auto free_memory = 
device_global_mem.amount_total(); - - std::cout - << "Device " << std::to_string(device.id()) << " reports it has:\n" - << free_memory << " Bytes free out of " << total_memory << " Bytes total global memory.\n"; - assert_(free_memory <= total_memory); } @@ -91,16 +89,21 @@ void global_memory() void shared_memory() { auto device = cuda::device::current::get(); +// auto primary_context = device.primary_context(); +// report_context_stack("After getting the current device (which is " + std::to_string(device.id()) + ')'); auto reported_cache_preference = device.cache_preference(); std::cout << "The cache preference for device " << device.id() << " is: \"" << reported_cache_preference << "\".\n"; +// report_context_stack("After getting the cache preference for device " + std::to_string(device.id())); + auto applied_cache_preference = reported_cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ? cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 : cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory; device.set_cache_preference(applied_cache_preference); +// report_context_stack("After setting cache pref"); reported_cache_preference = device.cache_preference(); if (reported_cache_preference != applied_cache_preference) { std::cerr << "After setting cache preference to \"" @@ -119,8 +122,8 @@ void shared_memory() std::cout << "The reported shared memory bank size for device " << device.id() << " is: " << bank_size_names[reported_shared_mem_bank_size] << '.' << std::endl; auto applied_shared_mem_bank_size = - (reported_shared_mem_bank_size == cudaSharedMemBankSizeFourByte) ? - cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte; + (reported_shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ? + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; device.set_shared_memory_bank_size(applied_shared_mem_bank_size); // We can't reliably check the bank size setting succeeded, since some devices, which @@ -155,12 +158,12 @@ void limits() { auto device = cuda::device::current::get(); - auto printf_fifo_size = device.get_limit(cudaLimitPrintfFifoSize); + auto printf_fifo_size = device.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); std::cout << "The printf FIFO size for device " << device.id() << " is " << printf_fifo_size << ".\n"; decltype(printf_fifo_size) new_printf_fifo_size = (printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512; - device.set_limit(cudaLimitPrintfFifoSize, new_printf_fifo_size); - printf_fifo_size = device.get_limit(cudaLimitPrintfFifoSize); + device.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size); + printf_fifo_size = device.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); assert_(printf_fifo_size == new_printf_fifo_size); } @@ -187,7 +190,7 @@ void peer_to_peer(std::pair peer_ids) auto peer = cuda::device::get(peer_ids.second); if (device.can_access(peer)) { auto atomics_supported_over_link = cuda::device::peer_to_peer::get_attribute( - cudaDevP2PAttrNativeAtomicSupported, device, peer); + cuda::device::peer_to_peer::native_atomics_support, device, peer); std::cout << "Native atomics are " << (atomics_supported_over_link ? 
"" : "not ") << "supported over the link from device " << device.id() @@ -225,7 +228,9 @@ void current_device_manipulation() (void) e; // This avoids a spurious warning in MSVC 16.11 assert_(e.code() == cuda::status::invalid_device); // We expected to get this exception, just clear it - cuda::outstanding_error::clear(); + cuda::outstanding_error::ensure_none( + "The attempt to set the current device to an invalid value should not " + "create an outstanding error"); } // Iterate over all devices diff --git a/examples/by_runtime_api_module/error_handling.cu b/examples/by_runtime_api_module/error_handling.cu index 117e2c23..ebfd045d 100644 --- a/examples/by_runtime_api_module/error_handling.cu +++ b/examples/by_runtime_api_module/error_handling.cu @@ -27,40 +27,10 @@ int main(int, char **) } try { cuda::outstanding_error::ensure_none(); - die_("An exception should have be thrown when ensuring there were no outstanding errors (as we had just triggered one)"); } - catch(cuda::runtime_error&) { } - - cuda::outstanding_error::ensure_none(); - - // An exception was not thrown, since by default, - // ensure_no_outstanding_error() clears the error it finds - - // ... Let's do the whole thing again, but this time _without_ - // clearing the error - - try { - cuda::device::current::detail_::set(device_count); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); + catch(cuda::runtime_error&) { + die_("An error was outstanding, despite our not having committed any 'sticky' errors)"); } - catch(cuda::runtime_error&) { } - - try { - cuda::outstanding_error::ensure_none(cuda::dont_clear_errors); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); - } - catch(cuda::runtime_error&) { } - - try { - cuda::outstanding_error::ensure_none(cuda::dont_clear_errors); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); - } - catch(cuda::runtime_error&) { } - - // This time around, repeated calls to ensure_no_outstanding_error do throw... - - cuda::outstanding_error::clear(); - cuda::outstanding_error::ensure_none(); // ... 
and that makes them stop std::cout << "SUCCESS\n"; return EXIT_SUCCESS; diff --git a/examples/by_runtime_api_module/event_management.cu b/examples/by_runtime_api_module/event_management.cu index b13f8d5c..30acdd2b 100644 --- a/examples/by_runtime_api_module/event_management.cu +++ b/examples/by_runtime_api_module/event_management.cu @@ -97,8 +97,8 @@ int main(int argc, char **argv) constexpr size_t buffer_size = 12345678; auto buffer = cuda::memory::managed::make_unique( device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices); - auto wrapped_kernel = cuda::kernel::wrap(device, increment); - cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.attributes().maxThreadsPerBlock; + auto wrapped_kernel = cuda::kernel::get(device, increment); + cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.maximum_threads_per_block(); cuda::grid::dimension_t num_blocks = div_rounding_up(buffer_size, threads_per_block); auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block); diff --git a/examples/by_runtime_api_module/execution_control.cu b/examples/by_runtime_api_module/execution_control.cu index 78576131..7e039fe0 100644 --- a/examples/by_runtime_api_module/execution_control.cu +++ b/examples/by_runtime_api_module/execution_control.cu @@ -63,16 +63,15 @@ int main(int argc, char **argv) auto device = cuda::device::get(device_id).make_current(); std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n"; - auto kernel = cuda::kernel::wrap(device, kernel_function); + auto kernel = cuda::kernel::get(device, kernel_function); // ------------------------------------------ // Attributes without a specific API call // ------------------------------------------ - auto attributes = kernel.attributes(); std::cout << "The PTX version used in compiling device function " << kernel_name - << " is " << attributes.ptx_version() << ".\n"; + << " is " << kernel.ptx_version() << ".\n"; std::string cache_preference_names[] = { "No preference", @@ -102,7 +101,7 @@ int main(int argc, char **argv) const int bar = 123; const unsigned num_blocks = 3; - auto max_threads_per_block = attributes.maxThreadsPerBlock; + auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block); std::cout << "Launching kernel " << kernel_name @@ -175,10 +174,12 @@ int main(int argc, char **argv) if (not (e.code() == cuda::status::not_supported)) { throw e; } - cuda::outstanding_error::clear(); + // We should really not have a sticky error at this point, but lets' make + // extra sure. + cuda::outstanding_error::ensure_none(); } #endif - auto non_cooperative_kernel = cuda::kernel::wrap(device, kernel_function); + auto non_cooperative_kernel = cuda::kernel::get(device, kernel_function); auto non_cooperative_config = launch_config; non_cooperative_config.block_cooperation = true; std::cout diff --git a/examples/by_runtime_api_module/ipc.cpp b/examples/by_runtime_api_module/ipc.cpp index 6c7f4058..7cc5614a 100644 --- a/examples/by_runtime_api_module/ipc.cpp +++ b/examples/by_runtime_api_module/ipc.cpp @@ -17,16 +17,15 @@ * about from the other process. 
* */ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include [[noreturn]] void die_(const std::string& message) diff --git a/examples/by_runtime_api_module/stream_management.cu b/examples/by_runtime_api_module/stream_management.cu index af92a93d..0418fb3a 100644 --- a/examples/by_runtime_api_module/stream_management.cu +++ b/examples/by_runtime_api_module/stream_management.cu @@ -172,7 +172,7 @@ int main(int argc, char **argv) print_first_char(buffer.get()); } ); - auto threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock; + auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); auto num_blocks = div_rounding_up(buffer_size, threads_per_block); auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block); // TODO: The following doesn't have much of a meaningful effect; we should modify this example diff --git a/examples/by_runtime_api_module/unified_addressing.cpp b/examples/by_runtime_api_module/unified_addressing.cpp index 5a796f9b..520dc68d 100644 --- a/examples/by_runtime_api_module/unified_addressing.cpp +++ b/examples/by_runtime_api_module/unified_addressing.cpp @@ -8,33 +8,91 @@ * one kernel, wait for the other process' kernel to * complete execution, and inspect each other's kernel's * output - in an output buffer that each of them learns - * about from the other process. + * about from the other process + * + * TODO: Mostly unimplemented for now. * */ -#include -#include -#include -#include +#include "../common.hpp" -#include -#include #include #include -[[noreturn]] bool die_(const std::string& message) -{ - std::cerr << message << "\n"; - exit(EXIT_FAILURE); -} +namespace tests { -int main(int argc, char **argv) +void pointer_properties(const cuda::device_t& device) { - cuda::device::id_t device_id = (argc > 1) ? 
- std::stoi(argv[1]) : cuda::device::default_device_id; - auto device = cuda::device::get(device_id); + constexpr const cuda::size_t fixed_size { 123 }; + cuda::context_t contexts[2] = { + cuda::context::create(device), + cuda::context::create(device) + }; + cuda::memory::device::unique_ptr regions[2] = { + cuda::memory::device::make_unique(contexts[0], fixed_size), + cuda::memory::device::make_unique(contexts[1], fixed_size) + }; + void* raw_pointers[2] = { + regions[0].get(), + regions[1].get() + }; + cuda::memory::pointer_t pointers[2] = { + cuda::memory::pointer::wrap(raw_pointers[0]), + cuda::memory::pointer::wrap(raw_pointers[1]), + }; + auto primary_context = device.primary_context(); + cuda::context::current::push(primary_context); // so that we check from a different context + for(size_t i = 0; i < 2; i++) { + auto reported_device_id = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(reported_device_id == device.id()); + auto context_handle = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(context_handle == contexts[i].handle()); + auto ptr_mem_type = cuda::memory::type_of(raw_pointers[i]); + assert_(ptr_mem_type == cuda::memory::type_t::device_ or ptr_mem_type == cuda::memory::type_t::unified_); + if (i == 0) { + std::cout << "The memory type reported for pointers to memory allocated on the device is: " << memory_type_name(ptr_mem_type) << "\n"; + } + assert_(pointers[i].get_for_device() == raw_pointers[i]); + try { + [[maybe_unused]] auto host_ptr = pointers[i].get_for_host(); + die_("Was expecting the host_ptr() method to fail for a device-side pointer"); + } catch(cuda::runtime_error& e) { + if (e.code() != cuda::status::named_t::invalid_value) { + throw e; + } + } + auto ptr_reported_as_managed = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(ptr_reported_as_managed == 0); +// auto ptr_reported_as_mapped = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); +// assert_(ptr_reported_as_mapped == 0); +#if CUDA_VERSION >= 11030 + auto mempool_handle = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(mempool_handle == nullptr); +#endif + auto raw_offset_ptr = cuda::memory::as_pointer(cuda::memory::device::address(raw_pointers[i]) + 17); - std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")" << std::endl; + cuda::memory::region_t range = pointers[i].containing_range(); + cuda::memory::pointer_t offset_ptr { raw_offset_ptr }; + cuda::memory::region_t range_for_offset_ptr = offset_ptr.containing_range(); + assert_(range == range_for_offset_ptr); + assert_(range_for_offset_ptr.start() == raw_pointers[i]); +// std::cout << "range_for_offset_ptr.start() == " << range_for_offset_ptr.start() << '\n'; +// std::cout << "range_for_offset_ptr.size() == " << range_for_offset_ptr.size() << '\n'; +// std::cout << "offset_ptr == " << offset_ptr.get() << '\n'; + + // Consider testing: + // CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE + // CU_POINTER_ATTRIBUTE_MAPPED + // CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES + // CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE + // CU_POINTER_ATTRIBUTE_ACCESS_FLAGS + + } + +} + +void wrapped_pointers_and_regions(const cuda::device_t& device) +{ static const size_t allocation_size { 1024 }; auto memory_region = device.memory().allocate(allocation_size); @@ -44,19 +102,21 @@ int main(int argc, char **argv) << "Verifying a wrapper for raw pointer " << memory_region.start() << " allocated on the CUDA 
device." << std::endl; - switch (ptr.attributes().memory_type()) { + switch (cuda::memory::type_of(ptr)) { using namespace cuda::memory; - case host_memory: die_("Pointer incorrectly reported to point into host memory"); break; - case managed_memory: die_("Pointer incorrectly reported not to point to managed memory"); break; - case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; - case device_memory: break; + case host_: die_("Pointer incorrectly reported to point into host memory"); break; + case array: die_("Pointer incorrectly reported to point to array memory"); break; +// case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; + case unified_: std::cout << "Allocated global-device-memory pointer reported to be of unified memory type."; + // die_("Pointer incorrectly reported not to point to managed memory"); break; + case device_: break; } { auto ptr_device = ptr.device(); auto ptr_device_id = ptr_device.id(); - (ptr_device_id == device_id) or die_( - "Pointer incorrectly reported as associated with device ID " + std::to_string(ptr_device_id) + - " rather than " + std::to_string(device_id) + "\n"); + (ptr_device_id == device.id()) or die_( + "Pointer incorrectly reported as associated with " + cuda::device::detail_::identify(device.id()) + + " rather than + " + cuda::device::detail_::identify(device.id())); } (ptr.get() == memory_region.start()) or die_("Invalid get() output"); if (ptr.get_for_device() != memory_region.start()) { @@ -66,7 +126,30 @@ int main(int argc, char **argv) << ptr.get_for_device() << " != " << memory_region.start(); die_(ss.str()); } - (ptr.get_for_host() == nullptr) or die_("Unexpected non-nullptr host-side address reported"); + try { + auto host_side_ptr = ptr.get_for_host(); + std::stringstream ss; + ss << "Unexpected success getting a host-side pointer for a device-only allocation; allocated pointer: " + << ptr.get() << ", " << " host-side pointer: " << host_side_ptr; + } + catch(cuda::runtime_error& e) { + if (e.code() != cuda::status::invalid_value) { throw e; } + } +} + +} // namespace tests + +int main(int argc, char **argv) +{ + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")" << std::endl; + + tests::wrapped_pointers_and_regions(device); + + tests::pointer_properties(device); std::cout << "\nSUCCESS\n"; return EXIT_SUCCESS; diff --git a/examples/by_runtime_api_module/version_management.cpp b/examples/by_runtime_api_module/version_management.cpp index 6a0aeb27..9da77095 100644 --- a/examples/by_runtime_api_module/version_management.cpp +++ b/examples/by_runtime_api_module/version_management.cpp @@ -5,11 +5,11 @@ * Version Management * */ -#include -#include #include #include #include +#include +#include [[noreturn]] void die_(const std::string& message) { @@ -27,7 +27,7 @@ int main(int, char **) auto runtime_version = cuda::version_numbers::runtime(); std::cout << "Using CUDA runtime version " << runtime_version << ".\n"; - auto driver_supported_version = cuda::version_numbers::maximum_supported_by_driver(); + auto driver_supported_version = cuda::version_numbers::driver(); if (driver_supported_version == cuda::version_numbers::none()) { std::cout << "There is no CUDA driver installed, so no CUDA runtime version is supported\n"; } diff --git a/examples/common.hpp b/examples/common.hpp index 61a27cd4..c3e05155 100644 --- a/examples/common.hpp +++ b/examples/common.hpp @@ -6,10 +6,18 @@ #ifndef EXAMPLES_COMMON_HPP_ #define EXAMPLES_COMMON_HPP_ -#include +// These next few lines allow for reporting the context +// stack contents within API code during debugging, but otherwise +// are not used. #include #include + +void report_current_context(const std::string& prefix); +void report_context_stack(const std::string& prefix); + +#include + #include #include #include @@ -19,7 +27,6 @@ #include #include #include -#include #include #include @@ -34,7 +41,7 @@ const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref) return cache_preference_names[(off_t) pref]; } -const char* host_thread_synch_scheduling_policy_name(cuda::host_thread_synch_scheduling_policy_t policy) +const char* host_thread_synch_scheduling_policy_name(cuda::context::host_thread_synch_scheduling_policy_t policy) { static const char *names[] = { "heuristic", @@ -47,6 +54,17 @@ const char* host_thread_synch_scheduling_policy_name(cuda::host_thread_synch_sch return names[(off_t) policy]; } +const char* memory_type_name(cuda::memory::type_t mem_type) +{ + static const char* memory_type_names[] = { + "N/A", + "host", + "device", + "array", + "unified" + }; + return memory_type_names[mem_type]; +} namespace std { @@ -60,11 +78,21 @@ std::ostream& operator<<(std::ostream& os, cuda::multiprocessor_cache_preference return (os << cache_preference_name(pref)); } -std::ostream& operator<<(std::ostream& os, cuda::host_thread_synch_scheduling_policy_t pref) +std::ostream& operator<<(std::ostream& os, cuda::context::host_thread_synch_scheduling_policy_t pref) { return (os << host_thread_synch_scheduling_policy_name(pref)); } +std::ostream& operator<<(std::ostream& os, cuda::context::handle_t handle) +{ + return (os << cuda::detail_::ptr_as_hex(handle)); +} + +std::ostream& operator<<(std::ostream& os, const cuda::context_t& context) +{ + return os << "[device " << context.device_id() << " handle " << context.handle() << ']'; +} + std::ostream& operator<<(std::ostream& os, const cuda::device_t& device) { return os << cuda::device::detail_::identify(device.id()); @@ -75,6 +103,14 @@ std::ostream& 
operator<<(std::ostream& os, const cuda::stream_t& stream) return os << cuda::stream::detail_::identify(stream.handle(), stream.device().id()); } +std::string to_string(const cuda::context_t& context) +{ + std::stringstream ss; + ss.clear(); + ss << context; + return ss.str(); +} + } // namespace std [[noreturn]] bool die_(const std::string& message) @@ -90,6 +126,70 @@ std::ostream& operator<<(std::ostream& os, const cuda::stream_t& stream) die_("Assertion failed at line " + std::to_string(__LINE__) + ": " #cond); \ } + +void report_current_context(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", the current context is: "; } + else std::cout << "The current context is: "; + if (not cuda::context::current::exists()) { + std::cout << "(None)" << std::endl; + } + else { + auto cc = cuda::context::current::get(); + std::cout << cc << std::endl; + } +} + + +void print_context_stack() +{ + if (not cuda::context::current::exists()) { + std::cout << "(Context stack is empty)" << std::endl; + return; + } + std::vector contexts; + while(cuda::context::current::exists()) { + contexts.push_back(cuda::context::current::detail_::pop()); + } +// std::cout << "" << contexts.size() << " contexts; top to bottom:\n"; + for (auto handle : contexts) { + auto device_id = cuda::context::detail_::get_device_id(handle); + std::cout << handle << " for device " << device_id; + if (cuda::context::detail_::is_primary(handle)) { + std::cout << " (primary, " + << (cuda::device::primary_context::detail_::is_active(device_id) ? "active" : "inactive") + << ')'; + } + std::cout << '\n'; + } + for (auto it = contexts.rbegin(); it != contexts.rend(); it++) { + cuda::context::current::detail_::push(*it); + } +} + +void report_primary_context_activity(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", "; } + std::cout << "Device primary contexts activity: "; + for(auto device : cuda::devices()) { + std::cout << device.id() << ": " + << (cuda::device::primary_context::detail_::is_active(device.id()) ? 
"ACTIVE" : "inactive") + << " "; + } + std::cout << '\n'; +} + +void report_context_stack(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", the context stack is (top to bottom):\n"; } + std::cout << "-----------------------------------------------------\n"; + print_context_stack(); + std::cout << "---\n"; + report_primary_context_activity(); + std::cout << "-----------------------------------------------------\n" << std::flush; +} + + // Note: This will only work correctly for positive values template typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) @@ -97,4 +197,5 @@ typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) return dividend / divisor + !!(dividend % divisor); } + #endif // EXAMPLES_COMMON_HPP_ diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu index e89aa22c..6a9afffd 100644 --- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu @@ -117,7 +117,7 @@ int main(int argc, const char **argv) stream.enqueue.memzero(d_numOfOdds.get(), sizeof(int)); stream.enqueue.memzero(d_sumOfOddEvenElems.get(), sizeof(int)*2); - auto kernel = cuda::kernel::wrap(device, oddEvenCountAndSumCG); + auto kernel = cuda::kernel::get(device, oddEvenCountAndSumCG); auto dims = kernel.min_grid_params_for_max_occupancy(); auto launch_config = cuda::make_launch_config(dims); // Note: While the kernel uses the "cooperative groups" CUDA-C++ headers, diff --git a/examples/modified_cuda_samples/helper_cuda.hpp b/examples/modified_cuda_samples/helper_cuda.hpp index 21ce6fea..abb227c5 100644 --- a/examples/modified_cuda_samples/helper_cuda.hpp +++ b/examples/modified_cuda_samples/helper_cuda.hpp @@ -88,7 +88,6 @@ inline int get_device_with_highest_gflops() return *iterator; } - // Initialization code to find the best CUDA Device // Unlike in NVIDIA's original helper_cuda.h, this does _not_ // make the chosen device current. diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu index 5e6fb867..f1760c7e 100644 --- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu +++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu @@ -10,27 +10,10 @@ * contact the author. 
*/ -#include +#include "../../common.hpp" #include "ptx.cuh" -#include -#include - -// Note: This will only work correctly for positive values -template -typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) -{ - return dividend / divisor + !!(dividend % divisor); -} - -[[noreturn]] void die_(const std::string& message) -{ - std::cerr << message << "\n"; - exit(EXIT_FAILURE); -} - - __global__ void sequence_gpu(int *d_ptr, int length) { int elemID = blockIdx.x * blockDim.x + threadIdx.x; @@ -49,7 +32,6 @@ void sequence_cpu(int *h_ptr, int length) } } - int main(int, char **) { if (cuda::device::count() == 0) { diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index 46572de6..03c3766b 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -137,8 +137,10 @@ void enqueue_p2p_copy( P2PEngine p2p_mechanism, cuda::stream_t& stream) { - auto copy_kernel = cuda::kernel::wrap(stream.device(), copyp2p); + auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p); auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy(); + // Note: We could have alternatively used: + // auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel); auto launch_config = cuda::make_launch_config(grid_and_block_dims); diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp new file mode 100644 index 00000000..da0c3cab --- /dev/null +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -0,0 +1,175 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It loads a cuda fatbinary and runs vector addition kernel. + * Uses both Driver and Runtime CUDA APIs for different purposes. 
+ */ + +#include "../../common.hpp" + +std::string create_ptx_file() +{ + const char* ptx_file_contents = R"( + .version 6.5 + .target sm_30 + .address_size 64 + + // .globl dummy + + .visible .entry dummy( + +) + { + ret; + } + + + // .globl VecAdd_kernel + + .visible .entry VecAdd_kernel( + .param .u64 VecAdd_kernel_param_0, + .param .u64 VecAdd_kernel_param_1, + .param .u64 VecAdd_kernel_param_2, + .param .u32 VecAdd_kernel_param_3 + ) + { + .reg .pred %p<2>; + .reg .f32 %f<4>; + .reg .b32 %r<6>; + .reg .b64 %rd<11>; + + + ld.param.u64 %rd1, [VecAdd_kernel_param_0]; + ld.param.u64 %rd2, [VecAdd_kernel_param_1]; + ld.param.u64 %rd3, [VecAdd_kernel_param_2]; + ld.param.u32 %r2, [VecAdd_kernel_param_3]; + mov.u32 %r3, %ntid.x; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra BB0_2; + + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + cvta.to.global.u64 %rd7, %rd2; + add.s64 %rd8, %rd7, %rd5; + ld.global.f32 %f1, [%rd8]; + ld.global.f32 %f2, [%rd6]; + add.f32 %f3, %f2, %f1; + cvta.to.global.u64 %rd9, %rd3; + add.s64 %rd10, %rd9, %rd5; + st.global.f32 [%rd10], %f3; + + BB0_2: + ret; + } + )"; + + char temp_filename[] = "caw-simple-drv-runtime-ptx-XXXXXX"; + int file_descriptor = mkstemp(temp_filename); + if (file_descriptor == -1) { + throw std::runtime_error(std::string("Failed creating a temporary file using mkstemp(): ") + std::strerror(errno) + '\n'); + } + FILE* ptx_file = fdopen(file_descriptor, "w"); + if (ptx_file == nullptr) { + throw std::runtime_error(std::string("Failed converting temporay file descriptor into a C library FILE structure: ") + std::strerror(errno) + '\n'); + } + if (fputs(ptx_file_contents, ptx_file) == EOF) { + throw std::runtime_error("Failed writing PTX to temporary file " + std::string(temp_filename) + ": " + std::strerror(errno) + '\n'); + } + if (fclose(ptx_file) == EOF) { + throw std::runtime_error("Failed closing temporary PTX file " + std::string(temp_filename) + ": " + std::strerror(errno) + '\n'); + } + return temp_filename; +} + +// Host code +int main(int argc, char** argv) +{ + std::cout << "simpleDrvRuntime - PTX version..\n"; + int N = 50000; + size_t size = N * sizeof(float); + + // Initialize + cuda::initialize_driver(); + + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + + auto device = cuda::device::get(device_id); + + // Create context + auto context = cuda::context::create(device); + + cuda::context::current::scoped_override_t context_setter { context }; + +// first search for the module path before we load the results + auto ptx_filename = create_ptx_file(); + + auto module = cuda::module::load_from_file(ptx_filename); + auto vecAdd_kernel = module.get_kernel("VecAdd_kernel"); + auto dummy_kernel = module.get_kernel("dummy"); + + auto stream = cuda::stream::create(context, cuda::stream::async); + + stream.enqueue.kernel_launch(dummy_kernel, cuda::launch_configuration_t{1,1}); + + cuda::outstanding_error::ensure_none(); + + stream.synchronize(); + + auto h_A = std::unique_ptr(new float[N]); + auto h_B = std::unique_ptr(new float[N]); + auto h_C = std::unique_ptr(new float[N]); + + auto generator = []() { return rand() / (float) RAND_MAX; }; + std::generate_n(h_A.get(), N, generator); + std::generate_n(h_B.get(), N, generator); + + // Allocate vectors in device memory + auto d_A = cuda::memory::device::make_unique(device, N); + auto d_B = cuda::memory::device::make_unique(device, N); + auto d_C = cuda::memory::device::make_unique(device, N); + + + cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream); + cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream); + + auto threadsPerBlock = 256; + auto blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + auto launch_config = cuda::make_launch_config( blocksPerGrid, threadsPerBlock ); + + cuda::outstanding_error::ensure_none(); + + stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.get(), d_B.get(), d_C.get(), N); + + cuda::memory::async::copy(h_C.get(), d_C.get(), size, stream); + stream.synchronize(); + + for (int i = 0; i < N; ++i) { + if (std::fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { + std::cerr << "Result verification failed at element " << i << "\n"; + exit(EXIT_FAILURE); + } + } + std::cout << "SUCCESS\n"; + return EXIT_SUCCESS; +} diff --git a/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu b/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu index e4f30c03..a8bfcc80 100644 --- a/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu +++ b/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu @@ -11,7 +11,7 @@ #include "../helper_string.h" -#include +#include #include #include @@ -38,8 +38,8 @@ typedef struct ipcCUDA_st { int device; pid_t pid; - cudaIpcEventHandle_t eventHandle; - cudaIpcMemHandle_t memHandle; + cuda::event::ipc::handle_t eventHandle; + cuda::memory::ipc::handle_t memHandle; } ipcCUDA_t; typedef struct ipcDevices_st @@ -187,7 +187,8 @@ void runTestMultiKernel(ipcCUDA_t *s_mem, int index) h_refData[i] = rand(); } - auto device = cuda::device::get(s_mem[index].device).make_current(); + auto device = cuda::device::get(s_mem[index].device); + cuda::device::current::set(device); if (index == 0) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index 441aa92a..495e9500 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -40,19 +40,18 @@ const char *sEventSyncMethod[] = NULL }; -// System includes - // helper functions and utilities to work with CUDA #include "../helper_cuda.hpp" -#include - #include #include #include #include +using synch_policy_type = 
cuda::context::host_thread_synch_scheduling_policy_t; + + // Macro to aligned up to the memory size in question #define MEMORY_ALIGNMENT 4096 @@ -83,11 +82,11 @@ void printHelp() { std::cout << "Usage: " << sSDKsample << " [options below]\n" - << "\t--sync_method (" << (int) cuda::host_thread_synch_scheduling_policy_t::default_ << ") for CPU thread synchronization with GPU work." - << "\t Possible values: " << (int) cuda::host_thread_synch_scheduling_policy_t::heuristic << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::spin << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::yield << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::block << ".\n" + << "\t--sync_method (" << (int) synch_policy_type::default_ << ") for CPU thread synchronization with GPU work." + << "\t Possible values: " << (int) synch_policy_type::heuristic << ", " + << (int) synch_policy_type::spin << ", " + << (int) synch_policy_type::yield << ", " + << (int) synch_policy_type::block << ".\n" << "\t--use_generic_memory (default) use generic page-aligned host memory allocation\n" << "\t--use_cuda_malloc_host (optional) use pinned host memory allocation\n"; } @@ -103,7 +102,6 @@ int main(int argc, char **argv) // allocate generic memory and pin it laster instead of using cudaHostAlloc() - using synch_policy_type = cuda::host_thread_synch_scheduling_policy_t; auto synch_policy = synch_policy_type::block; int niterations; // number of iterations for the loop inside the kernel diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu index 23ab5f66..9aea8a0c 100644 --- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu +++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu @@ -10,7 +10,9 @@ * contact the author. */ -#include +#include "../../common.hpp" + +#include #include #include diff --git a/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu b/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu index c16f677d..6eac1e82 100644 --- a/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu +++ b/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu @@ -13,10 +13,9 @@ * used instead of regular host and device memory. */ -#include +#include #include -#include #include __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) diff --git a/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu b/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu index 6aa11a00..c69fc84b 100644 --- a/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu +++ b/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu @@ -13,7 +13,7 @@ * used instead of regular host and device memory. 
*/ -#include +#include #include #include diff --git a/examples/other/array_management.cu b/examples/other/array_management.cu index 04c5d0b1..c886d700 100644 --- a/examples/other/array_management.cu +++ b/examples/other/array_management.cu @@ -68,7 +68,7 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) { auto ptr_out = cuda::memory::managed::make_unique(arr.size()); cuda::memory::copy(arr, ptr_in.get()); cuda::texture_view tv(arr); - assert_(tv.associated_device() == device); + assert_(tv.device() == device); constexpr cuda::grid::block_dimension_t block_dim = 10; constexpr auto block_dims = cuda::grid::block_dimensions_t::cube(block_dim); assert(div_rounding_up(w, block_dim) <= std::numeric_limits::max()); diff --git a/examples/other/inclusion_in_two_translation_units/main.cpp b/examples/other/inclusion_in_two_translation_units/main.cpp index 36c05c90..048cc31e 100644 --- a/examples/other/inclusion_in_two_translation_units/main.cpp +++ b/examples/other/inclusion_in_two_translation_units/main.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/examples/other/inclusion_in_two_translation_units/second_tu.cpp b/examples/other/inclusion_in_two_translation_units/second_tu.cpp index 4b3624e0..7bd38c3e 100644 --- a/examples/other/inclusion_in_two_translation_units/second_tu.cpp +++ b/examples/other/inclusion_in_two_translation_units/second_tu.cpp @@ -1,4 +1,4 @@ -#include +#include cuda::device::id_t get_current_device_id() { diff --git a/examples/other/io_compute_overlap_with_streams.cu b/examples/other/io_compute_overlap_with_streams.cu index a827976b..d232c041 100644 --- a/examples/other/io_compute_overlap_with_streams.cu +++ b/examples/other/io_compute_overlap_with_streams.cu @@ -5,7 +5,7 @@ * Stream Management * */ -#include +#include #include #include @@ -27,125 +27,126 @@ __device__ void gpu_sleep(clock_value_t sleep_cycles) template __global__ void add( - const T* __restrict__ lhs, - const T* __restrict__ rhs, - T* __restrict__ result, - size_t length) + const T* __restrict__ lhs, + const T* __restrict__ rhs, + T* __restrict__ result, + size_t length) { - auto global_index = threadIdx.x + blockIdx.x * blockDim.x; - if (global_index < length) { - result[global_index] = lhs[global_index] + rhs[global_index]; - gpu_sleep(200000); - } + auto global_index = threadIdx.x + blockIdx.x * blockDim.x; + if (global_index < length) { + result[global_index] = lhs[global_index] + rhs[global_index]; + gpu_sleep(200000); + } } template constexpr I div_rounding_up(I dividend, const I2 divisor) noexcept { - return (dividend / divisor) + !!(dividend % divisor); + return (dividend / divisor) + !!(dividend % divisor); } /* * Produce a launch configuration with one thread covering each element */ cuda::launch_configuration_t make_linear_launch_config( - const cuda::device_t device, - size_t length) + const cuda::device_t device, + size_t length) { - auto threads_per_block = device.properties().max_threads_per_block(); - auto num_blocks = div_rounding_up(length, threads_per_block); - if (num_blocks > std::numeric_limits::max()) { - throw std::invalid_argument("Specified length exceeds CUDA's support for a linear grid"); - } - return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory); + auto threads_per_block = device.properties().max_threads_per_block(); + auto num_blocks = div_rounding_up(length, threads_per_block); + if (num_blocks > std::numeric_limits::max()) { + throw 
std::invalid_argument("Specified length exceeds CUDA's support for a linear grid"); + } + return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory); } struct buffer_set_t { - cuda::memory::host::unique_ptr host_lhs; - cuda::memory::host::unique_ptr host_rhs; - cuda::memory::host::unique_ptr host_result; - cuda::memory::device::unique_ptr device_lhs; - cuda::memory::device::unique_ptr device_rhs; - cuda::memory::device::unique_ptr device_result; + cuda::memory::host::unique_ptr host_lhs; + cuda::memory::host::unique_ptr host_rhs; + cuda::memory::host::unique_ptr host_result; + cuda::memory::device::unique_ptr device_lhs; + cuda::memory::device::unique_ptr device_rhs; + cuda::memory::device::unique_ptr device_result; }; std::vector generate_buffers( - const cuda::device_t device, - size_t num_kernels, - size_t num_elements) + const cuda::device_t& device, + size_t num_kernels, + size_t num_elements) { - // TODO: This should be an std::array, but generating - // it is a bit tricky and I don't want to burden the example - // with template wizardry - std::vector buffers; - std::generate_n(std::back_inserter(buffers), num_kernels, - [&]() { - return buffer_set_t { - // Sticking to C++11 here... - cuda::memory::host::make_unique(num_elements), - cuda::memory::host::make_unique(num_elements), - cuda::memory::host::make_unique(num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements) - }; - } - ); - - // TODO: Consider actually filling the buffers - - return buffers; + // device.make_current(); + // TODO: This should be an std::array, but generating + // it is a bit tricky and I don't want to burden the example + // with template wizardry + std::vector buffers; + std::generate_n(std::back_inserter(buffers), num_kernels, + [&]() { + return buffer_set_t { + // Sticking to C++11 here... + cuda::memory::host::make_unique(num_elements), + cuda::memory::host::make_unique(num_elements), + cuda::memory::host::make_unique(num_elements), + cuda::memory::device::make_unique(device, num_elements), + cuda::memory::device::make_unique(device, num_elements), + cuda::memory::device::make_unique(device, num_elements) + }; + } + ); + + // TODO: Consider actually filling the buffers + + return buffers; } int main(int, char **) { - constexpr size_t num_kernels = 5; - constexpr size_t num_elements = 1e7; - - auto device = cuda::device::current::get(); - std::cout << "Using CUDA device " << device.name() << " (having ID " << device.id() << ")\n"; - - std::cout << "Generating host buffers... " << std::flush; - auto buffers = generate_buffers(device, num_kernels, num_elements); - std::cout << "done.\n" << std::flush; - - std::vector streams; - streams.reserve(num_kernels); - std::generate_n(std::back_inserter(streams), num_kernels, - [&]() { return device.create_stream(cuda::stream::async); }); - - auto common_launch_config = make_linear_launch_config(device, num_elements); - auto buffer_size = num_elements * sizeof(element_t); - - std::cout - << "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl; - // Unfortunately, we need to use indices here - unless we - // had access to a zip iterator (e.g. 
boost::zip_iterator) - for(size_t k = 0; k < num_kernels; k++) { - auto& stream = streams[k]; - auto& buffer_set = buffers[k]; - stream.enqueue.copy(buffer_set.device_lhs.get(), buffer_set.host_lhs.get(), buffer_size); - stream.enqueue.copy(buffer_set.device_rhs.get(), buffer_set.host_rhs.get(), buffer_size); - stream.enqueue.kernel_launch( - add, - common_launch_config, - buffer_set.device_lhs.get(), - buffer_set.device_rhs.get(), - buffer_set.device_result.get(), - num_elements); - stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size); - stream.enqueue.host_function_call( - [=](cuda::stream_t) { - std::cout - << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << std::endl; - } - ); - } - std::this_thread::sleep_for(std::chrono::microseconds(50000)); - for(auto& stream : streams) { stream.synchronize(); } - cuda::outstanding_error::ensure_none(); - - // TODO: Consider checking for correctness here - - std::cout << "\nSUCCESS" << std::endl; + constexpr size_t num_kernels = 5; + constexpr size_t num_elements = 1e7; + + auto device = cuda::device::current::get(); + std::cout << "Using CUDA device " << device.name() << " (having ID " << device.id() << ")\n"; + + std::cout << "Generating host buffers... " << std::flush; + auto buffers = generate_buffers(device, num_kernels, num_elements); + std::cout << "done.\n" << std::flush; + + std::vector streams; + streams.reserve(num_kernels); + std::generate_n(std::back_inserter(streams), num_kernels, + [&]() { return device.create_stream(cuda::stream::async); }); + + auto common_launch_config = make_linear_launch_config(device, num_elements); + auto buffer_size = num_elements * sizeof(element_t); + + std::cout + << "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl; + // Unfortunately, we need to use indices here - unless we + // had access to a zip iterator (e.g. boost::zip_iterator) + for(size_t k = 0; k < num_kernels; k++) { + auto& stream = streams[k]; + auto& buffer_set = buffers[k]; + stream.enqueue.copy(buffer_set.device_lhs.get(), buffer_set.host_lhs.get(), buffer_size); + stream.enqueue.copy(buffer_set.device_rhs.get(), buffer_set.host_rhs.get(), buffer_size); + stream.enqueue.kernel_launch( + add, + common_launch_config, + buffer_set.device_lhs.get(), + buffer_set.device_rhs.get(), + buffer_set.device_result.get(), + num_elements); + stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size); + stream.enqueue.host_function_call( + [=](cuda::stream_t) { + std::cout + << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << std::endl; + } + ); + } + std::this_thread::sleep_for(std::chrono::microseconds(50000)); + for(auto& stream : streams) { stream.synchronize(); } + cuda::outstanding_error::ensure_none(); + + // TODO: Consider checking for correctness here + + std::cout << "\nSUCCESS" << std::endl; } diff --git a/examples/other/manipulate_current_device.cu b/examples/other/manipulate_current_device.cu new file mode 100644 index 00000000..9da259df --- /dev/null +++ b/examples/other/manipulate_current_device.cu @@ -0,0 +1,59 @@ +/** + * An example program for the CUDA API wrappers library, + * which indirectly manipulates the current device using + * driver API calls. 
+ * + */ +#include "../common.hpp" +#include + +void report_current_device() +{ + std::cout << "Runtime believes the current device index is: " + << cuda::device::current::detail_::get_id() << std::endl; +} + +int main() +{ + namespace context = cuda::context::detail_; + namespace cur_dev = cuda::device::current::detail_; + namespace pc = cuda::device::primary_context::detail_; + namespace cur_ctx = cuda::context::current::detail_; + + cuda::device::id_t dev_idx[2]; + cuda::context::handle_t pc_handle[2]; + + cuda::initialize_driver(); + dev_idx[0] = cur_dev::get_id(); + report_current_device(); + assert_(cur_dev::get_id() == 0); + dev_idx[1] = (dev_idx[0] == 0) ? 1 : 0; + pc_handle[0] = pc::obtain_and_increase_refcount(dev_idx[0]); + std::cout << "Obtained primary context handle for device " << dev_idx[0]<< '\n'; + pc_handle[1] = pc::obtain_and_increase_refcount(dev_idx[1]); + std::cout << "Obtained primary context handle for device " << dev_idx[1]<< '\n'; + report_current_device(); + cur_ctx::push(pc_handle[1]); + std::cout << "Pushed primary context handle for device " << dev_idx[1] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + auto ctx = context::create_and_push(dev_idx[0]); + std::cout << "Created a new context for device " << dev_idx[0] << " and pushed it onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[0]); + cur_ctx::push(ctx); + std::cout << "Pushed primary context handle for device " << dev_idx[0] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[0]); + cur_ctx::push(pc_handle[1]); + std::cout << "Pushed primary context for device " << dev_idx[1] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + pc::decrease_refcount(dev_idx[1]); + std::cout << "Deactivated/destroyed primary context for device " << dev_idx[1] << '\n'; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + + std::cout << "\nSUCCESS" << std::endl; +} + diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp new file mode 100644 index 00000000..370e6edf --- /dev/null +++ b/src/cuda/api.hpp @@ -0,0 +1,50 @@ +/** + * @file runtime_api.hpp + * + * @brief A single file which includes, in turn, all (joint) + * wrappers for Runtime and Driver APIs, and related headers. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_HPP_ +#define CUDA_API_WRAPPERS_HPP_ + +static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled with C++11 or a later version of the C++ language standard"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#endif // CUDA_API_WRAPPERS_HPP_ diff --git a/src/cuda/api/apriori_compiled_kernel.hpp b/src/cuda/api/apriori_compiled_kernel.hpp new file mode 100644 index 00000000..8276bf57 --- /dev/null +++ b/src/cuda/api/apriori_compiled_kernel.hpp @@ -0,0 +1,145 @@ +/** + * @file apriori_compiled_kernel.hpp + * + * @brief An implementation of a subclass of @ref `kernel_t` for kernels + * compiled together with the host-side program. 
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ +#define CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ + +#include +#include +#include + +namespace cuda { + +///@cond +class device_t; +class apriori_compiled_kernel_t; +///@nocond + +namespace kernel { + +namespace detail_ { + +inline handle_t get_handle(const void *kernel_function_ptr, const char* name = nullptr) +{ + handle_t handle; + auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr); + throw_if_error(status, "Failed obtaining a CUDA function handle for " + + ((name == nullptr) ? ::std::string("a kernel function") : ::std::string("kernel function ") + name) + + " at " + cuda::detail_::ptr_as_hex(kernel_function_ptr)); + return handle; +} + +apriori_compiled_kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f, + const void* ptr); + + +} // namespace detail_ +} // namespace kernel + +/** + * @brief A subclass of the @ref `kernel_t` interface for kernels being + * functions marked as __global__ in source files and compiled apriori. + */ +class apriori_compiled_kernel_t final : public kernel_t { +public: // getters + const void *ptr() const noexcept { return ptr_; } + const void *get() const noexcept { return ptr_; } + +public: // type_conversions + explicit operator const void *() noexcept { return ptr_; } + +public: // non-mutators + + /** + * @brief Calculates the number of grid blocks which may be "active" on a given GPU + * multiprocessor simultaneously (i.e. with warps from any of these block + * being schedulable concurrently) + * + * @param num_threads_per_block + * @param dynamic_shared_memory_per_block + * @param disable_caching_override On some GPUs, the choice of whether to + * cache memory reads affects occupancy. But what if this caching results in 0 + * potential occupancy for a kernel? There are two options, controlled by this flag. + * When it is set to false - the calculator will assume caching is off for the + * purposes of its work; when set to true, it will return 0 for such device functions. + * See also the "Unified L1/Texture Cache" section of the + * Maxwell + * tuning guide. 
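A minimal usage sketch for the occupancy query documented above, assuming a __global__ function named my_kernel and relying on the kernel::get() overload declared further down in this header; the numeric arguments are illustrative only:

	__global__ void my_kernel(float* data);

	void occupancy_example(cuda::device_t device)
	{
		auto kernel = cuda::kernel::get(device, my_kernel);
		auto blocks_per_sm = kernel.maximum_active_blocks_per_multiprocessor(
			256, // threads per block
			0);  // dynamic shared memory per block, in bytes
		(void) blocks_per_sm;
	}
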
+ */ + grid::dimension_t maximum_active_blocks_per_multiprocessor( + grid::block_dimension_t num_threads_per_block, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override = false); + +protected: // ctors & dtor + apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, + kernel::handle_t handle, const void *f) + : kernel_t(device_id, context_handle, handle), ptr_(f) { + // TODO: Consider checking whether this actually is a device function, at all and in this context +#ifndef NDEBUG + assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer"); +#endif + } + apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, const void *f) + : apriori_compiled_kernel_t(device_id, context_handle, kernel::detail_::get_handle(f), f) { } + +public: // ctors & dtor + apriori_compiled_kernel_t(const apriori_compiled_kernel_t&) = default; + apriori_compiled_kernel_t(apriori_compiled_kernel_t&&) = default; + +public: // friends + friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*); + +protected: // data members + const void *const ptr_; +}; + +inline grid::dimension_t apriori_compiled_kernel_t::maximum_active_blocks_per_multiprocessor( + grid::block_dimension_t num_threads_per_block, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override) +{ + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + int result; + unsigned int flags = disable_caching_override ? + cudaOccupancyDisableCachingOverride : cudaOccupancyDefault; + auto status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &result, ptr_, (int) num_threads_per_block, + dynamic_shared_memory_per_block, flags); + throw_if_error(status, "Failed calculating the maximum occupancy " + "of device function blocks per multiprocessor"); + return result; +} + +namespace kernel { +namespace detail_ { + +inline apriori_compiled_kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f, + const void *ptr) +{ + return {device_id, context_id, f, ptr}; +} + +} // namespace detail + +template +apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr); + +template +apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr); + +} // namespace kernel + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 8608640f..876a63f5 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -12,10 +12,12 @@ #ifndef CUDA_API_WRAPPERS_ARRAY_HPP_ #define CUDA_API_WRAPPERS_ARRAY_HPP_ -#include +#include #include #include +#include +#include namespace cuda { @@ -26,48 +28,88 @@ class array_t; namespace array { -using handle_t = cudaArray*; +using handle_t = CUarray; +template +using descriptor_t = typename ::std::conditional::type; /** * @brief Wrap an existing CUDA array in an @ref array_t instance. 
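A minimal creation sketch, assuming the create() overloads declared near the end of this header take the element type and dimensionality as template arguments, and that array::dimensions_t<3> is aggregate-initializable with width, height and depth:

	void array_example(cuda::device_t device)
	{
		cuda::array::dimensions_t<3> dims { 64, 64, 64 };
		auto arr = cuda::array::create<float, 3>(device, dims);
		// arr.size_bytes() should equal 64 * 64 * 64 * sizeof(float)
	}
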
*/ template array_t wrap( - device::id_t device_id, + device::id_t device_id, + context::handle_t context_handle, handle_t handle, dimensions_t dimensions) noexcept; namespace detail_ { +template struct format_specifier {}; + +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT8; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT16; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT32; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT8; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT16; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT32; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_HALF; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_FLOAT; }; + template -handle_t create_on_current_device(dimensions_t<3> dimensions) +handle_t create_in_current_context(dimensions_t<3> dimensions) { - auto channel_descriptor = cudaCreateChannelDesc(); - cudaExtent extent = dimensions; handle_t handle; - auto status = cudaMalloc3DArray(&handle, &channel_descriptor, extent); - throw_if_error(status, "Failed allocating 3D CUDA array"); + CUDA_ARRAY3D_DESCRIPTOR descriptor; + descriptor.Width = dimensions.width; + descriptor.Height = dimensions.height; + descriptor.Depth = dimensions.depth; + descriptor.Format = format_specifier::value; + descriptor.NumChannels = 1; + // We don't currently support an array of packed pairs or quadruplets; if you want this, + // file an issue. + descriptor.Flags = 0; + + auto status = cuArray3DCreate(&handle, &descriptor); + throw_if_error(status, "failed allocating 3D CUDA array"); return handle; } template -handle_t create_on_current_device(dimensions_t<2> dimensions) +handle_t create_in_current_context(dimensions_t<2> dimensions) { - auto channel_desc = cudaCreateChannelDesc(); + CUDA_ARRAY_DESCRIPTOR descriptor; + descriptor.Width = dimensions.width; + descriptor.Height = dimensions.height; + descriptor.Format = format_specifier::value; + descriptor.NumChannels = 1; handle_t handle; - auto status = cudaMallocArray(&handle, &channel_desc, dimensions.width, dimensions.height); - throw_if_error(status, "Failed allocating 2D CUDA array"); + auto status = cuArrayCreate(&handle, &descriptor); + throw_if_error(status, "failed allocating 2D CUDA array"); return handle; } template -handle_t create(const device_t& device, dimensions_t dimensions); +handle_t create(context::handle_t context_handle, dimensions_t dimensions) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return create_in_current_context(dimensions); +} template -handle_t create(device::id_t device_id, dimensions_t dimensions) +handle_t create(const context_t& context, dimensions_t dimensions); + +template +handle_t get_descriptor(context::handle_t context_handle, handle_t handle) { - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return create_on_current_device(dimensions); + cuda::context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + descriptor_t result; + auto status = (NumDimensions == 2) ? 
+ cuArrayGetDescriptor(&result, handle) : + cuArray3DGetDescriptor(&result, handle); + throw_if_error(status, + ::std::string("Failed obtaining the descriptor of the CUDA ") + + (NumDimensions == 2 ? "2":"3") + "D array at " + cuda::detail_::ptr_as_hex(handle)); + return result; } } // namespace detail_ @@ -99,64 +141,78 @@ class array_t { public: using handle_type = array::handle_t; + using descriptor_type = array::descriptor_t; using dimensions_type = array::dimensions_t; /** * Constructs a CUDA array wrapper from the raw type used by the CUDA * Runtime API - and takes ownership of the array */ - array_t(device::id_t device_id, handle_type handle, dimensions_type dimensions) : - device_id_(device_id), dimensions_(dimensions), handle_(handle) + array_t(device::id_t device_id, context::handle_t context_handle, handle_type handle, dimensions_type dimensions) : + device_id_(device_id), context_handle_(context_handle), dimensions_(dimensions), handle_(handle) { assert(handle != nullptr); } array_t(const array_t& other) = delete; - array_t(array_t&& other) noexcept : array_t(other.device_id_, other.handle_, other.dimensions_) + array_t(array_t&& other) noexcept : array_t(other.device_id_, other.context_handle_, other.handle_, other.dimensions_) { other.handle_ = nullptr; } ~array_t() noexcept { + cuda::context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); if (handle_) { - auto status = cudaFreeArray(handle_); + auto status = cuArrayDestroy(handle_); // Note: Throwing in a noexcept destructor; if the free'ing fails, the program // will likely terminate - throw_if_error(status, "Failed freeing CUDA array"); + throw_if_error(status, "Failed destroying CUDA array " + cuda::detail_::ptr_as_hex(handle_)); } } - friend array_t array::wrap(device::id_t, handle_type, dimensions_type) noexcept; + friend array_t array::wrap(device::id_t, context::handle_t, handle_type, dimensions_type) noexcept; - handle_type get() const noexcept { return handle_; } - device_t device() const noexcept; + handle_type get() const noexcept { return handle_; } + device::id_t device_id() const noexcept { return device_id_; } + context::handle_t context_handle() const noexcept { return context_handle_; } dimensions_type dimensions() const noexcept { return dimensions_; } + device_t device() const noexcept; + context_t context() const; ::std::size_t size() const noexcept { return dimensions().size(); } ::std::size_t size_bytes() const noexcept { return size() * sizeof(T); } + descriptor_type descriptor() const { return array::detail_::get_descriptor(context_handle_, handle_); } protected: - dimensions_type dimensions_; - handle_type handle_; - device::id_t device_id_; + dimensions_type dimensions_; + device::id_t device_id_; + context::handle_t context_handle_; + handle_type handle_; }; namespace array { template -inline array_t wrap( - device::id_t device_id, +array_t wrap( + device::id_t device_id, + context::handle_t context_handle, handle_t handle, dimensions_t dimensions) noexcept { - return array_t { device_id, handle, dimensions }; + return { device_id, context_handle, handle, dimensions }; } template -array_t create( - const device_t& device, +array_t create( + const context_t& context, dimensions_t dimensions); +template +array_t create( + device_t device, + dimensions_t dimensions); + + } // namespace array } // namespace cuda diff --git a/src/cuda/api/constants.hpp b/src/cuda/api/constants.hpp index 87a8c62d..aae8825e 100644 --- a/src/cuda/api/constants.hpp +++ 
b/src/cuda/api/constants.hpp @@ -10,7 +10,7 @@ #ifndef CUDA_API_WRAPPERS_CONSTANTS_HPP_ #define CUDA_API_WRAPPERS_CONSTANTS_HPP_ -#include +#include namespace cuda { @@ -116,6 +116,16 @@ enum : bool { do_not_take_ownership = false, }; +namespace context { + +namespace detail_ { + +constexpr const CUcontext none { 0 }; + +} // namespace detail_ + +} // namespace context + } // namespace cuda #endif // CUDA_API_WRAPPERS_CONSTANTS_HPP_ diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp new file mode 100644 index 00000000..077a99ce --- /dev/null +++ b/src/cuda/api/context.hpp @@ -0,0 +1,850 @@ +/** + * @file context.hpp + * + * @brief Contains a proxy class for CUDA execution contexts. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_CONTEXT_HPP_ +#define CUDA_API_WRAPPERS_CONTEXT_HPP_ + +#include +#include +#include +#include +#include + +#include +#include + +namespace cuda { + +///@cond +class device_t; +class event_t; +class context_t; +class stream_t; +class module_t; +///@endcond + +namespace link { +class options_t; +} // namespace link + +namespace context { + +using limit_t = CUlimit; +using limit_value_t = size_t; +using shared_memory_bank_size_t = CUsharedconfig; + +/** + * A range of priorities supported by a CUDA context; ranges from the + * higher numeric value to the lower. + */ +struct stream_priority_range_t { + stream::priority_t least; /// Higher numeric value, lower priority + stream::priority_t greatest; /// Lower numeric value, higher priority + + /** + * When true, stream prioritization is not supported, i.e. all streams have + * "the same" priority - the default one. + */ + constexpr bool is_trivial() const { + return least == stream::default_priority and greatest == stream::default_priority; + } +}; + +namespace detail_ { + +::std::string identify(const context_t& context); + +inline limit_value_t get_limit(limit_t limit_id) +{ + limit_value_t limit_value; + auto status = cuCtxGetLimit(&limit_value, limit_id); + throw_if_error(status, + "Failed obtaining CUDA context limit value"); + return limit_value; +} + +inline void set_limit(limit_t limit_id, limit_value_t new_value) +{ + auto status = cuCtxSetLimit(limit_id, new_value); + throw_if_error(status, "Failed obtaining CUDA context limit value"); +} + +constexpr flags_t inline make_flags( + host_thread_synch_scheduling_policy_t synch_scheduling_policy, + bool keep_larger_local_mem_after_resize) +{ + return( + synch_scheduling_policy // this enum value is also a valid bitmask + | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0) ); +} + +inline device::id_t get_device_id(handle_t context_handle) +{ + auto needed_push = current::detail_::push_if_not_on_top(context_handle); + auto device_id = current::detail_::get_device_id(); + if (needed_push) { + current::detail_::pop(); + } + return device_id; +} + + + +/** + * @brief Wrap an existing CUDA context in a @ref context_t instance + * + * @param device_id ID of the device for which the context is defined + * @param context_id + * @param take_ownership When set to `false`, the CUDA context + * will not be destroyed along with its proxy. When set to `true`, + * the proxy class will destroy the context when itself being destructed. + * @return The constructed `cuda::context_t`. 
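A minimal sketch of how the priority range above might be used, via the context_t::stream_priority_range() getter appearing further down in this file:

	void stream_priority_example(const cuda::context_t& ctx)
	{
		auto range = ctx.stream_priority_range();
		if (range.is_trivial()) {
			// stream prioritization is not supported in this context;
			// all streams get the default priority
		}
		else {
			// range.greatest is the numerically-lowest value, i.e. the highest priority
		}
	}
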
+ */ +context_t wrap( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership = false) noexcept; + +context_t from_handle( + context::handle_t context_handle, + bool take_ownership = false); + +inline size_t total_memory(handle_t handle) +{ + size_t total_mem_in_bytes; + auto status = cuMemGetInfo(nullptr, &total_mem_in_bytes); + throw_if_error(status, "Failed determining amount of total memory for " + identify(handle)); + return total_mem_in_bytes; + +} + +inline size_t free_memory(handle_t handle) +{ + size_t free_mem_in_bytes; + auto status = cuMemGetInfo(&free_mem_in_bytes, nullptr); + throw_if_error(status, "Failed determining amount of free memory for " + identify(handle)); + return free_mem_in_bytes; +} + +inline void set_cache_preference(handle_t handle, multiprocessor_cache_preference_t preference) +{ + auto status = cuCtxSetCacheConfig(static_cast(preference)); + throw_if_error(status, + "Setting the multiprocessor L1/Shared Memory cache distribution preference to " + + ::std::to_string((unsigned) preference) + " for " + identify(handle)); +} + +inline multiprocessor_cache_preference_t cache_preference(handle_t handle) +{ + CUfunc_cache preference; + auto status = cuCtxGetCacheConfig(&preference); + throw_if_error(status, + "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + identify(handle)); + return (multiprocessor_cache_preference_t) preference; +} + +inline shared_memory_bank_size_t shared_memory_bank_size(handle_t handle) +{ + CUsharedconfig bank_size; + auto status = cuCtxGetSharedMemConfig(&bank_size); + throw_if_error(status, "Obtaining the multiprocessor shared memory bank size for " + identify(handle)); + return static_cast(bank_size); +} + +inline void set_shared_memory_bank_size(handle_t handle, shared_memory_bank_size_t bank_size) +{ + auto status = cuCtxSetSharedMemConfig(static_cast(bank_size)); + throw_if_error(status, "Setting the multiprocessor shared memory bank size for " + identify(handle)); +} + +inline void synchronize(context::handle_t handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(handle); + context::current::detail_::synchronize(handle); +} + +inline void synchronize(device::id_t device_id, context::handle_t handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(handle); + context::current::detail_::synchronize(device_id, handle); +} + +inline void destroy(handle_t handle) +{ + auto status = cuCtxDestroy(handle); + throw_if_error(status, "Failed destroying " + identify(handle)); +} + +inline void destroy(handle_t handle, device::id_t device_index) +{ + auto status = cuCtxDestroy(handle); + throw_if_error(status, "Failed destroying " + identify(handle, device_index)); +} + +inline context::flags_t get_flags(handle_t handle) +{ + current::detail_::scoped_override_t set_context_for_this_scope{handle}; + return context::current::detail_::get_flags(); +} + +} // namespace detail_ + +} // namespace context + +inline void synchronize(const context_t& context); + +/** + * @brief Wrapper class for a CUDA context + * + * Use this class - built around a context id - to perform all + * context-related operations the CUDA Driver (or, in fact, Runtime) API is capable of. + * + * @note By default this class has RAII semantics, i.e. 
it creates a + * context on construction and destroys it on destruction, and isn't merely + * an ephemeral wrapper one could apply and discard; but this second kind of + * semantics is also supported, through the @ref context_t::holds_refcount_unit_ field. + * + * @note A context is a specific to a device; see, therefore, also @ref device_t . + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to properties of the context is a const-respecting operation on this class. + */ +class context_t { +public: // types + using scoped_setter_type = context::current::detail_::scoped_override_t; + using flags_type = context::flags_t; + + static_assert( + ::std::is_same< ::std::underlying_type::type, ::std::underlying_type::type >::value, + "Unexpected difference between enumerators used for the same purpose by the CUDA runtime and the CUDA driver"); + +public: // inner classes + + /** + * @brief A class to create a faux member in a @ref device_t, in lieu of an in-class + * namespace (which C++ does not support); whenever you see a function + * `my_dev.memory::foo()`, think of it as a `my_dev::memory::foo()`. + * + * TODO: Should this be made context-specific? + */ + class global_memory_type { + protected: // data members + const device::id_t device_id_; + const context::handle_t context_handle_; + + public: + global_memory_type(device::id_t device_id, context::handle_t context_handle) + : device_id_(device_id), context_handle_(context_handle) { } + ///@endcond + + device_t associated_device() const; + context_t associated_context() const; + + /** + * Allocate a region of memory on the device + * + * @param size_in_bytes size in bytes of the region of memory to allocate + * @return a non-null (device-side) pointer to the allocated memory + */ + memory::region_t allocate(size_t size_in_bytes); + + /** + * Allocates memory on the device whose pointer is also visible on the host, + * and possibly on other devices as well - with the same address. This is + * nVIDIA's "managed memory" mechanism. + * + * @note Managed memory isn't as "strongly associated" with a single device + * as the result of allocate(), since it can be read or written from any + * device or from the host. However, the actual space is allocated on + * some device, so its creation is a device (device_t) object method. + * + * @note for a more complete description see the + * CUDA Runtime API + * reference) + * + * @param size_in_bytes Size of memory region to allocate + * @param initial_visibility if this equals ,to_supporters_of_concurrent_managed_access\ only the host (and the + * allocating device) will be able to utilize the pointer returned; if false, + * it will be made usable on all CUDA devices on the systems. + * @return the allocated pointer; never returns null (throws on failure) + */ + memory::region_t allocate_managed( + size_t size_in_bytes, + cuda::memory::managed::initial_visibility_t initial_visibility = + cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access); + + /** + * Amount of total global memory on the CUDA device's primary context. + */ + size_t amount_total() const + { + scoped_setter_type set_context_for_this_scope(context_handle_); + return context::detail_::total_memory(context_handle_); + } + + /** + * Amount of free global memory on the CUDA device's primary context. 
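A minimal sketch of using the memory proxy object, relying only on the members shown in this class:

	void context_memory_example(const cuda::context_t& ctx)
	{
		auto mem = ctx.memory();
		auto free_bytes  = mem.amount_free();
		auto total_bytes = mem.amount_total();
		auto region = mem.allocate(1024 * 1024); // 1 MiB of device-global memory
		(void) free_bytes; (void) total_bytes; (void) region;
	}
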
+ */ + size_t amount_free() const { + scoped_setter_type set_context_for_this_scope(context_handle_); + return context::detail_::free_memory(context_handle_); + } + }; // class global_memory_type + + +public: // data member non-mutator getters + + /** + * The CUDA context ID this object is wrapping + */ + context::handle_t handle() const noexcept { return handle_; } + + /** + * The device with which this context is associated + */ + device::id_t device_id() const noexcept { return device_id_; } + device_t device() const; + + /** + * Is this wrapper responsible for having the wrapped CUDA context destroyed on destruction? + */ + bool is_owning() const noexcept { return owning_; } + + /** + * The amount of total global device memory available to this context, including + * memory already allocated. + */ + size_t total_memory() const { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::total_memory(handle_); + } + + /** + * The amount of unallocated global device memory available to this context + * and not yet allocated. + * + * @note It is not guaranteed that this entire amount can actually be succefully allocated. + */ + size_t free_memory() const { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::free_memory(handle_); + } + +public: // other non-mutator methods + + /** + * Determines the balance between L1 space and shared memory space set + * for kernels executing within this context. + */ + multiprocessor_cache_preference_t cache_preference() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::cache_preference(handle_); + } + + /** + * @return the stack size in bytes of each GPU thread + * + * @todo Is this really a feature of the context? Not of the device? + */ + size_t stack_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_STACK_SIZE); + } + + /** + * @return the size of the FIFO (first-in, first-out) buffer used by the printf() function available to device kernels + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t printf_buffer_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + } + + /** + * @return the size in bytes of the heap used by the malloc() and free() device system calls. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t memory_allocation_heap_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_MALLOC_HEAP_SIZE); + } + + /** + * @return the maximum grid depth at which a thread can issue the device + * runtime call `cudaDeviceSynchronize()` / `cuda::device::synchronize()` + * to wait on child grid launches to complete. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t maximum_depth_of_child_grid_synch_calls() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH); + } + + global_memory_type memory() const + { + return { device_id_, handle_ }; + } + + /** + * @return maximum number of outstanding device runtime launches that can be made from this context. + * + * @todo Is this really a feature of the context? Not of the device? 
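A minimal sketch of reading and adjusting per-context limits, using the getters above and the corresponding setters defined later in this class (which are const, since the wrapper is a reference type):

	void context_limits_example(const cuda::context_t& ctx)
	{
		auto stack_size       = ctx.stack_size();          // CU_LIMIT_STACK_SIZE
		auto printf_fifo_size = ctx.printf_buffer_size();  // CU_LIMIT_PRINTF_FIFO_SIZE
		ctx.stack_size(stack_size * 2);
		ctx.printf_buffer_size(printf_fifo_size);
	}
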
+ */ + context::limit_value_t maximum_outstanding_kernel_launches() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT); + } + + /** + * @return maximum granularity of fetching from the L2 cache + * + * @note A value between 0 and 128; it is apparently a "hint" somehow. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t l2_fetch_granularity() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY); + } + + /** + * @brief Returns the shared memory bank size, as described in + * this Parallel-for-all blog entry + * + * @return the shared memory bank size in bytes + */ + context::shared_memory_bank_size_t shared_memory_bank_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::shared_memory_bank_size(handle_); + } + + /** + * Determine if this context is the system's current CUDA context. + */ + bool is_current() const + { + return context::current::detail_::is_(handle_); + } + + /** + * Determine if this context is the primary context for its associated device. + */ + bool is_primary() const; + + /** + * + * @todo isn't this a feature of devices? + */ + context::stream_priority_range_t stream_priority_range() const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::stream_priority_range_t result; + auto status = cuCtxGetStreamPriorityRange(&result.least, &result.greatest); + throw_if_error(status, "Obtaining the priority range for streams within " + + context::detail_::identify(*this)); + return result; + } + + context::limit_value_t get_limit(context::limit_t limit_id) const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(limit_id); + } + + version_t api_version() const + { + unsigned int raw_version; + auto status = cuCtxGetApiVersion(handle_, &raw_version); + throw_if_error(status, "Failed obtaining the API version for " + context::detail_::identify(*this)); + return version_t::from_single_number((int) raw_version); + } + +protected: + context::flags_t flags() const + { + return context::detail_::get_flags(handle_); + } + + +public: // methods which mutate the context, but not its wrapper + /** + * Gets the synchronization policy to be used for threads synchronizing + * with this CUDA context. + * + * @note see @ref host_thread_synch_scheduling_policy_t + * for a description of the various policies. 
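A minimal sketch of inspecting the scheduling policy via the getter documented here, assuming the enumerator names used elsewhere in this patch (spin, yield, block, etc.):

	void synch_policy_example(const cuda::context_t& ctx)
	{
		using policy_t = cuda::context::host_thread_synch_scheduling_policy_t;
		if (ctx.synch_scheduling_policy() == policy_t::block) {
			// host threads synchronizing with this context's work will block
			// on an OS primitive rather than spin
		}
	}
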
+ */ + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy() const + { + return context::host_thread_synch_scheduling_policy_t(flags() & CU_CTX_SCHED_MASK); + } + + bool keeping_larger_local_mem_after_resize() const + { + return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; + } + + /** + * See @ref cuda::stream::create() + */ + stream_t create_stream( + bool will_synchronize_with_default_stream, + stream::priority_t priority = cuda::stream::default_priority); + + /** + * See @ref cuda::event::create() + */ + event_t create_event( + bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default + bool records_timing = event::do_record_timings, + bool interprocess = event::not_interprocess); + + module_t create_module(const void* module_data, link::options_t link_options) const; + module_t create_module(const void* module_data) const; + template + module_t create_module(ContiguousContainer module_data) const; + +public: // Methods which don't mutate the context, but affect the device itself + + + void enable_access_to(const context_t& peer) const; + + void disable_access_to(const context_t& peer) const; + + void reset_persisting_l2_cache() const + { + scoped_setter_type set_context_for_this_scope(handle_); +#if (CUDART_VERSION >= 11000) + auto status = cuCtxResetPersistingL2Cache(); + throw_if_error(status, "Failed resetting/clearing the persisting L2 cache memory"); +#endif + throw cuda::runtime_error( + cuda::status::insufficient_driver, + "Resetting/clearing the persisting L2 cache memory is not supported when compiling CUDA versions lower than 11.0"); + } + +public: // other methods which don't mutate this class as a reference, but do mutate the context + + /** + * @brief Sets the shared memory bank size, described in + * this Parallel-for-all blog entry + * + * @param bank_size the shared memory bank size to set + */ + void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::detail_::set_shared_memory_bank_size(handle_, bank_size); + } + + /** + * Controls the balance between L1 space and shared memory space for + * kernels executing within this context. 
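A minimal sketch of the stream and event factory methods declared above; cuda::stream::async is assumed, as in the examples earlier in this patch, to request a stream which does not synchronize with the default stream:

	void stream_and_event_example(cuda::context_t& ctx)
	{
		auto stream = ctx.create_stream(cuda::stream::async);
		auto event  = ctx.create_event(); // busy-waiting sync, timings recorded, not interprocess - per the defaults above
		// ... enqueue work on the stream, record the event on it ...
		stream.synchronize();
	}
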
+ * + * @param preference the preferred balance between L1 and shared memory + */ + void set_cache_preference(multiprocessor_cache_preference_t preference) const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::detail_::set_cache_preference(handle_, preference); + } + + void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::set_limit(limit_id, new_value); + } + + void stack_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_STACK_SIZE, new_value); + } + + void printf_buffer_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_value); + } + + void memory_allocation_heap_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_MALLOC_HEAP_SIZE, new_value); + } + + void set_maximum_depth_of_child_grid_synch_calls(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH, new_value); + } + + void set_maximum_outstanding_kernel_launches(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT, new_value); + } + + /** + * Have the calling thread wait - either busy-waiting or blocking - and + * return only after all pending actions within this context have concluded. + */ + void synchronize() const + { + cuda::synchronize(*this); + } + +protected: // constructors + + context_t( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership) noexcept + : device_id_(device_id), handle_(context_id), owning_(take_ownership) { } + +public: // friendship + + friend context_t context::detail_::wrap( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership) noexcept; + +public: // constructors and destructor + + context_t(const context_t& other) : + context_t(other.device_id_, other.handle_, false) { }; + + context_t(context_t&& other) noexcept : + context_t(other.device_id_, other.handle_, other.owning_) + { + other.owning_ = false; + }; + + ~context_t() { + if (owning_) { + cuCtxDestroy(handle_); + // Note: "Swallowing" any potential error to avoid std::terminate(); also, + // because the context cannot possibly exist after this call. + } + } + +public: // operators + + context_t& operator=(const context_t& other) + { + if (owning_) { + context::detail_::destroy(handle_); + } + device_id_ = other.device_id_; + handle_ = other.handle_; + owning_ = false; + return *this; + } + + // Deleted since the handle_t and handle_t are constant + context_t& operator=(context_t&& other) noexcept + { + ::std::swap(device_id_, other.device_id_); + ::std::swap(handle_, other.handle_); + ::std::swap(owning_, other.owning_); + return *this; + } + +protected: // data members + device::id_t device_id_; + context::handle_t handle_; + bool owning_; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered + + // TODO: Should we hold a field indicating whether this context is + // primary or not? 
+}; + +inline bool operator==(const context_t& lhs, const context_t& rhs) +{ + return lhs.handle() == rhs.handle(); +} + +inline bool operator!=(const context_t& lhs, const context_t& rhs) +{ + return lhs.handle() != rhs.handle(); +} + +namespace context { + +namespace detail_ { + +/** + * Obtain a wrapper for an already-existing CUDA context + * + * @note This is a named constructor idiom instead of direct access to the ctor of the same + * signature, to emphase what this construction means - a new context is _not_ + * created. + * + * @param device_id Device with which the context is associated + * @param context_id id of the context to wrap with a proxy + * @param take_ownership when true, the wrapper will have the CUDA driver destroy + * the cuntext when the wrapper itself destruct; otherwise, it is assumed + * that the context is "owned" elsewhere in the code, and that location or entity + * is responsible for destroying it when relevant (possibly after this wrapper + * ceases to exist) + * @return a context wrapper associated with the specified context + */ +inline context_t wrap( + device::id_t device_id, + handle_t context_id, + bool take_ownership) noexcept +{ + return { device_id, context_id, take_ownership }; +} + +inline context_t from_handle( + context::handle_t context_handle, + bool take_ownership) +{ + device::id_t device_id = get_device_id(context_handle); + return wrap(device_id, context_handle, take_ownership); +} + +inline handle_t create_and_push( + device::id_t device_id, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = automatic, + bool keep_larger_local_mem_after_resize = false) +{ + auto flags = context::detail_::make_flags( + synch_scheduling_policy, + keep_larger_local_mem_after_resize); + handle_t handle; + auto status = cuCtxCreate(&handle, flags, device_id); + cuda::throw_if_error(status, "failed creating a CUDA context associated with " + + device::detail_::identify(device_id)); + return handle; +} + +} // namespace detail_ + +/** + * @brief creates a new context on a given device + * + * @param device The device on which to create the new stream + * @param synch_scheduling_policy + * @param keep_larger_local_mem_after_resize + * @return + * @note Until CUDA 11, there used to also be a flag for enabling/disabling + * the ability of mapping pinned host memory to device addresses. However, it was + * being ignored since CUDA 3.2 already, with the minimum CUDA version supported + * by these wrappers being later than that, so - no sense in keeping it. + */ +context_t create( + device_t device, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic, + bool keep_larger_local_mem_after_resize = false); + +context_t create_and_push( + device_t device, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic, + bool keep_larger_local_mem_after_resize = false); + +namespace current { + +/** + * Determine whether any CUDA context is current, or whether the context stack is empty + */ +inline bool exists() +{ + return (detail_::get_handle() != context::detail_::none); +} + +/** + * Obtain the current CUDA context, if one exists. 
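A minimal sketch of checking for, and obtaining, the current context; the wrapper returned by get() does not take ownership, so nothing is destroyed when it goes out of scope:

	void current_context_example()
	{
		if (cuda::context::current::exists()) {
			auto ctx = cuda::context::current::get();
			auto free_bytes = ctx.free_memory();
			(void) free_bytes;
		}
	}
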
+ * + * @throws ::std::runtime_error in case there is no current context + */ +inline context_t get() +{ + auto handle = detail_::get_handle(); + if (handle == context::detail_::none) { + throw ::std::runtime_error("Attempt to obtain the current CUDA context when no context is current."); + } + return context::detail_::from_handle(handle); +} + +inline void set(const context_t& context) +{ + return detail_::set(context.handle()); +} + +inline bool push_if_not_on_top(const context_t& context) +{ + return context::current::detail_::push_if_not_on_top(context.handle()); +} + +inline void push(const context_t& context) +{ + return context::current::detail_::push(context.handle()); +} + +inline context_t pop() +{ + constexpr const bool do_not_take_ownership { false }; + // Unfortunately, since we don't store the device IDs of contexts + // on the stack, this incurs an extra API call beyond just the popping... + auto handle = context::current::detail_::pop(); + auto device_id = context::detail_::get_device_id(handle); + return context::detail_::wrap(device_id, handle, do_not_take_ownership); +} + +namespace detail_ { + +/** + * If now current context exists, push the current device's primary context onto the stack + */ +handle_t push_default_if_missing(); + +/** + * Ensures that a current context exists by pushing the current device's primary context + * if necessary, and returns the current context + * + * @throws ::std::runtime_error in case there is no current context + */ +inline context_t get_with_fallback_push() +{ + auto handle = push_default_if_missing(); + return context::detail_::from_handle(handle); +} + + +} // namespace detail_ + +} // namespace current + +bool is_primary(const context_t& context); + +namespace detail_ { + +inline ::std::string identify(const context_t& context) +{ + return identify(context.handle(), context.device_id()); +} + +} // namespace detail_ + +} // namespace context + +inline void synchronize(const context_t& context) +{ + context::detail_::synchronize(context.device_id(), context.handle()); +} + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_CONTEXT_HPP_ diff --git a/src/cuda/api/current_context.hpp b/src/cuda/api/current_context.hpp new file mode 100644 index 00000000..33d47f9a --- /dev/null +++ b/src/cuda/api/current_context.hpp @@ -0,0 +1,241 @@ +/** + * @file current_context.hpp + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ +#define CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ + +#include +#include +#include + +#include + +namespace cuda { + +///@cond +class device_t; +class context_t; +///@endcond + +namespace context { + +namespace current { + +namespace detail_ { + +/** + * Returns a raw handle for the current CUDA context + * + * @return the raw handle from the CUDA driver - if one exists; none + * if no context is current/active. + */ +inline bool is_(handle_t handle) +{ + handle_t current_context_handle; + auto status = cuCtxGetCurrent(¤t_context_handle); + switch(status) { + case CUDA_ERROR_NOT_INITIALIZED: + case CUDA_ERROR_INVALID_CONTEXT: + return false; + case CUDA_SUCCESS: + return (handle == current_context_handle); + default: + throw cuda::runtime_error((status_t) status, "Failed determining whether there's a current context, or what it is"); + } +} + +/** + * Returns a raw handle for the current CUDA context + * + * @return the raw handle from the CUDA driver - if one exists; none + * if no context is current/active. 
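A minimal sketch of explicit manipulation of the context stack, using the push() and pop() wrappers defined earlier in this patch (in context.hpp):

	void push_pop_example(const cuda::context_t& ctx)
	{
		cuda::context::current::push(ctx);
		// ... allocate, launch and synchronize within ctx ...
		auto popped = cuda::context::current::pop();
		// popped wraps the same raw context as ctx, without taking ownership of it
		(void) popped;
	}
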
+ */ +inline handle_t get_handle() +{ + handle_t handle; + auto status = cuCtxGetCurrent(&handle); + throw_if_error(status, "Failed obtaining the current context's handle"); + return handle; +} + +// Note: not calling this get_ since flags are read-only anyway +inline context::flags_t get_flags() +{ + context::flags_t result; + auto status = cuCtxGetFlags(&result); + throw_if_error(status, "Failed obtaining the current context's flags"); + return result; +} + +inline device::id_t get_device_id() +{ + device::id_t device_id; + auto result = cuCtxGetDevice(&device_id); + throw_if_error(result, "Failed obtaining the current context's device"); + return device_id; +} + +} // namespace detail_ + +inline bool exists(); +inline context_t get(); +inline void set(const context_t& context); + +namespace detail_ { + +/** + * Push a context handle onto the top of the context stack - if it is not already on the + * top of the stack + * + * @param context_handle A context handle to push + * + * @note behavior undefined if you try to push @ref none + */ +inline void push(handle_t context_handle) +{ + auto status = cuCtxPushCurrent(context_handle); + throw_if_error(status, + "Failed pushing to the top of the context stack: " + context::detail_::identify(context_handle)); +} + +/** + * Push a context handle onto the top of the context stack - if it is not already on the + * top of the stack + * + * @param context_handle A context handle to push + * + * @return true if a push actually occurred + * + * @note behavior undefined if you try to push @ref none + * @note The CUDA context stack is not a proper stack, in that it doesn't allow multiple + * consecutive copes of the same context on the stack; hence there is no `push()` method. + */ +inline bool push_if_not_on_top(handle_t context_handle) +{ + if (detail_::get_handle() == context_handle) { return false; } + push(context_handle); return true; +} + +inline context::handle_t pop() +{ + handle_t popped_context_handle; + auto status = cuCtxPopCurrent(&popped_context_handle); + throw_if_error(status, "Failed popping the current CUDA context"); + return popped_context_handle; +} + +inline void set(handle_t context_handle) +{ + // TODO: Would this help? + // if (detail_::get_handle() == context_handle_) { return; } + auto status = static_cast(cuCtxSetCurrent(context_handle)); + throw_if_error(status, + "Failed setting the current context to " + context::detail_::identify(context_handle)); +} + +/** + * @note See the out-of-`detail_::` version of this class. 
+ * + */ +class scoped_override_t { +protected: +public: + explicit scoped_override_t(handle_t context_handle) { push(context_handle); } + ~scoped_override_t() { pop(); } + +// explicit scoped_context_override_t(handle_t context_handle_) : +// did_push(push_if_not_on_top(context_handle_)) { } +// scoped_context_override_terride_t() { if (did_push) { pop(); } } +// +//protected: +// bool did_push; +}; + +class scoped_ensurer_t { +public: + bool push_needed; + + explicit scoped_ensurer_t(handle_t fallback_context_handle) : push_needed(not exists()) + { + if (push_needed) { push(fallback_context_handle); } + } + ~scoped_ensurer_t() { if (push_needed) { pop(); } } +}; + +class scoped_current_device_fallback_t; + +} // namespace detail_ + +/** + * A RAII-based mechanism for pushing a context onto the context stack + * for what remains of the current (C++ language) scope - making it the + * current context - then popping it back when exiting the scope - + * restoring the stack and the current context to what they had been + * previously. + * + * @note if some other code pushes/pops from the context stack during + * the lifetime of this class, the pop-on-destruction may fail, or + * succeed but pop some other context handle than the one originally. + * pushed. + * + */ +class scoped_override_t : private detail_::scoped_override_t { +protected: + using parent = detail_::scoped_override_t; +public: + explicit scoped_override_t(const context_t& device); + explicit scoped_override_t(context_t&& device); + ~scoped_override_t() = default; +}; + +/** + * This macro will set the current device for the remainder of the scope in which it is + * invoked, and will change it back to the previous value when exiting the scope. Use + * it as an opaque command, which does not explicitly expose the variable defined under + * the hood to effect this behavior. 
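A minimal sketch of scope-limited overriding of the current context, using the RAII class above and the convenience macro documented here (and defined just below):

	void scoped_override_example(const cuda::context_t& ctx)
	{
		{
			cuda::context::current::scoped_override_t context_for_this_scope{ctx};
			// everything in this scope - allocations, launches, synchronization - sees ctx as current
		} // the previous top of the context stack is restored here
		{
			CUDA_CONTEXT_FOR_THIS_SCOPE(ctx);
			// same effect, without explicitly naming the override variable
		}
	}
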
+ */ +#define CUDA_CONTEXT_FOR_THIS_SCOPE(_cuda_context_ctor_argument) \ + ::cuda::context::current::scoped_override_t scoped_device_override( ::cuda::context_t(_cuda_context_ctor_argument) ) + + +inline bool push_if_not_on_top(const context_t& context); +inline void push(const context_t& context); + +inline void synchronize() +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status, "Failed synchronizing current context"); + } +} + +namespace detail_ { + +inline void synchronize(context::handle_t handle) +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status,"Failed synchronizing " + + context::detail_::identify(handle)); + } +} + +inline void synchronize(device::id_t device_id, context::handle_t handle) +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status, "Failed synchronizing " + + context::detail_::identify(handle, device_id)); + } +} + +} // namespace detail + +} // namespace current + +} // namespace context + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp index 78e1b384..597cd101 100644 --- a/src/cuda/api/current_device.hpp +++ b/src/cuda/api/current_device.hpp @@ -21,9 +21,9 @@ #define CUDA_API_WRAPPERS_CURRENT_DEVICE_HPP_ #include -#include #include -#include +#include +#include #include @@ -44,22 +44,71 @@ namespace detail_ { */ inline id_t get_id() { - id_t device; - status_t result = cudaGetDevice(&device); - throw_if_error(result, "Failure obtaining current device index"); - return device; + constexpr const id_t default_device_id { 0 }; + context::handle_t current_context_handle; + auto status = cuCtxGetCurrent(¤t_context_handle); + if (status == CUDA_ERROR_NOT_INITIALIZED) { + initialize_driver(); + // Should we activate and push the default device's context? probably not. + return default_device_id; + } + throw_if_error(status, "Failed obtaining the current context for determining which " + "device is active"); + + if (current_context_handle == context::detail_::none) { + // Should we activate and push the default device's context? probably not. + return default_device_id; + } + return cuda::context::current::detail_::get_device_id(); + // ... which is the equivalent of doing: + // +// handle_t device_id; +// auto status = cudaGetDevice(&device_id); +// throw_if_error(status, "Failure obtaining current device id"); +// return device_id; } /** * Set a device as the current one for the CUDA Runtime API (so that API calls * not specifying a device apply to it.) * + * @note This replaces the current CUDA context (rather than pushing a context + * onto the stack), so use with care. + * + * @note This causes a primary context for the device to be created, if it + * doesn't already exist. I'm not entirely sure regarding the conditions under + * which it will be destroyed, however. + * * @param[in] device Numeric ID of the device to make current */ -inline void set(id_t device) +inline void set(id_t device_id) { - status_t result = cudaSetDevice(device); - throw_if_error(result, "Failure setting current device to " + ::std::to_string(device)); + context::handle_t current_context_handle; + bool have_current_context; + auto status = cuCtxGetCurrent(¤t_context_handle); + if (status == CUDA_ERROR_NOT_INITIALIZED) { + initialize_driver(); + // Should we activate and PUSH the default device's context? probably not. 
+ have_current_context = false; + } + else { + have_current_context = (current_context_handle != context::detail_::none); + } + if (have_current_context) { + auto current_context_device_id = context::detail_::get_device_id(current_context_handle); + if (current_context_device_id == device_id) { + return; + } + } + auto device_pc_is_active = device::primary_context::detail_::is_active(device_id); + bool need_refcount_increase = not device_pc_is_active; + auto dev_pc_handle = device::primary_context::detail_::get_handle(device_id, need_refcount_increase); + context::current::detail_::set(dev_pc_handle); + + + // ... which is the equivalent of doing: + // auto status = cudaSetDevice(device_id); + // throw_if_error(status, "Failure setting current device to " + ::std::to_string(device_id)); } /** @@ -67,6 +116,9 @@ inline void set(id_t device) * * @param[in] device_ids Numeric IDs of the devices to try and make current, in order * @param[in] num_devices The number of device IDs pointed to by @device_ids + * + * @note this replaces the current CUDA context (rather than pushing a context + * onto the stack), so use with care. */ inline void set(const id_t* device_ids, size_t num_devices) { @@ -80,25 +132,55 @@ inline void set(const id_t* device_ids, size_t num_devices) /** * @note See the out-of-`detail_::` version of this class. + * + * @note Perhaps it would be better to keep a copy of the current context ID in a + * member of this class, instead of on the stack? + * + * @note we have no guarantee that the context stack is not altered during + * the lifetime of this object; but - we assume it wasn't, and it's up to the users + * of this class to assure that's the case or face the consequences. + * + * @note We don't want to use the cuda::context::detail_scoped_override_t + * as the implementation, since we're not simply pushing and popping */ -class scoped_override_t { -protected: - static id_t replace(id_t new_device_id) + +class scoped_context_override_t { +public: + explicit scoped_context_override_t(id_t device_id) : + device_id_(device_id), + refcount_was_nonzero(device::primary_context::detail_::is_active(device_id)) { - id_t previous_device_id = device::current::detail_::get_id(); - device::current::detail_::set(new_device_id); - return previous_device_id; + auto top_of_context_stack = context::current::detail_::get_handle(); + if (top_of_context_stack != context::detail_::none) { + context::current::detail_::push(top_of_context_stack); // Yes, we're pushing a copy of the same context + } + device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack + primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id); + +// auto top_of_context_stack = context::current::detail_::get_handle(); +// device::current::detail_::set(device_id); // ... 
which now gets overwritten at the top of the stack +// primary_context = device::primary_context::detail_::get_handle(device_id); +// context::current::detail_::push(primary_context); } + ~scoped_context_override_t() { + context::current::detail_::pop(); +//#else +// auto popped_context_handle = context::current::detail_::pop(); +// if (popped_context_handle != primary_context_handle) { +// throw ::std::logic_error("Expected the top of the context stack to hold the primary context of " +// + device::detail_::identify(device_id_)); +// } +//#endif + if (refcount_was_nonzero) { + device::primary_context::detail_::decrease_refcount(device_id_); + // We intentionally "leak" a refcount, as otherwise, the primary context + // gets destroyed after we have created it - and we don't want that happening. + } -public: - scoped_override_t(id_t new_device_id) : previous_device_id(replace(new_device_id)) { } - ~scoped_override_t() { - // Note that we have no guarantee that the current device was not - // already replaced while this object was in scope; but - that's life. - replace(previous_device_id); } -private: - id_t previous_device_id; + device::id_t device_id_; + primary_context::handle_t primary_context_handle; + bool refcount_was_nonzero; }; @@ -109,26 +191,21 @@ class scoped_override_t { */ inline void set_to_default() { return detail_::set(device::default_device_id); } -/** - * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. - */ -inline device_t get(); - -/** - * Tells the CUDA runtime API to consider the specified device as the current one. - */ -inline void set(device_t device); +void set(const device_t& device); /** - * A RAII-based mechanism for setting the CUDA Runtime API's current device for + * A RAII-like mechanism for setting the CUDA Runtime API's current device for * what remains of the current scope, and changing it back to its previous value * when exiting the scope. + * + * @note The description says "RAII-like" because the reality is more complex. The + * runtime API sets a device by overwriting the top of the context stack with that + * device's primary context, rather than pushing onto the stack; the implementation + * above therefore duplicates the previous stack top before setting, so that popping + * on destruction restores the prior state. */ -class scoped_override_t : private detail_::scoped_override_t { +class scoped_override_t : private detail_::scoped_context_override_t { protected: - using parent = detail_::scoped_override_t; + using parent = detail_::scoped_context_override_t; public: - scoped_override_t(device_t& device); + scoped_override_t(const device_t& device); scoped_override_t(device_t&& device); ~scoped_override_t() = default; }; diff --git a/src/cuda/api/detail/device_properties.hpp b/src/cuda/api/detail/device_properties.hpp index 4c847fb9..6ffbeaba 100644 --- a/src/cuda/api/detail/device_properties.hpp +++ b/src/cuda/api/detail/device_properties.hpp @@ -1,5 +1,5 @@ /** - * @file detail_/device_properties.hpp + * @file detail/device_properties.hpp * * @brief Implementation of methods and helper functions for device-property-related classes.
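// Illustrative sketch (editorial addition, not from the patched sources) of the RAII-like
// current-device override described above: within the block, the chosen device's primary
// context is current; on scope exit, the previously-current context is restored.
#include <cuda/api.hpp>

void scoped_device_override_example()
{
	auto device = cuda::device::get(0);
	{
		cuda::device::current::scoped_override_t device_for_this_scope{ device };
		// Calls which implicitly target "the current device" now pertain to device 0
		cuda::device::current::get().synchronize();
	}
}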
* @@ -56,7 +56,7 @@ template inline T ensure_arch_property_validity(T v, const compute_architecture_t& arch) { if (v == detail_::invalid_architecture_return) { - throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); + throw ::std::invalid_argument("No known architecture numbered " + ::std::to_string(arch.major)); } return v; } @@ -65,7 +65,7 @@ template <> inline const char* ensure_arch_property_validity(const char* v, const compute_architecture_t& arch) { if (v == nullptr) { - throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); + throw ::std::invalid_argument("No known architecture numbered " + ::std::to_string(arch.major)); } return v; } @@ -86,25 +86,30 @@ inline constexpr const char* architecture_name(const compute_architecture_t& arc nullptr; } +/** + * @note Remember that regardless of the value you get from this function, + * to use more than 48 KiB per block you may need a call such as: + * + * cudaFuncSetAttribute( + * my_kernel, + * cudaFuncAttributePreferredSharedMemoryCarveout, + * cudaSharedmemCarveoutMaxShared + * ); + * + * for details, see the CUDA Programming Guide, section K.7.3 + */ inline constexpr memory::shared::size_t max_shared_memory_per_block(const compute_architecture_t& arch) { return (arch.major == 1) ? 16 * KiB : (arch.major == 2) ? 48 * KiB : (arch.major == 3) ? 48 * KiB : - // Note: No architecture number 4! + // Note: No architecture number 4! (arch.major == 5) ? 48 * KiB : (arch.major == 6) ? 48 * KiB : - (arch.major == 7) ? 96 * KiB : - // this is the Volta figure, Turing is different. Also, values above 48 require a call such as: - // - // cudaFuncSetAttribute( - // my_kernel, - // cudaFuncAttributePreferredSharedMemoryCarveout, - // cudaSharedmemCarveoutMaxShared - // ); - // - // for details, see the CUDA C Programming Guide. + (arch.major == 7) ? 96 * KiB : // of 128 + // this is the Volta figure, Turing is different. + (arch.major == 8) ? 163 * KiB : // of 192 invalid_architecture_return; } @@ -118,6 +123,7 @@ inline unsigned max_resident_warps_per_processor(const compute_architecture_t& a (arch.major == 5) ? 64 : (arch.major == 6) ? 64 : (arch.major == 7) ? 64 : // this is the Volta figure, Turing is different + (arch.major == 8) ? 64 : invalid_architecture_return; } @@ -131,6 +137,7 @@ inline unsigned max_warp_schedulings_per_processor_cycle(const compute_architect (arch.major == 5) ? 4 : (arch.major == 6) ? 4 : (arch.major == 7) ? 4 : + (arch.major == 8) ? 4 : invalid_architecture_return; } @@ -144,6 +151,7 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_arch (arch.major == 5) ? 128 : (arch.major == 6) ? 128 : (arch.major == 7) ? 128 : // this is the Volta figure, Turing is different + (arch.major == 8) ? 2048 : invalid_architecture_return; } @@ -232,6 +240,7 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_capa cc.as_combined_number() == 21 ? 48 : cc.as_combined_number() == 60 ? 64 : cc.as_combined_number() == 75 ? 64 : + cc.as_combined_number() == 86 ? 
1536 : max_in_flight_threads_per_processor(cc.architecture); } @@ -243,11 +252,24 @@ inline constexpr unsigned max_warp_schedulings_per_processor_cycle(const compute max_warp_schedulings_per_processor_cycle(cc.architecture); } +/** + * @note Remember that regardless of the value you get from this function, + * to use more than 48 KiB per block you may need a call such as: + * + * cudaFuncSetAttribute( + * my_kernel, + * cudaFuncAttributePreferredSharedMemoryCarveout, + * cudaSharedmemCarveoutMaxShared + * ); + * + * for details, see the CUDA Programming Guide, section K.7.3 + */ inline constexpr unsigned max_shared_memory_per_block(const compute_capability_t& cc) { return - cc.as_combined_number() == 37 ? 112 * KiB : - cc.as_combined_number() == 75 ? 64 * KiB : + cc.as_combined_number() == 37 ? 112 * KiB : // of 112 + cc.as_combined_number() == 75 ? 64 * KiB : // of 96 + cc.as_combined_number() == 86 ? 99 * KiB : // of 128 max_shared_memory_per_block(cc.architecture); } @@ -259,6 +281,7 @@ inline constexpr unsigned max_resident_warps_per_processor(const compute_capabil cc.as_combined_number() == 12 ? 32 * KiB : cc.as_combined_number() == 13 ? 32 * KiB : cc.as_combined_number() == 75 ? 32 * KiB : + cc.as_combined_number() == 86 ? 48 * KiB : max_resident_warps_per_processor(cc.architecture); } @@ -295,6 +318,25 @@ inline bool properties_t::usable_for_compute() const noexcept } // namespace device } // namespace cuda +namespace std { + + template <> + struct hash + { + ::std::size_t operator()(const cuda::device::compute_capability_t& cc) const + { + using ::std::hash; + + // Compute individual hash values for first, + // second and third and combine them using XOR + // and bit shifting: + + return hash()(cc.major()) ^ (hash()(cc.minor()) << 1); + } + }; + +} // namespace std + ///@endcond #endif // CUDA_API_WRAPPERS_DETAIL_DEVICE_PROPERTIES_HPP_ diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index fecc6d67..3a80dce7 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -9,15 +9,18 @@ #ifndef CUDA_API_WRAPPERS_DEVICE_HPP_ #define CUDA_API_WRAPPERS_DEVICE_HPP_ +#include #include #include #include #include -#include +#include +#include #include #include +#include #include namespace cuda { @@ -28,8 +31,27 @@ class stream_t; class device_t; ///@endcond +/** + * @brief Waits for all previously-scheduled tasks on all streams (= queues) + * on a specified device to conclude. + * + * Depending on the host_thread_synch_scheduling_policy_t set for this + * device, the thread calling this method will either yield, spin or block + * until all tasks scheduled previously scheduled on this device have been + * concluded. + */ +void synchronize(const device_t& device); + namespace device { +///@cond +class primary_context_t; +///@cendond + +using limit_t = context::limit_t; +using limit_value_t = context::limit_value_t; +using shared_memory_bank_size_t = context::shared_memory_bank_size_t; + namespace detail_ { /** @@ -41,99 +63,48 @@ namespace detail_ { * (see below) - but chose not too for consistency with other wrappers * and to avoid requiring multiple friend functions. */ -device_t wrap(id_t id) noexcept; +device_t wrap(id_t id, primary_context::handle_t primary_context_handle = context::detail_::none) noexcept; } -namespace peer_to_peer { - -/** - * @brief The value of type for all CUDA device "attributes"; see also @ref cuda::device::attribute_t. - */ -using attribute_value_t = int; - -/** - * @brief An identifier of a integral-numeric-value attribute of a CUDA device. 
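// Illustrative sketch (editorial addition, not from the patched sources): the std::hash
// specialization added in detail/device_properties.hpp above lets compute_capability_t
// serve as an unordered-container key - assuming an equality operator for
// compute_capability_t is available, as device_properties.hpp is expected to provide.
#include <cuda/api.hpp>
#include <unordered_set>
#include <vector>

std::unordered_set<cuda::device::compute_capability_t>
distinct_compute_capabilities(const std::vector<cuda::device_t>& devices)
{
	std::unordered_set<cuda::device::compute_capability_t> capabilities;
	for (const auto& device : devices) {
		capabilities.insert(device.compute_capability());
	}
	return capabilities;
}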
- * - * @note Somewhat annoyingly, CUDA devices have attributes, properties and flags. - * Attributes have integral number values; properties have all sorts of values, - * including arrays and limited-length strings (see - * @ref cuda::device::properties_t), and flags are either binary or - * small-finite-domain type fitting into an overall flags value (see - * @ref cuda::device_t::flags_t). Flags and properties are obtained all at once, - * attributes are more one-at-a-time. - */ -using attribute_t = cudaDeviceP2PAttr; +using stream_priority_range_t = context::stream_priority_range_t; -/** - * Aliases for all CUDA device attributes - */ -enum : ::std::underlying_type::type { - link_performance_rank = cudaDevP2PAttrPerformanceRank, /**< A relative value indicating the performance of the link between two devices */ //!< link_performance_rank - access_support = cudaDevP2PAttrAccessSupported, /**< 1 if access is supported, 0 otherwise */ //!< access_support - native_atomics_support = cudaDevP2PAttrNativeAtomicSupported /**< 1 if the first device can perform native atomic operations on the second device, 0 otherwise *///!< native_atomics_support -}; +namespace detail_ { -/** - * @brief Get one of the numeric attributes for a(n ordered) pair of devices, - * relating to their interaction - * - * @note This is the device-pair equivalent of @ref device_t::get_attribute() - * - * @param attribute identifier of the attribute of interest - * @param source source device - * @param destination destination device - * @return the numeric attribute value - */ -inline attribute_value_t get_attribute(attribute_t attribute, id_t source, id_t destination) +inline ::std::string get_name(id_t id) { - attribute_value_t value; - auto status = cudaDeviceGetP2PAttribute(&value, attribute, source, destination); - throw_if_error(status, - "Failed obtaining peer-to-peer device attribute for device pair (" + ::std::to_string(source) + ", " - + ::std::to_string(destination) + ')'); - return value; + initialize_driver(); + using size_type = int; // Yes, an int, that's what cuDeviceName takes + constexpr const size_type initial_size_reservation { 100 }; + constexpr const size_type larger_size { 1000 }; // Just in case + char stack_buffer[initial_size_reservation]; + char* buffer = stack_buffer; + auto buffer_size = (size_type) (sizeof(stack_buffer) / sizeof(char)); + auto try_getting_name = [&](char* buffer, size_type buffer_size) -> size_type { + auto status = cuDeviceGetName(buffer, buffer_size-1, id); + throw_if_error(status, "Failed obtaining the CUDA device name"); + buffer[buffer_size-1] = '\0'; + return (size_type) ::std::strlen(buffer); + }; + auto prospective_name_length = try_getting_name(buffer, initial_size_reservation); + if (prospective_name_length >= buffer_size - 1) { + // This should really not happen, but just for the off chance... + if (buffer != stack_buffer) { delete buffer; } + buffer = new char[larger_size]; + prospective_name_length = try_getting_name(buffer, buffer_size); + } + if (prospective_name_length >= buffer_size - 1) { + throw ::std::runtime_error("CUDA device name longer than expected maximum size " + ::std::to_string(larger_size)); + } + return { buffer, (::std::size_t) prospective_name_length }; } -attribute_value_t get_attribute( - attribute_t attribute, - device_t source, - device_t destination); - -} // namespace peer_to_peer - -/** - * A range of priorities supported by a CUDA device; ranges from the - * higher numeric value to the lower. 
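// Illustrative sketch (editorial addition, not from the patched sources): device_t::name(),
// declared later in this file, is backed by the driver-based detail_::get_name() above.
// Enumerating devices by index and name might look like this; device::count() is assumed
// to be available, as referenced by devices.hpp further down in this patch.
#include <cuda/api.hpp>
#include <iostream>

void list_device_names()
{
	for (cuda::device::id_t id = 0; id < cuda::device::count(); id++) {
		std::cout << "Device " << id << ": " << cuda::device::get(id).name() << '\n';
	}
}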
- */ -struct stream_priority_range_t { - stream::priority_t least; /// Higher numeric value, lower priority - stream::priority_t greatest; /// Lower numeric value, higher priority - - /** - * When true, stream prioritization is not supported, i.e. all streams have - * "the same" priority - the default one. - */ - constexpr bool is_trivial() const { - return least == stream::default_priority and greatest == stream::default_priority; - } -}; +} // namespace detail } // namespace device - /** - * @brief Suspends execution until all previously-scheduled tasks on - * the specified device (all contexts, all streams) have concluded. - * - * Depending on the host_thread_synch_scheduling_policy_t set for this - * device, the thread calling this method will either yield, spin or block - * until this completion. - */ -inline void synchronize(device_t device); - -/** - * @brief Proxy class for a CUDA device + * @brief Wrapper class for a CUDA device * * Use this class - built around a device ID, or for the current device - to * perform almost, if not all, device-related operations, as opposed to passing @@ -141,111 +112,29 @@ inline void synchronize(device_t device); * * @note this is one of the three main classes in the Runtime API wrapper library, * together with @ref cuda::stream_t and @ref cuda::event_t + * + * @note obtaining device LUID's is not supported (those are too graphics-specific) + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to properties of the device is a const-respecting operation on this class. */ class device_t { public: // types using properties_t = device::properties_t; using attribute_value_t = device::attribute_value_t; - using limit_t = size_t; + using flags_type = device::flags_t; - using resource_id_t = cudaLimit; + // TODO: Consider a scoped/unscoped dichotomy + context_t::global_memory_type memory() const { return primary_context(unscoped_).memory(); } protected: // types - using scoped_setter_t = device::current::detail_::scoped_override_t; - using flags_t = unsigned; - -public: // types - - /** - * @brief A class to create a faux member in a @ref device_t, in lieu of an in-class - * namespace (which C++ does not support); whenever you see a function - * `my_dev.memory::foo()`, think of it as a `my_dev::memory::foo()`. - */ - class global_memory_t { - protected: - const device::id_t device_id_; - - using deleter = memory::device::detail_::deleter; - using allocator = memory::device::detail_::allocator; - - public: - ///@cond - explicit global_memory_t(device::id_t id) : device_id_(id) { } - ///@endcond - - cuda::device_t associated_device() const { return device::detail_::wrap(device_id_); } - - /** - * Allocate a region of memory on the device - * - * @param size_in_bytes size in bytes of the region of memory to allocate - * @return a non-null (device-side) pointer to the allocated memory - */ - memory::region_t allocate(size_t size_in_bytes) const - { - scoped_setter_t set_device_for_this_scope(device_id_); - return memory::device::detail_::allocate(size_in_bytes); - } - - // Perhaps drop this? it should really go into a managed namespace - using initial_visibility_t = cuda::memory::managed::initial_visibility_t; - - /** - * Allocates memory on the device whose pointer is also visible on the host, - * and possibly on other devices as well - with the same address. This is - * nVIDIA's "managed memory" mechanism. 
- * - * @note Managed memory isn't as "strongly associated" with a single device - * as the result of allocate(), since it can be read or written from any - * device or from the host. However, the actual space is allocated on - * some device, so its creation is a device (device_t) object method. - * - * @note for a more complete description see the - * CUDA Runtime API - * reference) - * - * @param size_in_bytes Size of memory region to allocate - * @param initial_visibility if this equals ,to_supporters_of_concurrent_managed_access\ only the host (and the - * allocating device) will be able to utilize the pointer returned; if false, - * it will be made usable on all CUDA devices on the systems. - * @return the allocated pointer; never returns null (throws on failure) - */ - memory::region_t allocate_managed( - size_t size_in_bytes, - initial_visibility_t initial_visibility = - initial_visibility_t::to_supporters_of_concurrent_managed_access) const - { - scoped_setter_t set_device_for_this_scope(device_id_); - return cuda::memory::managed::detail_::allocate(size_in_bytes, initial_visibility); - } + using context_setter_type = context::current::detail_::scoped_override_t; + // Note the context setter only affects the _currency_ of a context, not the + // activity of a primary context - /** - * Amount of total global memory on the CUDA device. - */ - size_t amount_total() const - { - scoped_setter_t set_device_for_this_scope(device_id_); - size_t total_mem_in_bytes; - auto status = cudaMemGetInfo(nullptr, &total_mem_in_bytes); - throw_if_error(status, "Failed determining amount of total memory for " + device::detail_::identify(device_id_)); - return total_mem_in_bytes; - } +protected: // constants + enum : bool { scoped_ = true, unscoped_ = false }; - /** - * Amount of memory on the CUDA device which is free and may be - * allocated for other uses. - * - * @note No guarantee of this free memory being contigous. 
- */ - size_t amount_free() const - { - scoped_setter_t set_device_for_this_scope(device_id_); - size_t free_mem_in_bytes; - auto status = cudaMemGetInfo(&free_mem_in_bytes, nullptr); - throw_if_error(status, "Failed determining amount of free memory for CUDA " + device::detail_::identify(device_id_)); - return free_mem_in_bytes; - } - }; // class global_memory_t +public: /** * @brief Determine whether this device can access the global memory @@ -256,10 +145,12 @@ class device_t { */ bool can_access(device_t peer) const { + context_setter_type set_for_this_scope(primary_context_handle()); int result; - auto status = cudaDeviceCanAccessPeer(&result, id(), peer.id()); - throw_if_error(status, "Failed determining whether CUDA device " + device::detail_::identify(id()) + " can access CUDA device " - + device::detail_::identify(peer.id())); + auto status = cuDeviceCanAccessPeer(&result, id(), peer.id()); + throw_if_error(status, "Failed determining whether " + + device::detail_::identify(id_) + " can access " + + device::detail_::identify(peer.id_)); return (result == 1); } @@ -268,14 +159,9 @@ class device_t { * * @param peer the device to which to enable access */ - void enable_access_to(device_t peer) const + void enable_access_to(const device_t& peer) const { - enum : unsigned {fixed_flags = 0 }; - // No flags are supported as of CUDA 8.0 - scoped_setter_t set_device_for_this_scope(id()); - auto status = cudaDeviceEnablePeerAccess(peer.id(), fixed_flags); - throw_if_error(status, - "Failed enabling access of " + device::detail_::identify(id()) + " to " + ::std::to_string(peer.id())); + primary_context(scoped_).enable_access_to(peer.primary_context(scoped_)); } /** @@ -283,48 +169,69 @@ class device_t { * * @param peer the device to which to disable access */ - void disable_access_to(device_t peer) const + void disable_access_to(const device_t& peer) const { - scoped_setter_t set_device_for_this_scope(id()); - auto status = cudaDeviceDisablePeerAccess(peer.id()); - throw_if_error(status, - "Failed disabling access of device " + ::std::to_string(id()) + " to device " + ::std::to_string(peer.id())); + primary_context(scoped_).disable_access_to(peer.primary_context(scoped_)); } -protected: - void set_flags(flags_t new_flags) const - { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaSetDeviceFlags(new_flags); - throw_if_error(status, "Failed setting the flags for " + device::detail_::identify(id_)); - } - void set_flags( - host_thread_synch_scheduling_policy_t synch_scheduling_policy, - bool keep_larger_local_mem_after_resize) const - { - set_flags( (flags_t) - synch_scheduling_policy // this enum value is also a valid bitmask - | (keep_larger_local_mem_after_resize ? cudaDeviceLmemResizeToMax : 0)); + uuid_t uuid () const { + uuid_t result; + auto status = cuDeviceGetUuid(&result, id_); + throw_if_error(status, "Failed obtaining UUID for " + device::detail_::identify(id_)); + return result; } - flags_t flags() const - { - scoped_setter_t set_device_for_this_scope(id_); - flags_t flags; - auto status = cudaGetDeviceFlags(&flags); - throw_if_error(status, "Failed obtaining the flags for " + device::detail_::identify(id_)); - return flags; +protected: + void cache_and_ensure_primary_context_activation() const { + if (primary_context_handle_ == context::detail_::none) { + primary_context_handle_ = device::primary_context::detail_::obtain_and_increase_refcount(id_); + // The refcount should now be non-zero until we destruct this device_t! 
+ } } + context::handle_t primary_context_handle() const + { + cache_and_ensure_primary_context_activation(); + return primary_context_handle_; + } + + +public: + /** + * Produce a proxy for the device's primary context - the one used by runtime API calls. + * + * @param scoped When true, the primary proxy object returned will not perform its + * own reference accounting, and will assume the primary context is active while + * this device object exists. When false, the returned primary context proxy object + * _will_ take care of its own reference count unit, and can outlive this object. + */ + device::primary_context_t primary_context(bool scoped) const; + + public: /** - * @brief Obtains a proxy for the device's global memory + * Produce a proxy for the device's primary context - the one used by runtime API calls. + * + * @note The CUDA driver reference counting for the primary scope is "taken core of" + * with this call, i.e. the caller does not need to add/decrease the refcount, and the + * object can safely outlive the device_t proxy object which created it. */ - global_memory_t memory() const { return global_memory_t{ id_ }; }; + device::primary_context_t primary_context() const { return primary_context(unscoped_); } + + void set_flags(flags_type new_flags) const + { + cache_and_ensure_primary_context_activation(); + auto status = cuDevicePrimaryCtxSetFlags(id(), new_flags); + throw_if_error(status, "Failed setting the flags for " + device::detail_::identify(id_)); + } + +public: /** * Obtains the (mostly) non-numeric properties for this device. + * + * @todo get rid of this in favor of individual properties only. */ properties_t properties() const { @@ -334,8 +241,7 @@ class device_t { return properties; } - static device_t choose_best_match(const properties_t& properties) - { + static device_t choose_best_match(const properties_t& properties) { device::id_t id; auto status = cudaChooseDevice(&id, &properties); throw_if_error(status, "Failed choosing a best matching device by a a property set."); @@ -347,10 +253,24 @@ class device_t { */ ::std::string name() const { - // I could get the name directly, but that would require - // direct use of the driver, and I'm not ready for that - // just yet - return properties().name; + // If I were lazy, I would just write: + // return properties().name; + // and let you wait for all of that to get populated. But not me! + return cuda::device::detail_::get_name(id_); + } + + /** + * Obtain a numeric-value attribute of the device + * + * @note See @ref device::attribute_t for explanation about attributes, + * properties and flags. 
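// Illustrative sketch (editorial addition, not from the patched sources), combining the
// peer-access methods and the primary-context accessors above: enable bidirectional peer
// access when possible, then obtain a non-scoped primary context proxy, which - per the
// note above - manages its own reference-count unit and may outlive this device_t.
#include <cuda/api.hpp>

void peer_access_and_primary_context(cuda::device_t& first, cuda::device_t& second)
{
	if (first.can_access(second) and second.can_access(first)) {
		first.enable_access_to(second);
		second.enable_access_to(first);
	}
	auto primary = first.primary_context(); // owns its own refcount unit
	(void) primary;
}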
+ */ + attribute_value_t get_attribute(device::attribute_t attribute) const + { + attribute_value_t attribute_value; + auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_); + throw_if_error(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); + return attribute_value; } /** @@ -359,44 +279,52 @@ class device_t { */ device::pci_location_t pci_id() const { - auto pci_domain_id = get_attribute(cudaDevAttrPciDomainId); - auto pci_bus_id = get_attribute(cudaDevAttrPciBusId); - auto pci_device_id = get_attribute(cudaDevAttrPciDeviceId); + auto pci_domain_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID); + auto pci_bus_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID); + auto pci_device_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID); return {pci_domain_id, pci_bus_id, pci_device_id}; } + device::multiprocessor_count_t multiprocessor_count() const + { + return get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); + } + /** - * Obtains the device's hardware architecture generation numeric - * designator see @ref cuda::device::compute_architecture_t + * True if the device supports the facilities under namespace @ref memory::virtual_ + * including the separation of memory allocation from address range mapping, and + * the possibility of changing mapping after allocation. */ - device::compute_architecture_t architecture() const + bool supports_virtual_memory_management() const { - unsigned major = get_attribute(cudaDevAttrComputeCapabilityMajor); - return {major}; + return get_attribute( +#if CUDA_VERSION >= 11030 + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED +#else + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#endif + ); + } /** - * Obtains the device's compute capability; see @ref cuda::device::compute_capability_t + * Obtains the device's hardware architecture generation numeric + * designator see @ref cuda::device::compute_architecture_t */ - device::compute_capability_t compute_capability() const + device::compute_architecture_t architecture() const { - auto arch = architecture(); - unsigned minor = get_attribute(cudaDevAttrComputeCapabilityMinor); - return {arch, minor}; + unsigned major = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + return { major }; } /** - * Obtain a numeric-value attribute of the device - * - * @note See @ref device::attribute_t for explanation about attributes, - * properties and flags. 
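// Illustrative sketch (editorial addition, not from the patched sources): querying
// driver-API device attributes through the wrapper methods defined above. As the
// cuDeviceGetAttribute() call above implies, a raw CUdevice_attribute enumerator is
// passed to get_attribute().
#include <cuda/api.hpp>
#include <iostream>

void print_device_info(const cuda::device_t& device)
{
	std::cout
		<< device.name() << ": "
		<< device.multiprocessor_count() << " multiprocessors, "
		<< device.get_attribute(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) << " async engines\n";
}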
+ * Obtains the device's compute capability; see @ref cuda::device::compute_capability_t */ - attribute_value_t get_attribute(device::attribute_t attribute) const + device::compute_capability_t compute_capability() const { - attribute_value_t attribute_value; - auto ret = cudaDeviceGetAttribute(&attribute_value, attribute, id()); - throw_if_error(ret, "Failed obtaining device properties for " + device::detail_::identify(id_)); - return attribute_value; + auto major = architecture(); + unsigned minor = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return {major, minor}; } /** @@ -405,7 +333,7 @@ class device_t { */ bool supports_concurrent_managed_access() const { - return get_attribute(cudaDevAttrConcurrentManagedAccess); + return (get_attribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) != 0); } /** @@ -414,7 +342,7 @@ class device_t { */ bool supports_block_cooperation() const { - return get_attribute(cudaDevAttrCooperativeLaunch); + return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH); } /** @@ -423,22 +351,18 @@ class device_t { * * @param resource which resource's limit to obtain */ - limit_t get_limit(resource_id_t resource) const + device::limit_value_t get_limit(device::limit_t limit) const { - limit_t limit; - auto status = cudaDeviceGetLimit(&limit, resource); - throw_if_error(status, "Failed obtaining a resource limit for " + device::detail_::identify(id_)); - return limit; + return primary_context(scoped_).get_limit(limit); } /** * Set the upper limit of one of the named numeric resources * on this device */ - void set_limit(resource_id_t resource, limit_t new_limit) const + void set_limit(device::limit_t limit, device::limit_value_t new_value) const { - auto status = cudaDeviceSetLimit(resource, new_limit); - throw_if_error(status, "Failed setting a resource limit for " + device::detail_::identify(id_)); + primary_context(scoped_).set_limit(limit, new_value); } /** @@ -450,9 +374,28 @@ class device_t { * until all tasks scheduled previously scheduled on this device have been * concluded. */ - void synchronize() const + const device_t& synchronize() const + { + cuda::synchronize(*this); + return *this; + } + + device_t& synchronize() { cuda::synchronize(*this); + return *this; + } + + const device_t& make_current() const + { + device::current::set(*this); + return *this; + } + + device_t& make_current() + { + device::current::set(*this); + return *this; } /** @@ -463,9 +406,20 @@ class device_t { */ void reset() const { - scoped_setter_t set_device_for_this_scope(id_); - status_t status = cudaDeviceReset(); - throw_if_error(status, "Resetting " + device::detail_::identify(id_)); + // Notes: + // + // 1. We _cannot_ use cuDevicePrimaryCtxReset() - because that one only affects + // the device's primary context, while cudaDeviceReset() destroys _all_ contexts for + // the device. + // 2. We don't need the primary context to be active here, so not using the usual + // primary_context_handle() getter mechanism. + + auto pc_handle = (primary_context_handle_ == context::detail_::none) ? 
+ device::primary_context::detail_::obtain_and_increase_refcount(id_) : + primary_context_handle_; + context_setter_type set_context_for_this_scope{pc_handle}; + auto status = cudaDeviceReset(); + throw_if_error(status, "Resetting " + device::detail_::identify(id_)); } /** @@ -476,10 +430,7 @@ class device_t { */ void set_cache_preference(multiprocessor_cache_preference_t preference) const { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaDeviceSetCacheConfig((cudaFuncCache) preference); - throw_if_error(status, - "Setting the multiprocessor L1/Shared Memory cache distribution preference for " + device::detail_::identify(id_)); + primary_context(scoped_).set_cache_preference(preference); } /** @@ -488,12 +439,7 @@ class device_t { */ multiprocessor_cache_preference_t cache_preference() const { - scoped_setter_t set_device_for_this_scope(id_); - cudaFuncCache raw_preference; - auto status = cudaDeviceGetCacheConfig(&raw_preference); - throw_if_error(status, - "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + device::detail_::identify(id_)); - return (multiprocessor_cache_preference_t) raw_preference; + return primary_context(scoped_).cache_preference(); } /** @@ -502,11 +448,9 @@ class device_t { * * @param new_bank_size the shared memory bank size to set, in bytes */ - void set_shared_memory_bank_size(memory::shared::bank_size_configuration_t new_bank_size) const + void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaDeviceSetSharedMemConfig(new_bank_size); - throw_if_error(status, "Setting the multiprocessor shared memory bank size for " + device::detail_::identify(id_)); + primary_context(scoped_).set_shared_memory_bank_size(new_bank_size); } /** @@ -515,13 +459,9 @@ class device_t { * * @return the shared memory bank size in bytes */ - memory::shared::bank_size_configuration_t shared_memory_bank_size() const + device::shared_memory_bank_size_t shared_memory_bank_size() const { - scoped_setter_t set_device_for_this_scope(id_); - memory::shared::bank_size_configuration_t bank_size; - auto status = cudaDeviceGetSharedMemConfig(&bank_size); - throw_if_error(status, "Obtaining the multiprocessor shared memory bank size for " + device::detail_::identify(id_)); - return bank_size; + return primary_context(scoped_).shared_memory_bank_size(); } // For some reason, there is no cudaFuncGetCacheConfig. Weird. @@ -534,21 +474,16 @@ class device_t { * Return the proxied device's ID * */ - device::id_t id() const + device::id_t id() const noexcept { return id_; } - stream_t default_stream() const noexcept; + stream_t default_stream() const; - // I'm a worried about the creation of streams with the assumption - // that theirs is the current device, so I'm just forbidding it - // outright here - even though it's very natural to want to write - // - // cuda::device::curent::get().create_stream() - // - // (sigh)... 
safety over convenience I guess - // + /** + * See @ref cuda::stream::create() + */ stream_t create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority = cuda::stream::default_priority) const; @@ -559,7 +494,11 @@ class device_t { event_t create_event( bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default bool records_timing = event::do_record_timings, - bool interprocess = event::not_interprocess) const; + bool interprocess = event::not_interprocess); + + context_t create_context( + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy = context::heuristic, + bool keep_larger_local_mem_after_resize = false) const; template void launch( @@ -576,97 +515,101 @@ class device_t { */ device::stream_priority_range_t stream_priority_range() const { - scoped_setter_t set_device_for_this_scope(id_); - stream::priority_t least, greatest; - auto status = cudaDeviceGetStreamPriorityRange(&least, &greatest); - throw_if_error(status, "Failed obtaining stream priority range for " + device::detail_::identify(id_)); - return {least, greatest}; + return primary_context(scoped_).stream_priority_range(); } public: - host_thread_synch_scheduling_policy_t synch_scheduling_policy() const + // TODO: Make the primary context do this (when that's even possible) + + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy() const { - return (host_thread_synch_scheduling_policy_t) (flags() & cudaDeviceScheduleMask); + return primary_context(scoped_).synch_scheduling_policy(); } - void set_synch_scheduling_policy(host_thread_synch_scheduling_policy_t new_policy) const + void set_synch_scheduling_policy(context::host_thread_synch_scheduling_policy_t new_policy) { - auto other_flags = flags() & ~cudaDeviceScheduleMask; - set_flags(other_flags | (flags_t) new_policy); + primary_context().set_synch_scheduling_policy(new_policy); } bool keeping_larger_local_mem_after_resize() const { - return flags() & cudaDeviceLmemResizeToMax; + return primary_context().keeping_larger_local_mem_after_resize(); } - void keep_larger_local_mem_after_resize(bool keep = true) const + void keep_larger_local_mem_after_resize(bool keep = true) { - auto flags_ = flags(); - if (keep) { - flags_ |= cudaDeviceLmemResizeToMax; - } else { - flags_ &= ~cudaDeviceLmemResizeToMax; - } - set_flags(flags_); + primary_context().keep_larger_local_mem_after_resize(keep); } - void dont_keep_larger_local_mem_after_resize() const + void dont_keep_larger_local_mem_after_resize() { - keep_larger_local_mem_after_resize(false); + primary_context().keep_larger_local_mem_after_resize(false); } -public: - /** - * @brief Makes this device the CUDA Runtime API's current device - * - * @note a non-current device becoming current will not stop its methods from - * always expressly setting the current device before doing anything(!) 
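// Illustrative sketch (editorial addition, not from the patched sources): creating a stream
// and an event via the factory methods declared above, then waiting for the device. Only
// methods visible in this patch are used; the actual work enqueued on the stream is elided.
#include <cuda/api.hpp>

void stream_and_event_example(cuda::device_t& device)
{
	auto stream = device.create_stream(false /* will_synchronize_with_default_stream */);
	auto event  = device.create_event(cuda::event::sync_by_busy_waiting);
	// ... enqueue kernel launches and copies on `stream` here ...
	event.record();       // recorded on the device's default stream (see event.hpp below)
	device.synchronize(); // wait for all streams on the device, including `stream`
}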
- */ - device_t& make_current() +protected: + void maybe_decrease_primary_context_refcount() const { - device::current::detail_::set(id()); - return *this; + if (primary_context_handle_ != context::detail_::none) { + device::primary_context::detail_::decrease_refcount(id_); + } } - const device_t& make_current() const - { - device::current::detail_::set(id()); - return *this; - } - public: // constructors and destructor - ~device_t() noexcept = default; - device_t(device_t&& other) noexcept = default; - device_t(const device_t& other) noexcept = default; + friend void swap(device_t& lhs, device_t& rhs) noexcept + { + ::std::swap(lhs.id_, rhs.id_); + ::std::swap(lhs.primary_context_handle_, rhs.primary_context_handle_); + } + ~device_t() { maybe_decrease_primary_context_refcount(); } + device_t(device_t&& other) noexcept : id_(other.id_) + { + swap(*this, other); + } + + device_t(const device_t& other) noexcept : id_(other.id_) { } // Device proxies are not owning - as devices aren't allocated nor de-allocated. - // Also, the proxies don't hold any state - it's the devices _themselves_ which - // have state. ; so there's no problem copying the proxies around. This is - // unlike events and streams, which get created and destroyed. - device_t& operator=(const device_t& other) noexcept = default; - device_t& operator=(device_t&& other) noexcept = default; + // Also, the proxies don't hold any state (except for one bit regarding whether + // or not the device proxy has increased the primary context refcount); it's + // the devices _themselves_ which have state; so there's no problem copying + // the proxies around. This is unlike events and streams, which get created + // and destroyed. + + device_t& operator=(const device_t& other) noexcept + { + maybe_decrease_primary_context_refcount(); + id_ = other.id_; + primary_context_handle_ = other.primary_context_handle_; + return *this; + } + + device_t& operator=(device_t&& other) noexcept + { + swap(*this, other); + return *this; + } protected: // constructors /** - * @note Only @ref device::current::get() and @ref device::get() should be + * @note Only @ref device::detail_::wrap() and @ref device::get() should be * calling this one. */ - explicit device_t(device::id_t device_id) noexcept : id_( device_id ) { } + explicit device_t( + device::id_t device_id, + device::primary_context::handle_t primary_context_handle = context::detail_::none) noexcept + : id_(device_id), primary_context_handle_(primary_context_handle) { } public: // friends - friend device_t device::detail_::wrap(device::id_t) noexcept; - -protected: - // data members + friend device_t device::detail_::wrap(device::id_t, device::primary_context::handle_t handle) noexcept; - /** - * The numeric ID of the proxied device. - */ - device::id_t id_; +protected: // data members + device::id_t id_; /// Numeric ID of the proxied device. 
+ mutable device::primary_context::handle_t primary_context_handle_ { context::detail_::none }; + /// Most work involving a device actually occurs using its primary context; we cache the handle + /// to this context here - albeit not necessary on construction }; ///@cond @@ -685,12 +628,13 @@ namespace device { namespace detail_ { -inline device_t wrap(id_t id) noexcept +inline device_t wrap(id_t id, primary_context::handle_t primary_context_handle) noexcept { - return device_t{ id }; + return device_t{ id, primary_context_handle }; } } // namespace detail_ + /** * Returns a proxy for the CUDA device with a given id * @@ -698,21 +642,36 @@ inline device_t wrap(id_t id) noexcept * @note direct constructor access is blocked so that you don't get the * idea you're actually creating devices */ -inline device_t get(id_t device_id) noexcept +inline device_t get(id_t id) noexcept { - return detail_::wrap(device_id); + ensure_driver_is_initialized(); // The device_t class mostly assumes the driver has been initialized + return detail_::wrap(id); } +/** + * A named constructor idiom for a "dummy" CUDA device representing the CPU. + * + * @note Only use this idiom when comparing the results of functions returning + * locations, which can be either a GPU device or the CPU; any other use will likely + * result in a runtime error being thrown. + */ +inline device_t cpu() { return get(CU_DEVICE_CPU); } + namespace current { /** - * Returns the current device in a wrapper which assumes it is indeed - * current, i.e. which will not set the current device before performing any - * other actions. + * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. */ -inline device_t get() { return device::detail_::wrap(detail_::get_id()); } +inline device_t get() +{ + ensure_driver_is_initialized(); + return device::get(detail_::get_id()); +} -inline void set(device_t device) { detail_::set(device.id()); } +/** + * Tells the CUDA runtime API to consider the specified device as the current one. + */ +inline void set(const device_t& device) { detail_::set(device.id()); } } // namespace current @@ -747,15 +706,6 @@ inline device_t get(const ::std::string& pci_id_str) } // namespace device -inline void synchronize(device_t device) -{ - auto device_id = device.id(); - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - auto status = cudaDeviceSynchronize(); - throw_if_error(status, "Failed synchronizing " + ::std::to_string(device_id)); -} - - } // namespace cuda #endif // CUDA_API_WRAPPERS_DEVICE_HPP_ diff --git a/src/cuda/api/device_properties.hpp b/src/cuda/api/device_properties.hpp index 90e6909f..92d708b3 100644 --- a/src/cuda/api/device_properties.hpp +++ b/src/cuda/api/device_properties.hpp @@ -11,7 +11,8 @@ #include #include -#include + +#include #include @@ -33,6 +34,11 @@ namespace cuda { namespace device { +/** + * Type of the number of mutiprocessors within a single GPU. 
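// Illustrative sketch (editorial addition, not from the patched sources): the ways of
// obtaining a device_t shown in device.hpp above - by index, by PCI id string, and via
// the current-device mechanism. The PCI id string used here is purely hypothetical.
#include <cuda/api.hpp>
#include <string>

void device_lookup_examples()
{
	auto by_index  = cuda::device::get(0);
	auto by_pci_id = cuda::device::get(::std::string("0000:01:00.0")); // hypothetical PCI id
	by_index.make_current();
	auto current = cuda::device::current::get();
	(void) by_pci_id; (void) current;
}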
+ */ +using multiprocessor_count_t = int; + /** * A numeric designator of an architectural generation of CUDA devices * @@ -96,8 +102,6 @@ struct compute_capability_t { * setting */ memory::shared::size_t max_shared_memory_per_block() const; - - }; /** diff --git a/src/cuda/api/devices.hpp b/src/cuda/api/devices.hpp index 2b1da1b8..30a1c3d0 100644 --- a/src/cuda/api/devices.hpp +++ b/src/cuda/api/devices.hpp @@ -24,7 +24,7 @@ class all_devices { using reference = value_type; // device_t is already a reference type; and there is no instance-of-device_t here to reference using const_reference = const value_type; // ditto using size_type = decltype(device::count()); - using difference_type = typename std::make_signed::type; + using difference_type = typename ::std::make_signed::type; class index_based_iterator { public: @@ -121,6 +121,11 @@ class all_devices { return index_based_iterator(num_devices_, index_ - n); } + difference_type operator-(const index_based_iterator& other) const + { + return this->index_ - other.index_; + } + size_type index() const { return index_; } size_type num_devices() const { return num_devices_; } diff --git a/src/cuda/api/error.hpp b/src/cuda/api/error.hpp index 2f128085..02e32ddf 100644 --- a/src/cuda/api/error.hpp +++ b/src/cuda/api/error.hpp @@ -1,17 +1,26 @@ /** * @file error.hpp * - * @brief Facilities for exception-based handling of Runtime API - * errors, including a basic exception class wrapping - * `::std::runtime_error`. + * @brief Facilities for exception-based handling of Runtime + * and Driver API errors, including a basic exception class + * wrapping `::std::runtime_error`. + * + * @note Does not - for now - support wrapping errors generated + * by other CUDA-related libraries like NVRTC. + * + * @note Unlike the Runtime API, the driver API has no memory + * of "non-sticky" errors, which do not corrupt the current + * context. 
+ * */ #pragma once #ifndef CUDA_API_WRAPPERS_ERROR_HPP_ #define CUDA_API_WRAPPERS_ERROR_HPP_ -#include - +#include #include +#include + #include #include #include @@ -28,8 +37,13 @@ namespace status { enum named_t : ::std::underlying_type::type { success = cudaSuccess, missing_configuration = cudaErrorMissingConfiguration, - memory_allocation = cudaErrorMemoryAllocation, - initialization_error = cudaErrorInitializationError, + memory_allocation_failure = cudaErrorMemoryAllocation, // == CUDA_ERROR_OUT_OF_MEMORY + initialization_error = cudaErrorInitializationError, // == CUDA_ERROR_NOT_INITIALIZED + already_deinitialized = cudaErrorCudartUnloading, // == CUDA_ERROR_DEINITIALIZED + profiler_disabled = cudaErrorProfilerDisabled, + profiler_not_initialized = cudaErrorProfilerNotInitialized, + profiler_already_started = cudaErrorProfilerAlreadyStarted, + profiler_already_stopped = cudaErrorProfilerAlreadyStopped, launch_failure = cudaErrorLaunchFailure, prior_launch_failure = cudaErrorPriorLaunchFailure, launch_timeout = cudaErrorLaunchTimeout, @@ -55,7 +69,6 @@ enum named_t : ::std::underlying_type::type { invalid_filter_setting = cudaErrorInvalidFilterSetting, invalid_norm_setting = cudaErrorInvalidNormSetting, mixed_device_execution = cudaErrorMixedDeviceExecution, - cuda_runtime_unloading = cudaErrorCudartUnloading, unknown = cudaErrorUnknown, not_yet_implemented = cudaErrorNotYetImplemented, memory_value_too_large = cudaErrorMemoryValueTooLarge, @@ -64,7 +77,7 @@ enum named_t : ::std::underlying_type::type { insufficient_driver = cudaErrorInsufficientDriver, set_on_active_process = cudaErrorSetOnActiveProcess, invalid_surface = cudaErrorInvalidSurface, - no_device = cudaErrorNoDevice, + no_device = cudaErrorNoDevice, // == 100 ecc_uncorrectable = cudaErrorECCUncorrectable, shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound, shared_object_init_failed = cudaErrorSharedObjectInitFailed, @@ -76,13 +89,11 @@ enum named_t : ::std::underlying_type::type { invalid_kernel_image = cudaErrorInvalidKernelImage, no_kernel_image_for_device = cudaErrorNoKernelImageForDevice, incompatible_driver_context = cudaErrorIncompatibleDriverContext, + invalid_context = CUDA_ERROR_INVALID_CONTEXT, + context_already_current = CUDA_ERROR_CONTEXT_ALREADY_CURRENT, peer_access_already_enabled = cudaErrorPeerAccessAlreadyEnabled, peer_access_not_enabled = cudaErrorPeerAccessNotEnabled, device_already_in_use = cudaErrorDeviceAlreadyInUse, - profiler_disabled = cudaErrorProfilerDisabled, - profiler_not_initialized = cudaErrorProfilerNotInitialized, - profiler_already_started = cudaErrorProfilerAlreadyStarted, - profiler_already_stopped = cudaErrorProfilerAlreadyStopped, assert = cudaErrorAssert, too_many_peers = cudaErrorTooManyPeers, host_memory_already_registered = cudaErrorHostMemoryAlreadyRegistered, @@ -126,18 +137,27 @@ constexpr inline bool is_success(status_t status) { return status == (status_t) /** * @brief Determine whether the API call returning the specified status had failed */ -constexpr inline bool is_failure(status_t status) { return status != (status_t) status::success; } +constexpr inline bool is_failure(status_t status) { return not is_success(status); } /** * Obtain a brief textual explanation for a specified kind of CUDA Runtime API status * or error code. 
*/ -inline ::std::string describe(status_t status) { return cudaGetErrorString(status); } +///@{ +inline ::std::string describe(status_t status) +{ + const char* description; + auto description_lookup_status = cuGetErrorString(status, &description); + return (description_lookup_status != CUDA_SUCCESS) ? nullptr : description; +} +inline ::std::string describe(cudaError_t status) { return cudaGetErrorString(status); } +///@} + namespace detail_ { template -::std::string as_hex(I x) +std::string as_hex(I x) { static_assert(::std::is_unsigned::value, "only signed representations are supported"); unsigned num_hex_digits = 2*sizeof(I); @@ -162,7 +182,7 @@ ::std::string as_hex(I x) template inline ::std::string ptr_as_hex(const I* ptr) { - return as_hex((size_t) ptr); + return as_hex(reinterpret_cast(ptr)); } } // namespace detail_ @@ -173,6 +193,8 @@ inline ::std::string ptr_as_hex(const I* ptr) * * A CUDA runtime error can be constructed with either just a CUDA error code * (=status code), or a code plus an additional message. + * + * @todo Consider renaming this to avoid confusion with the CUDA Runtime. */ class runtime_error : public ::std::runtime_error { public: @@ -191,11 +213,11 @@ class runtime_error : public ::std::runtime_error { runtime_error(error_code, what_arg) { } ///@endcond - runtime_error(cuda::status::named_t error_code) : + explicit runtime_error(status::named_t error_code) : runtime_error(static_cast(error_code)) { } - runtime_error(cuda::status::named_t error_code, const ::std::string& what_arg) : + runtime_error(status::named_t error_code, const ::std::string& what_arg) : runtime_error(static_cast(error_code), what_arg) { } - runtime_error(cuda::status::named_t error_code, ::std::string&& what_arg) : + runtime_error(status::named_t error_code, ::std::string&& what_arg) : runtime_error(static_cast(error_code), what_arg) { } /** @@ -207,7 +229,7 @@ class runtime_error : public ::std::runtime_error { status_t code_; }; -// TODO: The following could use ::std::optiomal arguments - which would +// TODO: The following could use ::std::optional arguments - which would // prevent the need for dual versions of the functions - but we're // not writing C++17 here @@ -223,11 +245,21 @@ inline void throw_if_error(status_t status, const ::std::string& message) noexce if (is_failure(status)) { throw runtime_error(status, message); } } +inline void throw_if_error(cudaError_t status, const ::std::string& message) noexcept(false) +{ + throw_if_error(static_cast(status), message); +} + inline void throw_if_error(status_t status, ::std::string&& message) noexcept(false) { if (is_failure(status)) { throw runtime_error(status, message); } } +inline void throw_if_error(cudaError_t status, ::std::string&& message) noexcept(false) +{ + return throw_if_error(static_cast(status), message); +} + /** * Does nothing - unless the status indicates an error, in which case * a @ref cuda::runtime_error exception is thrown @@ -239,44 +271,73 @@ inline void throw_if_error(status_t status) noexcept(false) if (is_failure(status)) { throw runtime_error(status); } } +inline void throw_if_error(cudaError_t status) noexcept(false) +{ + throw_if_error(static_cast(status)); +} + enum : bool { dont_clear_errors = false, do_clear_errors = true }; -namespace outstanding_error { +namespace detail_ { + +namespace outstanding_runtime_error { /** - * Reset the CUDA status to @ref cuda::status::success. + * Clears the current CUDA context's status and return any outstanding error. 
+ * + * @todo Reconsider what this does w.r.t. driver calls */ -inline status_t clear() noexcept { return cudaGetLastError(); } +inline status_t clear() noexcept +{ + return static_cast(cudaGetLastError()); +} /** * Get the code of the last error in a CUDA-related action. + * + * @todo Reconsider what this does w.r.t. driver calls + */ +inline status_t get() noexcept +{ + return static_cast(cudaPeekAtLastError()); +} + +} // namespace outstanding_runtime_error +} // namespace detail_ + +/** + * Unlike the Runtime API, where every error is outstanding + * until cleared, the Driver API, which we use mostly, only + * remembers "sticky" errors - severe errors which corrupt + * contexts. Such errors cannot be recovered from / cleared, + * and require either context destruction or process termination. + */ +namespace outstanding_error { + +/** + * @return the code of a sticky (= context-corrupting) error, + * if the CUDA driver has recently encountered any. */ -inline status_t get() noexcept { return cudaPeekAtLastError(); } +inline status_t get() +{ + constexpr const unsigned dummy_flags{0}; + auto status = cuInit(dummy_flags); + return static_cast(status); +} /** * @brief Does nothing (unless throwing an exception) * - * @note similar to @ref cuda::throw_if_error, but uses the CUDA Runtime API's internal - * state - * - * @throws cuda::runtime_error if the CUDA runtime API has - * encountered previously encountered an (uncleared) error - * - * @param message Additional message to incldue in the exception thrown - * @param clear_any_error When true, clears the CUDA Runtime API's state from - * recalling errors arising from before this moment - * - * + * @note similar to @ref cuda::throw_if_error, but uses the CUDA driver's + * own state regarding whether or not a sticky error has occurred */ -inline void ensure_none( - ::std::string message, - bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none(const ::std::string &message) noexcept(false) { - auto last_status = clear_any_error ? clear() : get(); - throw_if_error(last_status, message); + auto status = get(); + throw_if_error(status, message); } /** @@ -286,11 +347,9 @@ inline void ensure_none( * @note exists so as to avoid incorrect overload resolution of * `ensure_none(my_c_string)` calls. */ -inline void ensure_none( - const char* message, - bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none(const char *message) noexcept(false) { - return ensure_none(::std::string(message), clear_any_error); + return ensure_none(::std::string{message}); } /** @@ -305,56 +364,156 @@ inline void ensure_none( * @param clear_any_error When true, clears the CUDA Runtime API's state from * recalling errors arising from before this oment */ -inline void ensure_none(bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none() noexcept(false) { - auto last_status = clear_any_error ? clear() : get(); - throw_if_error(last_status); + auto status = get(); + throw_if_error(status); } } // namespace outstanding_error +// The following few functions are used in the error messages +// generated for exceptions thrown by various API wrappers. 
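// Illustrative sketch (editorial addition, not from the patched sources): typical use of
// the facilities above - wrapping a raw driver call with throw_if_error, and checking for
// sticky (context-corrupting) errors at a recovery point.
#include <cuda/api.hpp>
#include <cstddef>
#include <string>

void error_handling_example(CUdeviceptr destination, const void* source, ::std::size_t num_bytes)
{
	auto status = cuMemcpyHtoD(destination, source, num_bytes);
	cuda::throw_if_error(status, "Copying " + ::std::to_string(num_bytes) + " bytes to device memory");

	// Elsewhere, e.g. after catching and handling an exception:
	cuda::outstanding_error::ensure_none("Cannot proceed with a corrupted context");
}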
+ namespace device { namespace detail_ { +inline ::std::string identify(device::id_t device_id) +{ + return ::std::string("device ") + ::std::to_string(device_id); +} +} // namespace detail_ +} // namespace device -inline ::std::string identify(id_t id) +namespace context { +namespace detail_ { + +inline ::std::string identify(handle_t handle) { - return ::std::string("device ") + std::to_string(id); + return "context " + cuda::detail_::ptr_as_hex(handle); } -} // namespace detail -} // namespace device +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} -namespace event { +} // namespace detail_ + +namespace current { namespace detail_ { +inline ::std::string identify(context::handle_t handle) +{ + return "current context: " + context::detail_::identify(handle); +} +inline ::std::string identify(context::handle_t handle, device::id_t device_id) +{ + return "current context: " + context::detail_::identify(handle, device_id); +} +} // namespace detail_ +} // namespace current + +} // namespace context -inline ::std::string identify(event::handle_t handle) +namespace device { +namespace primary_context { +namespace detail_ { + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return "context " + context::detail_::identify(handle, device_id); +} +inline ::std::string identify(handle_t handle) { - return ::std::string("event ") + cuda::detail_::ptr_as_hex(handle); + return "context " + context::detail_::identify(handle); } +} // namespace detail_ +} // namespace primary_context +} // namespace device +namespace stream { +namespace detail_ { +inline ::std::string identify(handle_t handle) +{ + return "stream " + cuda::detail_::ptr_as_hex(handle); +} inline ::std::string identify(handle_t handle, device::id_t device_id) { return identify(handle) + " on " + device::detail_::identify(device_id); } +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} +} // namespace detail_ +} // namespace stream +namespace event { +namespace detail_ { +inline ::std::string identify(handle_t handle) +{ + return "event " + cuda::detail_::ptr_as_hex(handle); +} +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} } // namespace detail_ } // namespace event -namespace stream { +namespace kernel { namespace detail_ { -inline ::std::string identify(stream::handle_t handle) +inline ::std::string identify(const void* ptr) +{ + return "kernel " + cuda::detail_::ptr_as_hex(ptr); +} +inline ::std::string identify(const void* ptr, context::handle_t context_handle) +{ + return identify(ptr) + " in " + context::detail_::identify(context_handle); +} +inline ::std::string identify(const void* ptr, context::handle_t
context_handle, device::id_t device_id) +{ + return identify(ptr) + " in " + context::detail_::identify(context_handle, device_id); +} +inline ::std::string identify(handle_t handle) +{ + return "kernel " + cuda::detail_::ptr_as_hex(handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle) { - return ::std::string("stream ") + cuda::detail_::ptr_as_hex(handle); + return identify(handle) + " in " + context::detail_::identify(context_handle); } +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} + +} // namespace detail +} // namespace kernel -inline ::std::string identify(stream::handle_t handle, device::id_t device_id) +namespace memory { +namespace detail_ { + +inline ::std::string identify(region_t region) { - return identify(handle) + " on " + device::detail_::identify(device_id); + return ::std::string("memory region at ") + cuda::detail_::ptr_as_hex(region.data()) + + " of size " + ::std::to_string(region.size()); } } // namespace detail_ -} // namespace stream +} // namespace memory } // namespace cuda diff --git a/src/cuda/api/event.hpp b/src/cuda/api/event.hpp index 1cd04247..8fa51048 100644 --- a/src/cuda/api/event.hpp +++ b/src/cuda/api/event.hpp @@ -9,15 +9,15 @@ #ifndef CUDA_API_WRAPPERS_EVENT_HPP_ #define CUDA_API_WRAPPERS_EVENT_HPP_ -#include -#include -#include -#include -#include +#include #include #include // for duration types +#include +#include +#include +#include namespace cuda { @@ -38,18 +38,18 @@ namespace detail_ { * @param event_handle Event to be made to occur on stream @ref stream_handle */ inline void enqueue(stream::handle_t stream_handle, handle_t event_handle) { - auto status = cudaEventRecord(event_handle, stream_handle); + auto status = cuEventRecord(event_handle, stream_handle); cuda::throw_if_error(status, - "Failed recording event " + event::detail_::identify(event_handle) + "Failed recording " + event::detail_::identify(event_handle) + " on " + stream::detail_::identify(stream_handle)); } constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timing, bool interprocess) { return - ( uses_blocking_sync ? cudaEventBlockingSync : 0 ) - | ( records_timing ? 0 : cudaEventDisableTiming ) - | ( interprocess ? cudaEventInterprocess : 0 ); + ( uses_blocking_sync ? CU_EVENT_BLOCKING_SYNC : 0 ) + | ( records_timing ? 0 : CU_EVENT_DISABLE_TIMING ) + | ( interprocess ? CU_EVENT_INTERPROCESS : 0 ); } } // namespace detail_ @@ -63,11 +63,12 @@ class event_t; namespace event { namespace detail_ { + /** * @brief Wrap an existing CUDA event in a @ref event_t instance. * - * @param device_id ID of the device for which the stream is defined - * @param event_handle handle for the pre-existing event + * @param context_handle Handle of the context in which this event was created + * @param event_handle handle of the pre-existing event * @param take_ownership When set to `false`, the CUDA event * will not be destroyed along with proxy; use this setting * when temporarily working with a stream existing irrespective of @@ -77,9 +78,13 @@ namespace detail_ { * @return The constructed `cuda::event_t`. 
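// For orientation (editor's sketch, not part of the patch): the raw driver-API sequence which
// event_t and the wrap()/create_in_current_context() helpers in this file encapsulate. Error
// checking and context handling are elided, and stream_handle is a placeholder for an existing
// CUstream:
CUevent start, stop;
cuEventCreate(&start, CU_EVENT_DEFAULT);
cuEventCreate(&stop, CU_EVENT_DEFAULT);          // timing is enabled unless CU_EVENT_DISABLE_TIMING is passed
cuEventRecord(start, stream_handle);             // fires once previously-enqueued work on the stream completes
// ... enqueue the work to be timed on stream_handle ...
cuEventRecord(stop, stream_handle);
cuEventSynchronize(stop);                        // block the calling thread until `stop` has occurred
float elapsed_milliseconds = 0.0f;
cuEventElapsedTime(&elapsed_milliseconds, start, stop);  // what time_elapsed_between() wraps further down
cuEventDestroy(start);
cuEventDestroy(stop);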
*/ event_t wrap( - device::id_t device_id, - handle_t event_handle, - bool take_ownership = false) noexcept; + device::id_t device_id, + context::handle_t context_handle, + handle_t event_handle, + bool take_ownership = false) noexcept; + +::std::string identify(const event_t& event); + } // namespace detail_ @@ -88,7 +93,7 @@ event_t wrap( inline void synchronize(const event_t& event); /** - * @brief Proxy class for a CUDA event + * @brief Wrapper class for a CUDA event * * Use this class - built around an event handle - to perform almost, if not all, * event-related operations the CUDA Runtime API is capable of. @@ -100,31 +105,31 @@ inline void synchronize(const event_t& event); * * @note this is one of the three main classes in the Runtime API wrapper library, * together with @ref cuda::device_t and @ref cuda::stream_t + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the event is a const-respecting operation on this class. */ class event_t { + public: // data member non-mutator getters - /** - * The CUDA handle this object is wrapping - */ - event::handle_t handle() const noexcept{ return handle_; } + /// The raw CUDA ID for the device w.r.t. which the event is defined + device::id_t device_id() const noexcept { return device_id_; }; - /** - * ID of the device with which this event is associated (and on whose - * streams this event can be enqueued) - */ - device::id_t device_id() const noexcept { return device_id_; } + /// The raw CUDA handle for the context in which the represented stream is defined. + context::handle_t context_handle() const noexcept { return context_handle_; } + + /// The raw CUDA handle for this event + event::handle_t handle() const noexcept { return handle_; } + + /// True if this wrapper is responsible for telling CUDA to destroy the event upon the wrapper's own destruction + bool is_owning() const noexcept { return owning; } + + /// The device w.r.t. which the event is defined + device_t device() const; + + /// The context in which this stream was defined. + context_t context() const; - /** - * The device with which this event is associated (i.e. on whose stream - * this event can be enqueued) - */ - device_t device() const noexcept; - /** - * Is this wrapper responsible for having the CUDA Runtime API destroy - * the event when it destructs? - */ - bool is_owning() const noexcept { return owning; } public: // other non-mutator methods @@ -141,11 +146,11 @@ class event_t { */ bool has_occurred() const { - auto status = cudaEventQuery(handle_); + auto status = cuEventQuery(handle_); if (status == cuda::status::success) return true; if (status == cuda::status::not_ready) return false; throw cuda::runtime_error(status, - "Could not determine whether " + event::detail_::identify(handle_, device_id_) + "Could not determine whether " + event::detail_::identify(handle_) + "has already occurred or not."); } @@ -158,14 +163,13 @@ class event_t { public: // other mutator methods - /** * Schedule a specified event to occur (= to fire) when all activities * already scheduled on the event's device's default stream have concluded. * * @note No protection against repeated calls. */ - void record() + void record() const { event::detail_::enqueue(stream::default_stream_handle, handle_); } @@ -176,7 +180,7 @@ class event_t { * * @note No protection against repeated calls. 
*/ - void record(const stream_t& stream); + void record(const stream_t& stream) const; /** * Records the event and ensures it has occurred before returning @@ -184,39 +188,43 @@ class event_t { * * @note No protection against repeated calls. */ - void fire(const stream_t& stream); + void fire(const stream_t& stream) const; /** * Have the calling thread wait - either busy-waiting or blocking - and * return only after this event has occurred (see @ref has_occurred() ). */ - void synchronize() + void synchronize() const { return cuda::synchronize(*this); } protected: // constructors - event_t(device::id_t device_id, event::handle_t event_handle, bool take_ownership) noexcept - : device_id_(device_id), handle_(event_handle), owning(take_ownership) { } + event_t(device::id_t device_id, context::handle_t context_handle, event::handle_t event_handle, bool take_ownership) noexcept + : device_id_(device_id), context_handle_(context_handle), handle_(event_handle), owning(take_ownership) { } public: // friendship - friend event_t event::detail_::wrap(device::id_t device_id, event::handle_t event_handle, bool take_ownership) noexcept; + friend event_t event::detail_::wrap(device::id_t, context::handle_t context_handle, event::handle_t event_handle, bool take_ownership) noexcept; public: // constructors and destructor - event_t(const event_t& other) noexcept : event_t(other.device_id_, other.handle_, false) { } + event_t(const event_t& other) noexcept : event_t(other.device_id_, other.context_handle_, other.handle_, false) { } event_t(event_t&& other) noexcept : - event_t(other.device_id_, other.handle_, other.owning) + event_t(other.device_id_, other.context_handle_, other.handle_, other.owning) { other.owning = false; }; ~event_t() { - if (owning) { cudaEventDestroy(handle_); } + if (owning) { + cuEventDestroy(handle_); + // Note: "Swallowing" any potential error to avoid std::terminate(); also, + // because the context cannot possibly exist after this call. 
+ } } public: // operators @@ -225,9 +233,10 @@ class event_t { event_t& operator=(event_t&& other) = delete; protected: // data members - const device::id_t device_id_; - const event::handle_t handle_; - bool owning; + const device::id_t device_id_; + const context::handle_t context_handle_; + const event::handle_t handle_; + bool owning; // this field is mutable only for enabling move construction; other // than in that case it must not be altered }; @@ -253,7 +262,7 @@ using duration_t = ::std::chrono::duration; inline duration_t time_elapsed_between(const event_t& start, const event_t& end) { float elapsed_milliseconds; - auto status = cudaEventElapsedTime(&elapsed_milliseconds, start.handle(), end.handle()); + auto status = cuEventElapsedTime(&elapsed_milliseconds, start.handle(), end.handle()); cuda::throw_if_error(status, "determining the time elapsed between events"); return duration_t { elapsed_milliseconds }; } @@ -276,29 +285,37 @@ namespace detail_ { * @return an event wrapper associated with the specified event */ inline event_t wrap( - device::id_t device_id, - handle_t event_handle, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + handle_t event_handle, + bool take_ownership) noexcept +{ + return { device_id, context_handle, event_handle, take_ownership }; +} + +inline ::std::string identify(const event_t& event) { - return event_t(device_id, event_handle, take_ownership); + return identify(event.handle(), event.context_handle(), event.device_id()); } // Note: For now, event_t's need their device's ID - even if it's the current device; // that explains the requirement in this function's interface -inline event_t create_on_current_device( - device::id_t current_device_id, - bool uses_blocking_sync, - bool records_timing, - bool interprocess) +inline event_t create_in_current_context( + device::id_t current_device_id, + context::handle_t current_context_handle, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) { auto flags = make_flags(uses_blocking_sync, records_timing, interprocess); cuda::event::handle_t new_event_handle; - auto status = cudaEventCreateWithFlags(&new_event_handle, flags); + auto status = cuEventCreate(&new_event_handle, flags); cuda::throw_if_error(status, "failed creating a CUDA event associated with the current device"); // Note: We're trusting CUDA to actually have succeeded if it reports success, // so we're not checking the newly-created event handle - which is really just // a pointer - for nullness - return wrap(current_device_id, new_event_handle, do_take_ownership); + bool take_ownership = true; + return wrap(current_device_id, current_context_handle, new_event_handle, take_ownership); } /** @@ -306,14 +323,14 @@ inline event_t create_on_current_device( */ inline event_t create( - device::id_t device_id, - bool uses_blocking_sync, - bool records_timing, - bool interprocess) + device::id_t device_id, + context::handle_t context_handle, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) { - device::current::detail_::scoped_override_t - set_device_for_this_scope(device_id); - return detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return detail_::create_in_current_context(device_id, context_handle, uses_blocking_sync, records_timing, interprocess); } } // namespace detail_ @@ -330,10 +347,10 @@ inline event_t create( * @note 
Creating an event */ inline event_t create( - device_t device, - bool uses_blocking_sync = sync_by_busy_waiting, // Yes, that's the runtime default - bool records_timing = do_record_timings, - bool interprocess = not_interprocess); + device_t& device, + bool uses_blocking_sync = sync_by_busy_waiting, // Yes, that's the runtime default + bool records_timing = do_record_timings, + bool interprocess = not_interprocess); } // namespace event @@ -342,18 +359,18 @@ inline event_t create( * to the calling code. * * @todo Determine how this waiting takes place (as opposed to stream - * synchrnoization). + * synchronization). * * @param event the event for whose occurrence to wait; must be scheduled * to occur on some stream (possibly the different stream) */ inline void synchronize(const event_t& event) { - auto device_id = event.device_id(); + auto context_handle = event.context_handle(); auto event_handle = event.handle(); - device::current::detail_::scoped_override_t device_for_this_scope(device_id); - auto status = cudaEventSynchronize(event_handle); - throw_if_error(status, "Failed synchronizing " + event::detail_::identify(event_handle, device_id)); + context::current::detail_::scoped_override_t context_for_this_scope(context_handle); + auto status = cuEventSynchronize(event_handle); + throw_if_error(status, "Failed synchronizing " + event::detail_::identify(event)); } } // namespace cuda diff --git a/src/cuda/api/ipc.hpp b/src/cuda/api/ipc.hpp index c5d4a60b..6cb9b50a 100644 --- a/src/cuda/api/ipc.hpp +++ b/src/cuda/api/ipc.hpp @@ -2,7 +2,8 @@ * @file ipc.hpp * * @brief wrappers for CUDA's facilities for sharing on-device - * memory addresses and CUDA events between host processes + * memory addresses and CUDA events between host processes (Inter- + * Process Communication) * * CUDA addresses into device memory are not valid across different * host processes - somewhat, but not entirely, similarly to the @@ -23,9 +24,8 @@ #ifndef CUDA_API_WRAPPERS_IPC_HPP_ #define CUDA_API_WRAPPERS_IPC_HPP_ -#include +#include #include -#include #include @@ -43,7 +43,7 @@ namespace ipc { * The concrete value passed between processes, used to tell * the CUDA Runtime API which memory area is desired. */ -using handle_t = cudaIpcMemHandle_t; +using handle_t = CUipcMemHandle; /** * Obtain a handle for a region of on-device memory which can @@ -59,9 +59,9 @@ using handle_t = cudaIpcMemHandle_t; */ inline handle_t export_(void* device_ptr) { handle_t handle; - auto status = cudaIpcGetMemHandle(&handle, device_ptr); - cuda::throw_if_error(status, - "Failed producing an IPC memory handle for device pointer " + cuda::detail_::ptr_as_hex(device_ptr)); + auto status = cuIpcGetMemHandle(&handle, device::address(device_ptr)); + cuda::throw_if_error(status, "Failed producing an IPC memory handle for device pointer " + + cuda::detail_::ptr_as_hex(device_ptr)); return handle; } @@ -69,7 +69,7 @@ inline handle_t export_(void* device_ptr) { * @brief Obtain a CUDA pointer from a handle passed * by inter-process communication * - * @note the couterpart of @ref memory::ipc::unmap. + * @note the counterpart of @ref memory::ipc::unmap. 
* * @param handle the handle which allows us access to the on-device address * @return a pointer to the relevant address (which may not have the same value @@ -78,10 +78,9 @@ inline handle_t export_(void* device_ptr) { template inline T* import(const handle_t& handle) { - void* device_ptr; - auto status = cudaIpcOpenMemHandle(&device_ptr, handle, cudaIpcMemLazyEnablePeerAccess); - cuda::throw_if_error(status, - "Failed obtaining a device pointer from an IPC memory handle"); + CUdeviceptr device_ptr; + auto status = cuIpcOpenMemHandle(&device_ptr, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + cuda::throw_if_error(status, "Failed obtaining a device pointer from an IPC memory handle"); return reinterpret_cast(device_ptr); } @@ -92,10 +91,8 @@ inline T* import(const handle_t& handle) */ inline void unmap(void* ipc_mapped_ptr) { - auto status = cudaIpcCloseMemHandle(ipc_mapped_ptr); - cuda::throw_if_error(status, - "Failed unmapping IPC memory mapped to " + - cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); + auto status = cuIpcCloseMemHandle(device::address(ipc_mapped_ptr)); + cuda::throw_if_error(status, "Failed unmapping IPC memory mapped to " + cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); } /** @@ -109,7 +106,7 @@ inline void unmap(void* ipc_mapped_ptr) template class imported_t { public: // constructors & destructor - imported_t(const handle_t& handle) : ptr_(import(handle)) + explicit imported_t(const handle_t& handle) : ptr_(import(handle)) { if (ptr_ == nullptr) { throw ::std::logic_error("IPC memory handle yielded a null pointer"); @@ -156,25 +153,24 @@ namespace ipc { * The concrete value passed between processes, used to tell * the CUDA Runtime API which event is desired. */ -using handle_t = cudaIpcEventHandle_t; +using handle_t = CUipcEventHandle; namespace detail_ { inline handle_t export_(event::handle_t event_handle) { handle_t ipc_handle; - auto status = cudaIpcGetEventHandle(&ipc_handle, event_handle); - cuda::throw_if_error(status, - "Failed obtaining an IPC event handle for " + event::detail_::identify(event_handle)); + auto status = cuIpcGetEventHandle(&ipc_handle, event_handle); + cuda::throw_if_error(status, "Failed obtaining an IPC event handle for " + + event::detail_::identify(event_handle)); return ipc_handle; } inline event::handle_t import(const handle_t& handle) { event::handle_t event_handle; - auto status = cudaIpcOpenEventHandle(&event_handle, handle); - cuda::throw_if_error(status, - "Failed obtaining an event handle from an IPC event handle"); + auto status = cuIpcOpenEventHandle(&event_handle, handle); + cuda::throw_if_error(status, "Failed obtaining an event handle from an IPC event handle"); return event_handle; } @@ -188,7 +184,7 @@ inline event::handle_t import(const handle_t& handle) * may obtain a proper CUDA event * */ -inline handle_t export_(event_t& event); +inline handle_t export_(const event_t& event); /** * Obtain a proper CUDA event, corresponding to an event created by another @@ -198,10 +194,19 @@ inline handle_t export_(event_t& event); * from an event handle (or otherwise - have a handle provide both an event handle and * a device ID), but that is not currently the case. 
 * - * @param device the device to which the imported event corresponds - * @param handle the handle obtained via inter-process communications + * @param event_ipc_handle the handle obtained via inter-process communications */ -inline event_t import(device_t& device, const handle_t& handle); +///@{ + /** + * @param device the device with which the imported event is associated + */ +inline event_t import(const device_t& device, const handle_t& event_ipc_handle); + +/** + * @param context the device-context with which the imported event is associated + */ +inline event_t import(const context_t& context, const handle_t& event_ipc_handle); +///@} } // namespace ipc } // namespace event diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp index c24fea32..9efb79bb 100644 --- a/src/cuda/api/kernel.hpp +++ b/src/cuda/api/kernel.hpp @@ -1,110 +1,148 @@ /** * @file kernel.hpp * - * @brief Functions for querying information and making settings - * regarding CUDA kernels (`__global__` functions). + * @brief Contains a base wrapper class for CUDA kernels - both statically and + * dynamically compiled; and some related functionality. * - * @note This file does _not_ define any kernels itself. + * @note This file does _not_ define any kernels itself. */ #pragma once #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ #define CUDA_API_WRAPPERS_KERNEL_HPP_ -#include -#include +#include #include -#include +#include +// #include #include +#include namespace cuda { ///@cond class device_t; -class stream_t; class kernel_t; -///@endcond +///@endcond namespace kernel { namespace detail_ { -inline kernel_t wrap(device::id_t device_id, const void* ptr); +kernel_t wrap(device::id_t device_id, context::handle_t context_id, kernel::handle_t f); -} // namespace detail - -/** - * @brief a wrapper around `cudaFuncAttributes`, offering - * a few convenience member functions.
- */ -struct attributes_t : cudaFuncAttributes { +#ifndef NDEBUG +static const char* attribute_name(int attribute_index) +{ + // Note: These correspond to the values of enum CUfunction_attribute_enum + static const char* names[] = { + "Maximum number of threads per block", + "Statically-allocated shared memory size in bytes", + "Required constant memory size in bytes", + "Required local memory size in bytes", + "Number of registers used by each thread", + "PTX virtual architecture version into which the kernel code was compiled", + "Binary architecture version for which the function was compiled", + "Indication whether the function was compiled with cache mode CA", + "Maximum allowed size of dynamically-allocated shared memory use size bytes", + "Preferred shared memory carve-out to actual shared memory" + }; + return names[attribute_index]; +} +#endif - cuda::device::compute_capability_t ptx_version() const noexcept { - return device::compute_capability_t::from_combined_number(ptxVersion); - } +inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute) +{ + kernel::attribute_value_t attribute_value; + auto result = cuFuncGetAttribute(&attribute_value, attribute, handle); + throw_if_error(result, + ::std::string("Failed obtaining attribute ") + +#ifdef NDEBUG + ::std::to_string(static_cast<::std::underlying_type::type>(attribute)) +#else + attribute_name(attribute) +#endif + ); + return attribute_value; +} - cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept { - return device::compute_capability_t::from_combined_number(binaryVersion); - } -}; +} // namespace detail_ } // namespace kernel /** - * A non-owning wrapper class for CUDA `__global__` functions + * A non-owning wrapper for CUDA kernels - whether they be `__global__` functions compiled + * apriori, or the result of dynamic NVRTC compilation, or obtained in some other future + * way. + * + * @note The association of a `kernel_t` with an individual device or context is somewhat + * tenuous. That is, the same function could be used with any other compatible device; + * However, many/most of the features, attributes and settings are context-specific + * or device-specific. + * + * @note NVRTC-compiled kernels can only use this class, with apriori-compiled + * kernels can use their own subclass. * - * @note The association of a `kernel_t` with an individual device is somewhat tenuous. - * That is, the same function pointer could be used with any other device (provided the kernel - * was compiled appropriately). However, many/most of the features, attributes and settings - * are device-specific. + * @todo Consider holding a module handle (possibly null/0/invalid), and a boolean + * saying whether this kernel wrapper holds it. This would allow passing kernel_t's + * without accompanying module_t's. 
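// Editor's sketch (not part of the patch): how a kernel::handle_t (a CUfunction) typically comes
// into being with the raw driver API, before being adopted by kernel::detail_::wrap(). The module
// file name "my_kernels.cubin" and the kernel name "scale_by" are placeholders, and a current
// context is assumed to exist:
CUmodule module_handle;
cuModuleLoad(&module_handle, "my_kernels.cubin");     // or cuModuleLoadData() for an in-memory image
CUfunction function_handle;
cuModuleGetFunction(&function_handle, module_handle, "scale_by");
// A non-owning kernel_t can then be built over function_handle, e.g. via
// kernel::detail_::wrap(device_id, context_handle, function_handle).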
*/ class kernel_t { + public: // getters - const void* ptr() const noexcept { return ptr_; } + context_t context() const noexcept; device_t device() const noexcept; -protected: - device::id_t device_id() const noexcept { return device_id_; } - -public: // type_conversions - operator const void*() noexcept { return ptr_; } + device::id_t device_id() const noexcept { return device_id_; } + context::handle_t context_handle() const noexcept { return context_handle_; } + kernel::handle_t handle() const noexcept { return handle_; } public: // non-mutators - inline kernel::attributes_t attributes() const; + kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const + { + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + return kernel::detail_::get_attribute_in_current_context(handle(), attribute); + } -/* - // The following are commented out because there are no CUDA API calls for them! - // You may uncomment them if you'd rather get an exception... + cuda::device::compute_capability_t ptx_version() const noexcept { + auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION); + return device::compute_capability_t::from_combined_number(raw_attribute); + } - multiprocessor_cache_preference_t cache_preference() const; - multiprocessor_shared_memory_bank_size_option_t shared_memory_bank_size() const; -*/ + cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept { + auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION); + return device::compute_capability_t::from_combined_number(raw_attribute); + } /** - * @brief Calculates the number of grid blocks which may be "active" on a given GPU - * multiprocessor simultaneously (i.e. with warps from any of these block - * being schedulable concurrently) + * @return the maximum number of threads per block for which the GPU device can satisfy + * this kernel's hardware requirement - typically, the number of registers in use. * - * @param num_threads_per_block - * @param dynamic_shared_memory_per_block - * @param disable_caching_override On some GPUs, the choice of whether to - * cache memory reads affects occupancy. But what if this caching results in 0 - * potential occupancy for a kernel? There are two options, controlled by this flag. - * When it is set to false - the calculator will assume caching is off for the - * purposes of its work; when set to true, it will return 0 for such device functions. - * See also the "Unified L1/Texture Cache" section of the - * Maxwell - * tuning guide. + * @note the kernel may have other constraints, requiring a different number of threads + * per block; these cannot be determined using this method. 
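// Editor's sketch (not part of the patch): what get_attribute() and maximum_threads_per_block()
// below reduce to at the driver level, assuming function_handle is a valid CUfunction in the
// current context:
int max_threads_per_block = 0;
cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function_handle);
int num_registers_per_thread = 0;
cuFuncGetAttribute(&num_registers_per_thread, CU_FUNC_ATTRIBUTE_NUM_REGS, function_handle);
// max_threads_per_block reflects the limit implied by the kernel's resource usage (typically its
// register count) - not any algorithmic constraint the kernel's author may have in mind.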
*/ - grid::dimension_t maximum_active_blocks_per_multiprocessor( - grid::block_dimension_t num_threads_per_block, - memory::shared::size_t dynamic_shared_memory_per_block, - bool disable_caching_override = false); + grid::block_dimension_t maximum_threads_per_block() const + { + return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); + } -public: // mutators + grid::complete_dimensions_t min_grid_params_for_max_occupancy( + memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) const; - void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value); + using shared_memory_size_determiner_t = size_t (*)(int block_size); + + grid::complete_dimensions_t min_grid_params_for_max_occupancy( + shared_memory_size_determiner_t shared_memory_size_determiner, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) const; + + +public: // methods mutating the kernel-in-context, but not this reference object + + void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const; /** * @brief Change the hardware resource carve-out between L1 cache and shared memory @@ -117,54 +155,16 @@ class kernel_t { * also be set on the individual device-function level, by specifying the amount of shared * memory the kernel may require. */ - void opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel); - - /** - * - * @param dynamic_shared_memory_size The amount of dynamic shared memory each grid block will - * need. - * @param block_size_limit do not return a block size above this value; the default, 0, - * means no limit on the returned block size. - * @param disable_caching_override On platforms where global caching affects occupancy, - * and when enabling caching would result in zero occupancy, the occupancy calculator will - * calculate the occupancy as if caching is disabled. Setting this to true makes the - * occupancy calculator return 0 in such cases. More information can be found about this - * feature in the "Unified L1/Texture Cache" section of the - * Maxwell tuning guide. - * - * @return A pair, with the second element being the maximum achievable block size - * (1-dimensional), and the first element being the minimum number of such blocks necessary - * for keeping the GPU "busy" (again, in a 1-dimensional grid). - */ - grid::complete_dimensions_t min_grid_params_for_max_occupancy( - memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, - grid::block_dimension_t block_size_limit = 0, - bool disable_caching_override = false) const; - - template - grid::complete_dimensions_t min_grid_params_for_max_occupancy( - UnaryFunction block_size_to_dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit = 0, - bool disable_caching_override = false) const; - - /** - * @brief Indicate the desired carve-out between shared memory and L1 cache when launching - * this kernel - with fine granularity. - * - * On several nVIDIA GPU micro-architectures, the L1 cache and the shared memory in each - * symmetric multiprocessor (=physical core) use the same hardware resources. The - * carve-out between the two uses has a device-wide value (which can be changed), but the - * driver can set another value for a specific function. 
This function doesn't make a demand - * from the CUDA runtime (as in @p opt_in_to_extra_dynamic_memory), but rather indicates - * what is the fraction of L1 to shared memory it would like the kernel scheduler to carve - * out. - * - * @param shared_mem_percentage The percentage - from 0 to 100 - of the combined L1/shared - * memory space the user wishes to assign to shared memory. - * - * @note similar to @ref set_cache_preference() - but with finer granularity. - */ - void set_preferred_shared_mem_fraction(unsigned shared_mem_percentage); + void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const + { + auto amount_required_by_kernel_ = (kernel::attribute_value_t) amount_required_by_kernel; + if (amount_required_by_kernel != (cuda::memory::shared::size_t) amount_required_by_kernel_) { + throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the " + "representation range for kernel attribute values"); + } + // TODO: Consider a check in debug mode for the value being within range + set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_); + } /** * @brief Indicate the desired carve-out between shared memory and L1 cache when launching @@ -183,51 +183,241 @@ class kernel_t { * * @note similar to @ref set_preferred_shared_mem_fraction() - but with coarser granularity. */ - void set_cache_preference(multiprocessor_cache_preference_t preference); + void set_cache_preference(multiprocessor_cache_preference_t preference) + { + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetCacheConfig(handle(), (CUfunc_cache) preference); + throw_if_error(result, + "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " + "CUDA device function"); + } /** - * @brief Sets a device function's preference of shared memory bank size preference - * (for the current device probably) + * @brief Sets a device function's preference of shared memory bank size * * @param config bank size setting to make */ - void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config); - - -protected: // ctors & dtor - kernel_t(device::id_t device_id, const void* f) - : device_id_(device_id), ptr_(f) + void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) { - // TODO: Consider checking whether this actually is a device function - // TODO: Consider performing a check for nullptr + // TODO: Need to set a context, not a device + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetSharedMemConfig(handle(), static_cast(config) ); + throw_if_error(result, "Failed setting the shared memory bank size"); } +protected: // ctors & dtor + kernel_t(device::id_t device_id, context::handle_t context_handle, kernel::handle_t handle) + : device_id_(device_id), context_handle_(context_handle), handle_(handle) { } + public: // ctors & dtor - ~kernel_t() = default; + friend kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t); - friend kernel_t kernel::detail_::wrap(device::id_t, const void* ptr); + kernel_t(const kernel_t& other) = default; // Note: be careful with subclasses + kernel_t(kernel_t&& other) = default; // Note: be careful with subclasses + +public: // ctors & dtor + virtual ~kernel_t() = default; protected: // data members - const device::id_t device_id_; - const void* const ptr_; -}; + 
device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can? + context::handle_t context_handle_; + mutable kernel::handle_t handle_; +}; // kernel_t namespace kernel { namespace detail_ { -inline kernel_t wrap(device::id_t device_id, const void* function_ptr) +inline kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f) { - return { device_id, reinterpret_cast(function_ptr) }; + return kernel_t{ device_id, context_id, f }; } } // namespace detail_ -template -kernel_t wrap(const device_t &device, KernelFunctionPtr function_ptr); +namespace occupancy { + +namespace detail_ { + +/* +// TODO: Can't we make full use of the closure here? If only we had +// some void* parameter to the b2d function... +template +cuda::size_t CUDA_CB block_size_to_dynamic_shared_mem_size_helper(int blockSize) +{ + return UnaryFunction{}(blockSize); +} + + +#if CUDART_VERSION <= 10000 + throw cuda::runtime_error {cuda::status::not_yet_implemented}; +#else + int min_grid_size_in_blocks { 0 }; + int block_size { 0 }; + // Note: only initializing the values her because of a + // spurious (?) compiler warning about potential uninitialized use. + + size_t ignored_fixed_dynamic_shared_mem_size { 0 }; + auto result = cuOccupancyMaxPotentialBlockSizeWithFlags( + &min_grid_size_in_blocks, &block_size, + kernel_handle, + block_size_to_dynamic_shared_mem_size_helper, + ignored_fixed_dynamic_shared_mem_size, + static_cast(block_size_limit), + disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT + ); + +// CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags( +// int *minGridSize, int *blockSize, CUfunction func, +// CUoccupancyB2DSize blockSizeToDynamicSMemSize, +// size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + + throw_if_error(result, + "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle) + + " on " + device::detail_::identify(device_id)); + return { min_grid_size_in_blocks, block_size }; +#endif // CUDART_VERSION <= 10000 +} +*/ + +// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored; +// if block_size_limit is 0, it is ignored. +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + CUfunction kernel_handle, + cuda::device::id_t device_id, + CUoccupancyB2DSize determine_shared_mem_by_block_size, + cuda::memory::shared::size_t fixed_shared_mem_size, + cuda::grid::block_dimension_t block_size_limit, + bool disable_caching_override) +{ +#if CUDART_VERSION <= 10000 + throw cuda::runtime_error {cuda::status::not_yet_implemented}; +#else + int min_grid_size_in_blocks { 0 }; + int block_size { 0 }; + // Note: only initializing the values her because of a + // spurious (?) compiler warning about potential uninitialized use. + + auto result = cuOccupancyMaxPotentialBlockSizeWithFlags( + &min_grid_size_in_blocks, &block_size, + kernel_handle, + determine_shared_mem_by_block_size, + fixed_shared_mem_size, + static_cast(block_size_limit), + disable_caching_override ? 
CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT + ); + + throw_if_error(result, + "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle) + + " on " + device::detail_::identify(device_id) + " with maximum occupancy given dynamic shared " + "memory and block size data"); + return { min_grid_size_in_blocks, block_size }; +#endif // CUDART_VERSION <= 10000 +} + +} // namespace detail_ + + +/** +* @brief See the Driver API documentation for @ref cuOccupancyAvailableDynamicSMemPerBlock +*/ +inline memory::shared::size_t max_dynamic_shared_memory_per_block( + const kernel_t &kernel, + grid::dimension_t blocks_on_multiprocessor, + grid::block_dimension_t block_size_in_threads) +{ + size_t result; + auto status = cuOccupancyAvailableDynamicSMemPerBlock( + &result, kernel.handle(), (int) blocks_on_multiprocessor, (int) block_size_in_threads); + throw_if_error(status, + "Determining the available dynamic memory per block, given the number of blocks on a multiprocessor and their size"); + return (memory::shared::size_t) result; +} + +/** +* @brief See the Driver API documentation for @ref cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags +*/ +inline grid::dimension_t max_blocks_per_multiprocessor( + const kernel_t &kernel, + grid::block_dimension_t block_size_in_threads, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override = false) +{ + int result; + auto flags = (unsigned) disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT; + auto status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &result, kernel.handle(), (int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags); + throw_if_error(status, + "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dyanmic memory per block"); + return result; +} + +/** +* +* @param dynamic_shared_memory_size The amount of dynamic shared memory each grid block will +* need. +* @param block_size_limit do not return a block size above this value; the default, 0, +* means no limit on the returned block size. +* @param disable_caching_override On platforms where global caching affects occupancy, +* and when enabling caching would result in zero occupancy, the occupancy calculator will +* calculate the occupancy as if caching is disabled. Setting this to true makes the +* occupancy calculator return 0 in such cases. More information can be found about this +* feature in the "Unified L1/Texture Cache" section of the +* Maxwell tuning guide. +* +* @return A pair, with the second element being the maximum achievable block size +* (1-dimensional), and the first element being the minimum number of such blocks necessary +* for keeping the GPU "busy" (again, in a 1-dimensional grid). 
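// Editor's sketch (not part of the patch): the flag-less driver call corresponding to the
// occupancy helper above, assuming function_handle is a valid CUfunction, no dynamic shared
// memory, and no block-size limit:
int min_grid_size_in_blocks = 0;
int block_size = 0;
cuOccupancyMaxPotentialBlockSize(
	&min_grid_size_in_blocks, &block_size, function_handle,
	nullptr,   // no block-size-to-dynamic-shared-memory-size callback
	0,         // fixed dynamic shared memory size, in bytes
	0);        // no upper limit on the block size
// A grid of min_grid_size_in_blocks blocks, each of block_size threads, should be enough to
// keep the device's multiprocessors occupied with this kernel.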
+*/ +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + kernel_t kernel, + memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) +{ + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device_id(), nullptr, + dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} + +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + kernel_t kernel, + kernel_t::shared_memory_size_determiner_t + shared_memory_size_determiner, + cuda::grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) +{ + size_t ignored_fixed_dynamic_shared_mem_size { 0 }; + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device_id(), shared_memory_size_determiner, + ignored_fixed_dynamic_shared_mem_size, block_size_limit, disable_caching_override); +} + +} // namespace occupancy } // namespace kernel +inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( + memory::shared::size_t dynamic_shared_memory_size, + grid::block_dimension_t block_size_limit, + bool disable_caching_override) const +{ + return kernel::occupancy::min_grid_params_for_max_occupancy( + *this, dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} + +inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( + shared_memory_size_determiner_t shared_memory_size_determiner, + cuda::grid::block_dimension_t block_size_limit, + bool disable_caching_override) const +{ + return kernel::occupancy::min_grid_params_for_max_occupancy( + *this, shared_memory_size_determiner, block_size_limit, disable_caching_override); +} + } // namespace cuda #endif // CUDA_API_WRAPPERS_KERNEL_HPP_ diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index 68eda773..162659e3 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -17,7 +17,7 @@ * and sticking to proper C++; in other words, the wrappers are "ugly" * instead of client code having to be. *
  • Avoiding some of the "parameter soup" of launching a kernel: It's - * rather easy to mix up shared memory sizes with stream IDs; grid and + * not so difficult to mix up shared memory sizes with stream handles; grid and * block dimensions with each other; and even grid/block dimensions with * the scalar parameters - since a `dim3` is constructible from * integral values. Instead, we enforce a launch configuration structure: @@ -42,9 +42,8 @@ #ifndef CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ #define CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ -#include #include -#include +#include #if (__CUDACC_VER_MAJOR__ >= 9) #include @@ -73,6 +72,38 @@ constexpr grid::block_dimensions_t single_thread_per_block() { return 1; } namespace detail_ { +template struct bool_pack; + +template +using all_true = ::std::is_same, bool_pack>; + +/** + * @brief adapt a type to be usable as a kernel parameter. + * + * CUDA kernels don't accept just any parameter type a C++ function may accept. + * Specifically: No references, arrays decay (IIANM) and functions pass by address. + * However - not all "decaying" of `::std::decay` is necessary. Such transformation + * can be effected by this type-trait struct. + */ +template +struct kernel_parameter_decay { +private: + typedef typename ::std::remove_reference

    ::type U; +public: + typedef typename ::std::conditional< + ::std::is_array::value, + typename ::std::remove_extent::type*, + typename ::std::conditional< + ::std::is_function::value, + typename ::std::add_pointer::type, + U + >::type + >::type type; +}; + +template +using kernel_parameter_decay_t = typename kernel_parameter_decay

    ::type; + template struct is_function_ptr: ::std::integral_constant::value and ::std::is_function::type>::value> { }; @@ -86,25 +117,46 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar collect_argument_addresses(collected_addresses + 1, ::std::forward(args)...); } -// Note: Unlike the non-detail_ functions - this one -// cannot handle type-erased kernel_t's. -template -inline void enqueue_launch( - RawKernel kernel_function, - stream::handle_t stream_handle, - launch_configuration_t launch_configuration, - KernelParameters&&... parameters) +// For partial template specialization on WrappedKernel... +template +struct enqueue_launch_helper { + void operator()( + Kernel kernel_function, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters); +}; + +template +void enqueue_launch( + ::std::integral_constant, + Kernel kernel_function, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters); + +template +void enqueue_launch( + ::std::integral_constant, + Kernel kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters); + +template +void enqueue_raw_kernel_launch( + KernelFunction kernel_function, + stream::handle_t stream_handle, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) #ifndef __CUDACC__ // If we're not in CUDA's NVCC, this can't run properly anyway, so either we throw some // compilation error, or we just do nothing. For now it's option 2. - ; +; #else { - static_assert(::std::is_function::value or - (is_function_ptr::value), - "Only a bona fide function can be a CUDA kernel and be launched; " - "you were attempting to enqueue a launch of something other than a function"); - + static_assert(::std::is_function::value or (is_function_ptr::value), + "Only a bona fide function can be launched as a CUDA kernel"); if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) { // regular plain vanilla launch kernel_function <<< @@ -112,11 +164,11 @@ inline void enqueue_launch( launch_configuration.dimensions.block, launch_configuration.dynamic_shared_memory_size, stream_handle - >>>(::std::forward(parameters)...); + >>>(::std::forward(parameters)...); cuda::outstanding_error::ensure_none("Kernel launch failed"); } else { -#if __CUDACC_VER_MAJOR__ >= 9 +#if __CUDACC_VER_MAJOR__ >= 9 // Cooperative launches cannot be made using the triple-chevron syntax, // nor is there a variadic-template of the launch API call, so we need to // a bit of useless work here. We could have done exactly the same thing @@ -133,13 +185,18 @@ inline void enqueue_launch( // of the two terms is confusing here and depends on how you // look at things. 
detail_::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); - auto status = cudaLaunchCooperativeKernel( - (const void*) kernel_function, - launch_configuration.dimensions.grid, - launch_configuration.dimensions.block, - argument_ptrs, + kernel::handle_t kernel_function_handle = kernel::detail_::get_handle( (const void*) kernel_function); + auto status = cuLaunchCooperativeKernel( + kernel_function_handle, + launch_configuration.dimensions.grid.x, + launch_configuration.dimensions.grid.y, + launch_configuration.dimensions.grid.z, + launch_configuration.dimensions.block.x, + launch_configuration.dimensions.block.y, + launch_configuration.dimensions.block.z, launch_configuration.dynamic_shared_memory_size, - stream_handle); + stream_handle, + argument_ptrs); throw_if_error(status, "Cooperative kernel launch failed"); #else @@ -153,6 +210,59 @@ inline void enqueue_launch( } // namespace detail_ + +namespace kernel { + +namespace detail_ { + +// The helper code here is intended for re-imbuing kernel-related classes with the types +// of the kernel parameters. This is necessary since kernel wrappers may be type-erased +// (which makes it much easier to work with them and avoids a bunch of code duplication). +// +// Note: The type-unerased kernel must be a non-const function pointer. Why? Not sure. +// even though function pointers can't get written through, for some reason they are +// expected not to be const. + + +template +struct raw_kernel_typegen { + // You should be careful to only instantiate this class with nice simple types we can pass to CUDA kernels. +// static_assert( +// all_true< +// ::std::is_same< +// KernelParameters, +// ::cuda::detail_::kernel_parameter_decay_t>::value... +// >::value, +// "All kernel parameter types must be decay-invariant" ); + using type = void(*)(cuda::detail_::kernel_parameter_decay_t...); +}; + +} // namespace detail_ + +template +typename detail_::raw_kernel_typegen::type +unwrap(apriori_compiled_kernel_t kernel) +{ + using raw_kernel_t = typename detail_::raw_kernel_typegen::type; + return reinterpret_cast(const_cast(kernel.ptr())); +} + +} // namespace kernel + +namespace detail_ { + +template +struct enqueue_launch_helper { + void operator()( + apriori_compiled_kernel_t wrapped_kernel, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters); +}; + +} // namespace detail_ + + /** * @brief Enqueues a kernel on a stream (=queue) on the current CUDA device. * @@ -171,9 +281,8 @@ inline void enqueue_launch( *

    As kernels do not return values, neither does this function. It also contains no hooks, logging * commands etc. - if you want those, write an additional wrapper (perhaps calling this one in turn). * - * @param kernel_function the kernel to apply. Pass it just as-it-is, as though it were any other function. Note: + * @param kernel the kernel to apply. Pass it just as-it-is, as though it were any other function. Note: * If the kernel is templated, you must pass it fully-instantiated. Alternatively, you can pass a - * @ref kernel_t wrapping the raw pointer to the function. * @param stream the CUDA hardware command queue on which to place the command to launch the kernel (affects * the scheduling of the launch and the execution) * @param launch_configuration not all launches of the same kernel are identical: The launch may be configured @@ -184,13 +293,28 @@ inline void enqueue_launch( */ template void enqueue_launch( - Kernel kernel_function, + Kernel kernel, const stream_t& stream, launch_configuration_t launch_configuration, - KernelParameters&&... parameters); + KernelParameters&&... parameters) +{ + static_assert( + detail_::all_true< + ::std::is_trivially_copyable>::value... + >::value, + "All kernel parameter types must be of a trivially copyable (decayed) type." ); + constexpr const bool wrapped_kernel = ::std::is_base_of::type>::value; + // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to + // use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch + // function for each of them will basically be just a one-liner :-( + detail_::enqueue_launch( + ::std::integral_constant{}, + ::std::forward(kernel), stream, launch_configuration, + ::std::forward(parameters)...); +} /** - * Variant of @ref enqueue_launch for use with the default stream on the current device. + * Variant of @ref enqueue_launch for use with the default stream in the current context. * * @note This isn't called `enqueue` since the default stream is synchronous. */ diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp new file mode 100644 index 00000000..eeca6743 --- /dev/null +++ b/src/cuda/api/link.hpp @@ -0,0 +1,215 @@ +/** + * @file link.hpp + * + * @brief Wrappers for linking modules of compiled CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_LINK_HPP_ +#define CUDA_API_WRAPPERS_LINK_HPP_ + +#include +#include +#include +#include +#include + +#if __cplusplus >= 201703L +#include +#endif + +namespace cuda { + +///@cond +class device_t; +class module_t; +class link_t; +///@endcond + +namespace link { + +using handle_t = CUlinkState; + +namespace detail_ { + +// TODO: Check if the linking has been completed! +inline link_t wrap( + context::handle_t context, + link::handle_t handle, + link::options_t options, + bool take_ownership = false) noexcept; + +} // namespace detail_ + +inline link_t create(const void* image, link::options_t options); + +// TODO: Use a clase-class with C++17 of later, made up of the two classes here +namespace input { + +/** + * A typed, named, image in memory which can be used as an input to a runtime + * CUDA linking process. 
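// Editor's sketch (not part of the patch): the raw driver-API linking sequence which link_t and
// its add()/complete() methods below wrap. ptx_source and ptx_size are placeholders for a PTX
// image in memory and its size; error checking is elided:
CUlinkState link_state;
cuLinkCreate(0, nullptr, nullptr, &link_state);     // no JIT options
cuLinkAddData(link_state, CU_JIT_INPUT_PTX,
	const_cast<char*>(ptx_source), ptx_size, "my_ptx_input", 0, nullptr, nullptr);
void* cubin = nullptr;
size_t cubin_size = 0;
cuLinkComplete(link_state, &cubin, &cubin_size);    // the cubin memory remains owned by link_state
CUmodule module_handle;
cuModuleLoadData(&module_handle, cubin);            // must precede cuLinkDestroy(), which frees the cubin
cuLinkDestroy(link_state);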
+ */ +struct image_t : memory::region_t { + const char* name; + link::input_type_t type; +}; + +struct file_t { + const char* path; // TODO: Use a proper path in C++14 and later + link::input_type_t type; +}; + +} // namespace input + +} // namespace link + +/** + * @brief Wrapper class for a CUDA link (a process of linking compiled code together into an + * executable binary, using CUDA, at run-time) + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the link is a const-respecting operation on this class. + */ +class link_t { + +public: + /** + * Complete a linking process, producing a completely-linked cubin image (for loading into + * modules). + * + * @return The completely-linked cubin image, in a sized memory range. This memory is owned + * by the link object, and must not be freed/deleted. + */ + memory::region_t complete() const { + void* cubin_output_start; + size_t cubin_output_size; + auto status = cuLinkComplete(handle_, &cubin_output_start, &cubin_output_size); + throw_if_error(status, + "Failed completing the link with state at address " + cuda::detail_::ptr_as_hex(handle_)); + return memory::region_t{cubin_output_start, cubin_output_size}; + } + + // TODO: Replace this with methods which take wrapper classes. + void add(link::input::image_t image, const link::options_t ptx_compilation_options = {}) const + { + auto marshalled_options = ptx_compilation_options.marshal(); + auto status = cuLinkAddData( + handle_, + static_cast(image.type), + image.data(), // TODO: Is this really safe? + image.size(), + image.name, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + throw_if_error(status, + "Failed adding input " + ::std::string(image.name) + " of type " + ::std::to_string(image.type) + " to a link."); + } + + void add_file(link::input::file_t file_input, const link::options_t& options) const + { + auto marshalled_options = options.marshal(); + auto status = cuLinkAddFile( + handle_, + static_cast(file_input.type), + file_input.path, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + throw_if_error(status, + "Failed loading an object of type " + ::std::to_string(file_input.type) + " from file " + file_input.path); + } + +#if __cplusplus >= 201703L + void add_file(const ::std::filesystem::path& path, link::input_type_t file_contents_type) const + { + return add_file(path.c_str(), file_contents_type); + } +#endif + +protected: // constructors + + link_t(context::handle_t context, link::handle_t handle, link::options_t options, bool take_ownership) noexcept + : context_handle_(context), handle_(handle), options_(options), owning(take_ownership) { } + +public: // friendship + + friend link_t link::detail_::wrap(context::handle_t context, link::handle_t handle, link::options_t, bool take_ownership) noexcept; + +public: // constructors and destructor + + link_t(const link_t&) = delete; + + link_t(link_t&& other) noexcept : + link_t(other.context_handle_, other.handle_, other.options_, other.owning) + { + other.owning = false; + }; + + ~link_t() + { + if (owning) { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + auto status = cuLinkDestroy(handle_); + throw_if_error(status, + ::std::string("Failed destroying the link ") + detail_::ptr_as_hex(handle_) + + " in " + context::detail_::identify(context_handle_)); + } + } + +public: // operators + + 
link_t& operator=(const link_t& other) = delete; + link_t& operator=(link_t&& other) = delete; + +protected: // data members + const context::handle_t context_handle_; + const link::handle_t handle_; + link::options_t options_; + bool owning; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered +}; + +namespace link { + +inline link_t create(link::options_t options = link::options_t{}) +{ + handle_t new_link_handle; + auto marshalled_options = options.marshal(); + auto status = cuLinkCreate( + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()), + &new_link_handle + ); + throw_if_error(status, "Failed creating a new link "); + auto do_take_ownership = true; + return detail_::wrap( + context::current::detail_::get_handle(), + new_link_handle, + options, + do_take_ownership); +} + +namespace detail_ { + +// TODO: Check if the linking has been completed! +inline link_t wrap( + context::handle_t context, + link::handle_t handle, + link::options_t options, + bool take_ownership) noexcept +{ + return link_t{context, handle, options, take_ownership}; +} + +} // namespace detail_ + +} // namespace link + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_LINK_HPP_ diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp new file mode 100644 index 00000000..24b6b8d1 --- /dev/null +++ b/src/cuda/api/link_options.hpp @@ -0,0 +1,339 @@ +/** + * @file jit.hpp + * + * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ +#define CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ + + +#include + +#if __cplusplus >= 202002L +#include +#endif +#include + +namespace cuda { + +#if __cplusplus >= 202002L +using ::std::span; +#else + template + // Poor man's span. TODO: Replace it with a proper span. + // + // Note: A span is a reference type. That means that changes + // to the pointed-to data are _not_considered changes to the + // span, hence you can get to that data with const methods. + struct span { + T* data_; + size_t size_; + +// T*& data() noexcept { return const_cast(data_); } + T* data() const noexcept { return data_; } + constexpr size_t size() const noexcept { return size_; } + const T* cbegin() const { return data(); } + const T* cend() const { return data() + size_; } + T* begin() const { return data(); } + T* end() const { return data() + size_; } + }; +#endif + + +///@cond +class module_t; +///@endcond + +namespace link { + +enum input_type_t { + cubin, /// Compiled device-class-specific device code + ptx, /// PTX (microarchitecture-inspecific intermediate representation) + fatbin, /// A bundle of multiple cubin and/or PTX inputs; typically + object, /// A host-side binary object with embedded device code; a `.o` file + library, /// An archive of objects files with embedded device code; a `.a` file +} ; + +enum fallback_strategy_t { + prefer_ptx = 0, + prefer_binary = 1, +}; + +enum class caching_mode_t { + + /** + * ca - Cache at all levels, likely to be accessed again. + * + * The default load instruction cache operation is ld.ca, + * which allocates cache lines in all levels (L1 and L2) with + * normal eviction policy. Global data is coherent at the L2 + * level, but multiple L1 caches are not coherent for global + * data. 
+ */ + cash_at_all_levels, + cash_in_l1_and_l2 = cash_at_all_levels, + ca = cash_at_all_levels, + + /** + * Cache at global level (cache in L2 and below, not L1). + * + * Use ld.cg to cache loads only globally, bypassing the L1 + * cache, and cache only in the L2 cache. + */ + cache_at_global_level, + cache_in_l2_only = cache_at_global_level, + cg = cache_at_global_level, + + /** + * Cache streaming, likely to be accessed once. + * + * The ld.cs load cached streaming operation allocates global + * lines with evict-first policy in L1 and L2 to limit cache + * pollution by temporary streaming data that may be accessed + * once or twice. When ld.cs is applied to a Local window + * address, it performs the ld.lu operation. + */ + cache_as_evict_first, + cache_streaming = cache_as_evict_first, + cs = cache_streaming, + + /** + * Last use. + * + * The compiler/programmer may use ld.lu when restoring spilled + * registers and popping function stack frames to avoid needless + * write-backs of lines that will not be used again. The ld.lu + * instruction performs a load cached streaming operation + * (ld.cs) on global addresses. + */ + last_use, + lu = last_use, + + /** + * Don't cache and fetch again (consider cached system memory + * lines stale, fetch again). + * + * The ld.cv load operation applied to a global System Memory + * address invalidates (discards) a matching L2 line and + * re-fetches the line on each new load. + */ + fetch_again_and_dont_cache, + cv = fetch_again_and_dont_cache, +}; + +using register_index_t = unsigned; +using optimization_level_t = unsigned; +using option_t = CUjit_option; +constexpr const optimization_level_t maximum_optimization_level { 4 }; + +struct marshalled_options_t { + using size_type = unsigned; + constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; + +protected: + ::std::array option_buffer; + ::std::array value_buffer; + size_type count_ { 0 }; +public: + size_type count() { return count_; } + + void push_back(option_t option) + { + if (count_ >= max_num_options) { + throw ::std::invalid_argument("Attempt to push back the same option a second time"); + // If each option is pushed back at most once, the count cannot exist the number + // of possible options. In fact, it can't even reach it because some options contradict. + // + // Note: This check will not catch all repeat push-backs, nor the case of conflicting + // options - the cuLink methods will catch those. We just want to avoid overflow. + } + option_buffer[count_] = option; + count_++; + } +protected: + template + void* process_value(typename ::std::enable_if<::std::is_integral::value, I>::type value) + { + return reinterpret_cast(static_cast(value)); + } + + template + void* process_value(T* value) + { + return static_cast(value); + } + + void* process_value(bool value) { return process_value(value ? 1 : 0); } + + void* process_value(caching_mode_t value) + { + return process_value(static_cast::type>(value)); + } + +public: + + template + void push_back(option_t option, T value) + { + push_back(option); + process_value(value); + // Now set value_buffer[count-1]... + value_buffer[count_-1] = process_value(value); + } + const option_t* options() const { return option_buffer.data(); } + const void * const * values() const { return value_buffer.data(); } +}; + +struct options_t { + + static constexpr const register_index_t no_max_registers_limit { 0 }; + + /** + * Limit the number of registers which a kernel thread may use. + * + * @todo Use an optional. 
+ */ + register_index_t max_num_registers_per_thread { no_max_registers_limit }; + + static constexpr const register_index_t no_min_num_threads_per_block { 0 }; + + /** + * The minimum number of threads per block which the compiler should target + * @note can't be combined with a value for the @ref target property. + * + * @todo Use an optional. + */ + grid::block_dimension_t min_num_threads_per_block { no_min_num_threads_per_block }; + + // Note: The sizes are used as parameters too. + span info_log, error_log; + + static constexpr const optimization_level_t dont_set_optimization_level { maximum_optimization_level + 1 }; + /** + * Compilation optimization level (as in -O1, -O2 etc.) + * + * @todo Use an optional. + */ + optimization_level_t optimization_level { dont_set_optimization_level }; + + /** + * + * @todo Use a variant or optional+variant. + */ + struct { + bool obtain_from_cuda_context { true }; + bool use_specific { true }; + device::compute_capability_t specific; + } target; // Can't be combined with CU_JIT_THREADS_PER_BLOCK + + bool specify_fallback_strategy { false }; + /** + * @todo Use an optional. + */ + fallback_strategy_t fallback_strategy { prefer_ptx }; // fallback behavior if a cubin matching (WHAT?) is not found + + /** + * Whether or not to generate indications of which PTX/SASS instructions correspond to which + * lines of the source code, within the compiled output (-lineinfo) + */ + bool generate_debug_information { false }; /// Whether or not to generate debug information within the compiled output (-g) + bool generate_source_line_number_information { false }; + + // It _seems_ that the verbosity is a boolean setting - but this is not clear + bool verbose_log; + + bool specify_default_load_caching_mode { false }; + /** + * Specifies which of the PTX load caching modes use by default, + * when no caching mode is specified in a PTX instruction (-dlcm) + */ + caching_mode_t default_load_caching_mode; + + // Ignoring the "internal purposes only" options; + // + // CU_JIT_NEW_SM3X_OPT + // CU_JIT_FAST_COMPILE + // CU_JIT_GLOBAL_SYMBOL_NAMES + // CU_JIT_GLOBAL_SYMBOL_ADDRESSES + // CU_JIT_GLOBAL_SYMBOL_COUNT + // + +public: + marshalled_options_t marshal() const; +}; + +inline marshalled_options_t options_t::marshal() const +{ + marshalled_options_t marshalled; + + if (max_num_registers_per_thread != no_max_registers_limit) { + marshalled.push_back(CU_JIT_MAX_REGISTERS, max_num_registers_per_thread); + } + + if (min_num_threads_per_block != no_min_num_threads_per_block) { + marshalled.push_back(CU_JIT_THREADS_PER_BLOCK, min_num_threads_per_block); + } + + auto cil = const_cast*>(&info_log); + if (cil->data() != nullptr and cil->size() != 0) { + marshalled.push_back(CU_JIT_INFO_LOG_BUFFER, cil->data()); + marshalled.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, cil->size()); + } + + auto cel = const_cast*>(&error_log); + if (cel->data() != nullptr and cel->size() != 0) { + marshalled.push_back(CU_JIT_ERROR_LOG_BUFFER, cel->data()); + marshalled.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, cel->size()); + } + + if (optimization_level != dont_set_optimization_level) { + marshalled.push_back(CU_JIT_OPTIMIZATION_LEVEL, optimization_level); + } + + if (target.obtain_from_cuda_context) { + marshalled.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); + } + else if (target.use_specific) { + marshalled.push_back(CU_JIT_TARGET, target.specific.as_combined_number()); + } + + if (specify_fallback_strategy) { + marshalled.push_back(CU_JIT_FALLBACK_STRATEGY, fallback_strategy); + } + + if 
(generate_debug_information) { + marshalled.push_back(CU_JIT_GENERATE_DEBUG_INFO); + } + + if (generate_source_line_number_information) { + marshalled.push_back(CU_JIT_GENERATE_LINE_INFO); + } + + if (generate_source_line_number_information) { + marshalled.push_back(CU_JIT_GENERATE_LINE_INFO); + } + + if (verbose_log) { + marshalled.push_back(CU_JIT_LOG_VERBOSE); + } + + if (specify_default_load_caching_mode) { + marshalled.push_back(CU_JIT_CACHE_MODE, default_load_caching_mode); + } + + return marshalled; +} + + +// TODO: Compiler "output options": +// +// threads per block targeted +// compilation wall time +// amount written to info log + +} // namespace assembly_and_link + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 422d4744..71c14337 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -31,7 +31,10 @@ #include #include #include +#include + #include // needed, rather than cuda_runtime_api.h, e.g. for cudaMalloc +#include #include #include // for ::std::memset @@ -41,48 +44,13 @@ namespace cuda { ///@cond class device_t; +class context_t; class stream_t; +class module_t; ///@endcond -/** - * @namespace memory - * Representation, allocation and manipulation of CUDA-related memory, with - * its various namespaces and kinds of memory regions. - */ namespace memory { -namespace detail_ { - -template - class base_region_t { - private: - T* start_ = nullptr; - size_t size_in_bytes_ = 0; - public: - base_region_t() = default; - base_region_t(T* start, size_t size_in_bytes) - : start_(start), size_in_bytes_(size_in_bytes) {} - - T*& start() { return start_; } - size_t& size() { return size_in_bytes_; } - - size_t size() const { return size_in_bytes_; } - T* start() const { return start_; } - T* data() const { return start(); } - T* get() const { return start(); } - }; - -} // namespace detail_ - -struct region_t : public detail_::base_region_t { - using base_region_t::base_region_t; -}; - -struct const_region_t : public detail_::base_region_t { - using base_region_t::base_region_t; - const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} -}; - /** * A memory allocation setting: Can the allocated memory be used in other * CUDA driver contexts (in addition to the implicit default context we @@ -127,8 +95,8 @@ namespace detail_ { inline unsigned make_cuda_host_alloc_flags(allocation_options options) { return - (options.portability == portability_across_contexts::is_portable ? cudaHostAllocPortable : 0) & - (options.write_combining == cpu_write_combining::with_wc ? cudaHostAllocWriteCombined : 0); + (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) & + (options.write_combining == cpu_write_combining::with_wc ? 
CU_MEMHOSTALLOC_WRITECOMBINED : 0); } } // namespace detail_ @@ -161,11 +129,6 @@ struct region_pair { } // namespace mapped -} // namespace memory - - -namespace memory { - /** * @brief CUDA-Device-global memory on a single device (not accessible from the host) */ @@ -176,34 +139,31 @@ namespace detail_ { /** * Allocate memory on current device * - * @param size_in_bytes amount of memory to allocate in bytes + * @param num_bytes amount of memory to allocate in bytes */ -inline region_t allocate(size_t size_in_bytes) +inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes) { - void* allocated = nullptr; + device::address_t allocated = 0; // Note: the typed cudaMalloc also takes its size in bytes, apparently, // not in number of elements - auto status = cudaMalloc(&allocated, size_in_bytes); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAlloc(&allocated, num_bytes); + if (is_success(status) && allocated == 0) { // Can this even happen? hopefully not - status = cudaErrorUnknown; + status = (status_t) status::unknown; } - throw_if_error(status, - "Failed allocating " + ::std::to_string(size_in_bytes) + - " bytes of global memory on CUDA device " + - ::std::to_string(cuda::device::current::detail_::get_id())); - return {allocated, size_in_bytes}; + throw_if_error(status, "Failed allocating " + ::std::to_string(num_bytes) + + " bytes of global memory on the current CUDA device"); + return {as_pointer(allocated), num_bytes}; } -inline region_t allocate(cuda::device::id_t device_id, size_t size_in_bytes) +inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return memory::device::detail_::allocate(size_in_bytes); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return allocate_in_current_context(size_in_bytes); } } // namespace detail_ - namespace async { namespace detail_ { @@ -212,27 +172,27 @@ namespace detail_ { * Allocate memory asynchronously on a specified stream. */ inline region_t allocate( - cuda::device::id_t device_id, - cuda::stream::handle_t stream_handle, - size_t size_in_bytes) + context::handle_t context_handle, + stream::handle_t stream_handle, + size_t num_bytes) { #if CUDART_VERSION >= 11020 - void* allocated = nullptr; + device::address_t allocated = 0; // Note: the typed cudaMalloc also takes its size in bytes, apparently, // not in number of elements - auto status = cudaMallocAsync(&allocated, size_in_bytes, stream_handle); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle); + if (is_success(status) && allocated == 0) { // Can this even happen? 
hopefully not - status = static_cast(cuda::status::unknown); + status = static_cast(status::unknown); } throw_if_error(status, - "Failed scheduling an asynchronous allocation of " + ::std::to_string(size_in_bytes) + - " bytes of global memory on " + stream::detail_::identify(stream_handle, device_id)); - return {allocated, size_in_bytes}; + "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) + + " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) ); + return {as_pointer(allocated), num_bytes}; #else - (void) device_id; + (void) context_handle; (void) stream_handle; - (void) size_in_bytes; + (void) num_bytes; throw cuda::runtime_error(cuda::status::not_yet_implemented, "Asynchronous memory allocation is not supported with CUDA versions below 11.2"); #endif } @@ -243,8 +203,7 @@ inline region_t allocate( * Schedule an allocation of device-side memory on a CUDA stream. * * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" - * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees - * since at least version 5.0 that the minimum allocation is 256 bytes. + * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes. * * @throws cuda::runtime_error if scheduling fails for any reason * @@ -253,8 +212,7 @@ inline region_t allocate( * @return a pointer to the region of memory which will become allocated once the stream * completes all previous tasks and proceeds to also complete the allocation. */ -inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); - +inline region_t allocate(const stream_t& stream, size_t size_in_bytes); } // namespace async @@ -265,36 +223,69 @@ inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); ///@{ inline void free(void* ptr) { - auto result = cudaFree(ptr); + auto result = cuMemFree(address(ptr)); throw_if_error(result, "Freeing device memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { free(region.start()); } ///@} +/** + * Allocate device-side memory on a CUDA device context. + * + * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" + * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees + * since at least version 5.0 that the minimum allocation is 256 bytes. + * + * @throws cuda::runtime_error if allocation fails for any reason + * + * @param device the context in which to allocate memory + * @param size_in_bytes the amount of global device memory to allocate + * @return a pointer to the allocated stretch of memory (only usable within @p context) + */ +inline region_t allocate(const context_t& context, size_t size_in_bytes); + /** * Allocate device-side memory on a CUDA device. * * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" - * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes. + * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees + * since at least version 5.0 that the minimum allocation is 256 bytes. 
* * @throws cuda::runtime_error if allocation fails for any reason * * @param device the device on which to allocate memory - * @param size_in_bytes the amount of memory to allocate - * @return a pointer to the allocated stretch of memory (only usable on the CUDA device) + * @param size_in_bytes the amount of global device memory to allocate + * @return a pointer to the allocated stretch of memory (only usable on @p device) */ -inline region_t allocate(cuda::device_t device, size_t size_in_bytes); +inline region_t allocate(const device_t& device, size_t size_in_bytes); namespace detail_ { + +// Note: Allocates _in the current context_! No current context => failure! struct allocator { - // Allocates on the current device! - void* operator()(size_t size_in_bytes) const { return detail_::allocate(size_in_bytes).start(); } + void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::device::free(ptr); } }; + } // namespace detail_ + +/** + * @brief Sets consecutive elements of a region of memory to a fixed + * value of some width + * + * @note A generalization of `set()`, for different-size units. + * + * @tparam T An unsigned integer type of size 1, 2, 4 or 8 + * @param start The first location to set to @p value ; must be properly aligned. + * @param value A (properly aligned) value to set T-elements to. + * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes). + */ +template +inline void typed_set(T* start, const T& value, size_t num_elements); + /** * @brief Sets all bytes in a region of memory to a fixed value * @@ -304,20 +295,17 @@ struct deleter { */ ///@{ /** - * @param start address at which to start setting memory bytes - * in global CUDA-device-side memory or CUDA-managed memory. - * @param byte_value the value to which to set memory bytes - * @param num_bytes the number of bytes to set to @p byte_value + * @param start starting address of the memory region to set, in a CUDA + * device's global memory + * @param num_bytes size of the memory region in bytes */ inline void set(void* start, int byte_value, size_t num_bytes) { - auto result = cudaMemset(start, byte_value, num_bytes); - throw_if_error(result, "memsetting an on-device buffer"); + return typed_set(static_cast(start), byte_value, num_bytes); } /** - * @param region a stretch of memory whose contents is to be set - * @param byte_value the value to which to set all bytes of @p region + * @param region a region to zero-out, in a CUDA device's global memory */ inline void set(region_t region, int byte_value) { @@ -325,7 +313,6 @@ inline void set(region_t region, int byte_value) } ///@} - /** * @brief Sets all bytes in a region of memory to 0 (zero) */ @@ -339,7 +326,9 @@ inline void zero(void* start, size_t num_bytes) } /** - * @param region the memory region to zero-out + * @param start starting address of the memory region to zero-out, + * in a CUDA device's global memory + * @param num_bytes size of the memory region in bytes */ inline void zero(region_t region) { @@ -367,41 +356,23 @@ inline void zero(T* ptr) * @note Since we assume Compute Capability >= 2.0, all devices support the * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, * where the data is located, and one does not have to specify this. 
- */ -///@{ -/** - * @param destination A pointer to a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source A pointer to a a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory + * + * @note asynchronous version of @ref memory::copy + * + * @param destination A (pointer to) a memory region of size @p num_bytes. + * Must be defined in the same context as @p stream. + * @param source A (pointer to) a memory region of size @p num_bytes. + * Must be defined in the same context as @p stream. * @param num_bytes The number of bytes to copy from @p source to @p destination */ -inline void copy(void *destination, const void *source, size_t num_bytes) -{ - auto result = cudaMemcpy(destination, source, num_bytes, cudaMemcpyDefault); - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error(result, "Synchronously copying data"); -} +///@{ +void copy(void *destination, const void *source, size_t num_bytes); -/** - * @param destination A memory region of the same size as @p source, in - * host memory or on any CUDA device's global memory - * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory - */ inline void copy(void* destination, const_region_t source) { return copy(destination, source.start(), source.size()); } -/** - * @param destination A region of memory to which to copy the data in@source, of - * size at least that of @p source , either in host memory or on any CUDA - * device's global memory. - * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory - */ inline void copy(region_t destination, const_region_t source) { #ifndef NDEBUG @@ -411,6 +382,30 @@ inline void copy(region_t destination, const_region_t source) #endif return copy(destination.start(), source); } + +/** + * @param source A plain array whose contents is to be copied. + */ +template +inline void copy(region_t destination, const T(&source)[N]) +{ +#ifndef NDEBUG + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); + } +#endif + return copy(destination.start(), source, N); +} + +inline void copy(region_t destination, void* source, size_t num_bytes) +{ +#ifndef NDEBUG + if (destination.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds destination size"); + } +#endif + return copy(destination.start(), source, num_bytes); +} ///@} /** @@ -426,13 +421,13 @@ inline void copy(region_t destination, const_region_t source) */ inline void set(void* ptr, int byte_value, size_t num_bytes) { - pointer_t pointer { ptr }; - switch ( pointer.attributes(). 
memory_type() ) { - case device_memory: - case managed_memory: + switch ( type_of(ptr) ) { + case device_: +// case managed_: + case unified_: memory::device::set(ptr, byte_value, num_bytes); break; - case unregistered_memory: - case host_memory: +// case unregistered_: + case host_: ::std::memset(ptr, byte_value, num_bytes); break; default: throw runtime_error( @@ -471,9 +466,8 @@ inline void zero(region_t region) /** * @brief Sets a number of bytes starting in at a given address of memory to 0 (zero) * - * @param start address at which to start setting memory bytes to 0, in + * @param region the memory region to zero-out; may be in host-side memory, * global CUDA-device-side memory or CUDA-managed memory. - * @param num_bytes the number of bytes to set to zero */ inline void zero(void* ptr, size_t num_bytes) { @@ -495,104 +489,210 @@ inline void zero(T* ptr) namespace detail_ { -/** - * @note When constructing this class - destination first, source second - * (otherwise you're implying the opposite direction of transfer). - */ -struct copy_params_t : cudaMemcpy3DParms { - struct tag { }; -protected: - template - copy_params_t(tag, const void *ptr, const array_t& array) : - cudaMemcpy3DParms { 0 }, - pitch(sizeof(T) * array.dimensions().width), - pitched_ptr(make_cudaPitchedPtr( - const_cast(ptr), - pitch, - array.dimensions().width, - array.dimensions().height)) - { - kind = cudaMemcpyDefault; - extent = array.dimensions(); +template +struct base_copy_params; + +template<> +struct base_copy_params<2> { + using intra_context_type = CUDA_MEMCPY2D; + using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? +}; + +template<> +struct base_copy_params<3> { + using type = CUDA_MEMCPY3D_PEER; + using intra_context_type = CUDA_MEMCPY3D; +}; + +// Note these, by default, support inter-context +template +using base_copy_params_t = typename base_copy_params::type; + + +enum class endpoint_t { + source, destination +}; + +template +struct copy_parameters_t : base_copy_params_t { + // TODO: Perhaps use proxies? + + using intra_context_type = typename base_copy_params::intra_context_type; + + using dimensions_type = array::dimensions_t; + + template + void set_endpoint(endpoint_t endpoint, const cuda::array_t &array); + + template + void set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t dimensions); + + template + void set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t dimensions); + + // TODO: Perhaps we should have an dimensioned offset type? + template + void set_offset(endpoint_t endpoint, dimensions_type offset); + + template + void clear_offset(endpoint_t endpoint) + { set_offset(endpoint, dimensions_type::zero()); } + + template + void set_extent(dimensions_type extent); + // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger + + void clear_rest(); + // Clear any dummy fields which are required to be set to 0. Note that important fields, + // which you have not set explicitly, will _not_ be cleared by this method. + +}; + +template<> +template +void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + // Can't set the endpoint context - the basic data structure doesn't support that! 
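+	// (A consequence: 2D copy descriptors built this way can only describe intra-context
+	// copies; the pointer-endpoint overload below which does accept a context handle
+	// throws for any handle other than context::detail_::none.)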
+} + +template<> +template +void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcContext : dstContext) = array.context_handle(); +} + +template<> +template +inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t<2> dimensions) +{ + if (context_handle != context::detail_::none) { + throw cuda::runtime_error( + cuda::status::named_t::not_supported, + "Inter-context copying of 2D arrays is not supported by the CUDA driver"); } + set_endpoint<2>(endpoint, ptr, dimensions); +} -public: - template - copy_params_t(const array_t& destination, const void *source) : - copy_params_t(tag{}, source, destination) - { - srcPtr = pitched_ptr; - dstArray = destination.get(); +template<> +template +inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<2> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } else { + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } } + (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; + // Can't set the endpoint context - the basic data structure doesn't support that! +} - template - copy_params_t(const T* destination, const array_t& source) : - copy_params_t(tag{}, destination, source) - { - srcArray = source.get(); - dstPtr = pitched_ptr; +template<> +template +inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t<3> dimensions) +{ + cuda::memory::pointer_t wrapped{ptr}; + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } else { + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } } + (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); + (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; + (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; +} - size_t pitch; - cudaPitchedPtr pitched_ptr; -}; +template<> +template +inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<3> dimensions) +{ + set_endpoint(endpoint, context::detail_::none, ptr, dimensions); +} + +template<> +inline void copy_parameters_t<2>::clear_rest() +{} +template<> +inline void copy_parameters_t<3>::clear_rest() +{ + srcLOD = 0; + dstLOD = 0; +} + +template<> template -inline void copy(array_t& destination, const T *source) -{ - const auto dimensions = destination.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto source_pitch = width_in_bytes; // i.e. 
no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DToArray( - destination.get(), - offsets.width, - offsets.height, - source, - source_pitch, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault); - throw_if_error(result, "Synchronously copying into a 2D CUDA array"); +inline void copy_parameters_t<2>::set_extent(dimensions_type extent) +{ + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; } -template -inline void copy(array_t& destination, const T *source) +template<> +template +void copy_parameters_t<3>::set_extent(dimensions_type extent) { - const auto copy_params = detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3D(©_params); - throw_if_error(result, "Synchronously copying into a 3-dimensional CUDA array"); + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; + Depth = extent.depth; } -template -inline void copy(T *destination, const array_t& source) -{ - const auto dimensions = source.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto destination_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DFromArray( - destination, - destination_pitch, - source.get(), - offsets.width, - offsets.height, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault); - throw_if_error(result, "Synchronously copying out of a 2D CUDA array"); +template<> +template +void copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; } -template -inline void copy(T* destination, const array_t& source) +template<> +template +void copy_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) { - const auto copy_params = detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3D(©_params); - throw_if_error(result, "Synchronously copying from a 3-dimensional CUDA array"); + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; } -} // namespace detail_ +void set_endpoint(endpoint_t endpoint, void *src); + +inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) +{ + // Note this _must_ be an intra-context copy, as inter-context is not supported + // and there's no indication of context in the relevant data structures + return cuMemcpy2D(¶ms); +} + +inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) +{ + if (params.srcContext == params.dstContext) { + auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); + return cuMemcpy3D(intra_context_params); + } + return cuMemcpy3DPeer(¶ms); +} + +template +status_t multidim_copy(context::handle_t context_handle, copy_parameters_t params) +{ + context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{context_handle}; + return multidim_copy(::std::integral_constant{}, params); +} +} // namespace detail /** * Synchronously copies data from a CUDA array into non-array memory. @@ -604,12 +704,20 @@ inline void copy(T* destination, const array_t& source) * @param source A pointer to a region of contiguous memory holding `destination.size()` values * of type @tparam T. 
The memory may be located either on a CUDA device or in host memory. */ -template -inline void copy(array_t& destination, const T* source) +template +void copy(const array_t& destination, const T *source) { - detail_::copy(destination, source); + detail_::copy_parameters_t params{}; + auto dims = destination.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(dims); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, const_cast(source), dims); + params.set_endpoint(detail_::endpoint_t::destination, destination); + auto status = detail_::multidim_copy(destination.context_handle(), params); + throw_if_error(status, "Copying from a regular memory region into a CUDA array"); } - /** * Synchronously copies data into a CUDA array from non-array memory. * @@ -621,13 +729,41 @@ inline void copy(array_t& destination, const T* source) * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -inline void copy(T* destination, const array_t& source) +void copy(T *destination, const array_t& source) { - detail_::copy(destination, source); + detail_::copy_parameters_t params{}; + auto dims = source.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(source.dimensions()); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, source); + params.template set_endpoint(detail_::endpoint_t::destination, destination, dims); + params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto status = detail_::multidim_copy(source.context_handle(), params); + throw_if_error(status, "Copying from a CUDA array into a regular memory region"); } template -inline void copy(region_t destination, const array_t& source) +void copy(array_t destination, array_t source) +{ + detail_::copy_parameters_t params{}; + auto dims = source.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(source.dimensions()); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, source); + params.set_endpoint(detail_::endpoint_t::destination, destination); + params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto status = //(source.context() == destination.context()) ? + detail_::multidim_copy(source.context_handle(), params); + throw_if_error(status, "Copying from a CUDA array into a regular memory region"); +} + + +template +void copy(region_t destination, const array_t& source) { if (source.size_bytes() < destination.size()) { throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy"); @@ -644,7 +780,7 @@ inline void copy(region_t destination, const array_t& source) * device's global memory */ template -inline void copy_single(T* destination, const T* source) +void copy_single(T* destination, const T* source) { copy(destination, source, sizeof(T)); } @@ -654,7 +790,8 @@ namespace async { namespace detail_ { /** - * Asynchronously copies data between memory spaces or within a memory space. + * Asynchronously copies data between memory spaces or within a memory space, but + * within a single CUDA context. 
* * @note Since we assume Compute Capability >= 2.0, all devices support the * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, @@ -667,16 +804,16 @@ namespace detail_ { ///@{ /** -* @param destination A pointer to a memory region of size @p num_bytes, either in -* host memory or on any CUDA device's global memory -* @param source A pointer to a memory region of size @p num_bytes, either in -* host memory or on any CUDA device's global memory -* @param num_bytes number of bytes to copy from @p source + * @param destination A pointer to a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source A pointer to a memory region of size at least @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param num_bytes number of bytes to copy from @p source * @param stream_handle The handle of a stream on which to schedule the copy operation */ inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) { - auto result = cudaMemcpyAsync(destination, source, num_bytes, cudaMemcpyDefault, stream_handle); + auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string @@ -684,77 +821,103 @@ inline void copy(void* destination, const void* source, size_t num_bytes, stream } /** - * @param destination a memory region of size at least that of @p source, either - * in host memory or on any CUDA device's global memory - * @param source a memory region, either in host memory or on any CUDA device's - * global memory. + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory * @param stream_handle The handle of a stream on which to schedule the copy operation */ inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { - throw std::logic_error("Can't copy a large region into a smaller one"); + throw ::std::logic_error("Source size exceeds destination size"); } #endif copy(destination.start(), source.start(), source.size(), stream_handle); } ///@} -template -void copy(array_t& destination, const T* source, stream::handle_t stream_handle) +using memory::detail_::copy_parameters_t; + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<2> params, + stream::handle_t stream_handle) { - const auto copy_params = memory::detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3DAsync(©_params, stream_handle); - throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on " + stream::detail_::identify(stream_handle)); + // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters + // structure holds no information about contexts. 
+ return cuMemcpy2DAsync(¶ms, stream_handle); } -template -void copy(T* destination, const array_t& source, stream::handle_t stream_handle) +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<3> params, + stream::handle_t stream_handle) { - const auto copy_params = memory::detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3DAsync(©_params, stream_handle); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on " + stream::detail_::identify(stream_handle)); + if (params.srcContext == params.dstContext) { + using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; + auto* intra_context_params = reinterpret_cast(¶ms); + return cuMemcpy3DAsync(intra_context_params, stream_handle); + } + return cuMemcpy3DPeerAsync(¶ms, stream_handle); + } -template -void copy(array_t& destination, const T* source, stream::handle_t stream_handle) -{ - const auto dimensions = destination.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto source_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DToArrayAsync( - destination.get(), - offsets.width, - offsets.height, - source, - source_pitch, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault, - stream_handle); - throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on " + stream::detail_::identify(stream_handle)); +template +status_t multidim_copy_in_current_context(copy_parameters_t params, stream::handle_t stream_handle) { + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); } -template -void copy(T* destination, const array_t& source, cuda::stream::handle_t stream_handle) -{ - const auto dimensions = source.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto destination_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DFromArrayAsync( - destination, - destination_pitch, - source.get(), - offsets.width, - offsets.height, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault, - stream_handle); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on " + stream::detail_::identify(stream_handle)); +// Note: Assumes the stream handle is for a stream in the current context +template +status_t multidim_copy( + context::handle_t context_handle, + copy_parameters_t params, + stream::handle_t stream_handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + + +// Assumes the array and the stream share the same context, and that the destination is +// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
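+// For orientation - the detail_ functions here back the public, stream_t-taking overloads
+// defined further below; an illustrative sketch of a typical call (assuming `arr` is a
+// cuda::array_t<float, 2> and `stream` a cuda::stream_t in the same context):
+//
+//     std::vector<float> host_buf(arr.size());
+//     cuda::memory::region_t dst { host_buf.data(), host_buf.size() * sizeof(float) };
+//     cuda::memory::async::copy(dst, arr, stream);  // enqueued; completes with the stream
+//     stream.synchronize();
+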
+template +void copy(T *destination, const array_t& source, stream::handle_t stream_handle) +{ + using memory::detail_::endpoint_t; + auto dims = source.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + detail_::copy_parameters_t params{}; + params.template clear_offset(endpoint_t::source); + params.template clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.clear_rest(); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); + params.dstPitch = dims.width * sizeof(T); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); +} + + +template +void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) +{ + using memory::detail_::endpoint_t; + auto dims = destination.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + detail_::copy_parameters_t params{}; + params.template clear_offset(endpoint_t::source); + params.template clear_offset(endpoint_t::destination); + params.template set_extent(destination.dimensions()); + params.srcPitch = dims.width * sizeof(T); + params.clear_rest(); + params.set_endpoint(endpoint_t::source, const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } /** @@ -762,6 +925,9 @@ void copy(T* destination, const array_t& source, cuda::stream::handle_t st * * @note asynchronous version of @ref memory::copy_single * + * @note assumes the source and destination are all valid in the same context as that of the + * context handle + * * @param destination a value residing either in host memory or on any CUDA * device's global memory * @param source a value residing either in host memory or on any CUDA @@ -769,7 +935,7 @@ void copy(T* destination, const array_t& source, cuda::stream::handle_t st * @param stream_handle A stream on which to enqueue the copy operation */ template -inline void copy_single(T& destination, const T& source, stream::handle_t stream_handle) +void copy_single(T& destination, const T& source, stream::handle_t stream_handle) { copy(&destination, &source, sizeof(T), stream_handle); } @@ -785,13 +951,16 @@ inline void copy_single(T& destination, const T& source, stream::handle_t stream * * @note asynchronous version of @ref memory::copy * - * @param destination A pointer to a memory region of size @p num_bytes, - * either in host memory or on any CUDA device's global memory. - * @param source A pointer to a a memory region of size at least @p num_bytes, - * either in host memory or on any CUDA device's global memory + * @param destination A (pointer to) a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory. Must be defined in the same context + * as the stream. + * @param source A (pointer to) a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory. 
Must be defined in the same context + * as the stream * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ +///@{ void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream); inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream) @@ -824,15 +993,42 @@ inline void copy(region_t destination, const_region_t source, const stream_t& st copy(destination, source, source.size(), stream); } +/** + * @param source A plain array whose contents is to be copied. + */ +template +inline void copy(region_t destination, const T(&source)[N], const stream_t& stream) +{ +#ifndef NDEBUG + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); + } +#endif + return copy(destination.start(), source, N, stream); +} + +inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream) +{ +#ifndef NDEBUG + if (destination.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds destination size"); + } +#endif + return copy(destination.start(), source, num_bytes, stream); +} +///@} + /** * Asynchronously copies data from memory spaces into CUDA arrays. * - * @param destination A CUDA array (see @ref cuda::array_t ) + * @note asynchronous version of @ref memory::copy + * + * @param destination A CUDA array @ref cuda::array_t * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` * @param stream schedule the copy operation into this CUDA stream */ template -inline void copy(array_t& destination, const T* source, const stream_t& stream); +void copy(array_t& destination, const T* source, const stream_t& stream); template void copy(array_t& destination, const_region_t source, const stream_t& stream) @@ -864,7 +1060,7 @@ template void copy(region_t destination, const array_t& source, const stream_t& stream) { #ifndef NDEBUG - size_t required_size = destination.size() * sizeof(T); + size_t required_size = source.size() * sizeof(T); if (destination.size() < required_size) { throw ::std::invalid_argument( "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a " @@ -899,7 +1095,7 @@ namespace detail_ { inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle) { // TODO: Double-check that this call doesn't require setting the current device - auto result = cudaMemsetAsync(start, byte_value, num_bytes, stream_handle); + auto result = cuMemsetD8Async(address(start), byte_value, num_bytes, stream_handle); throw_if_error(result, "asynchronously memsetting an on-device buffer"); } @@ -908,7 +1104,6 @@ inline void set(region_t region, int byte_value, stream::handle_t stream_handle) set(region.start(), byte_value, region.size(), stream_handle); } - inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle) { set(start, 0, num_bytes, stream_handle); @@ -919,8 +1114,43 @@ inline void zero(region_t region, stream::handle_t stream_handle) zero(region.start(), region.size(), stream_handle); } +// TODO: Drop this in favor of -like functions under `cuda::~. 
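+// Illustrative sketch of the public API these detail_ functions back (the stream_t-taking
+// typed_set/set/zero declared right after this namespace); `dev_buf` is assumed to be a
+// region_t of device global memory and `stream` a cuda::stream_t usable with it:
+//
+//     auto words = static_cast<std::uint32_t*>(dev_buf.start());
+//     cuda::memory::device::async::typed_set(words, 0x01020304u, dev_buf.size() / 4, stream);
+//     cuda::memory::device::async::zero(dev_buf.start(), dev_buf.size(), stream);
+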
+template +inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle) +{ + static_assert(::std::is_trivially_copyable::value, "Non-trivially-copyable types cannot be used for setting memory"); + static_assert( + sizeof(T) == 1 or sizeof(T) == 2 or + sizeof(T) == 4 or sizeof(T) == 8, + "Unsupported type size - only sizes 1, 2 and 4 are supported"); + // TODO: Consider checking for alignment when compiling without NDEBUG + auto result {CUDA_SUCCESS}; + switch(sizeof(T)) { + case(1): result = cuMemsetD8Async (address(start), reinterpret_cast(value), num_elements, stream_handle); break; + case(2): result = cuMemsetD16Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; + case(4): result = cuMemsetD32Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; + } + throw_if_error(result, "Setting global device memory bytes"); +} + } // namespace detail_ + +/** + * @brief Sets consecutive elements of a region of memory to a fixed + * value of some width + * + * @note A generalization of `async::set()`, for different-size units. + * + * @tparam T An unsigned integer type of size 1, 2, 4 or 8 + * @param start The first location to set to @p value ; must be properly aligned. + * @param value A (properly aligned) value to set T-elements to. + * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes). + * @param stream The stream on which to enqueue the operation. + */ +template +void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream); + /** * Asynchronously sets all bytes in a stretch of memory to a single value * @@ -932,7 +1162,10 @@ inline void zero(region_t region, stream::handle_t stream_handle) * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream); +inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +{ + return typed_set(static_cast(start), byte_value, num_bytes, stream); +} /** * Similar to @ref set(), but sets the memory to zero rather than an arbitrary value @@ -945,7 +1178,8 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream); * * @note asynchronous version of @ref memory::zero * - * @param ptr a pointer to the value to be to zero + * @param ptr a pointer to the value to be to zero; must be valid in the + * CUDA context of @p stream * @param stream stream on which to schedule this action */ template @@ -956,8 +1190,166 @@ inline void zero(T* ptr, const stream_t& stream) } // namespace async + } // namespace device +namespace inter_context { + +namespace detail_ { + +inline void copy( + void * destination_address, + context::handle_t destination_context, + const void * source_address, + context::handle_t source_context, + size_t num_bytes) +{ + auto status = cuMemcpyPeer( + reinterpret_cast(destination_address), + destination_context, + reinterpret_cast(source_address), + source_context, num_bytes); + throw_if_error(status, + ::std::string("Failed copying data between devices: From address ") + + cuda::detail_::ptr_as_hex(source_address) + " in " + + context::detail_::identify(source_context) + " to address " + + cuda::detail_::ptr_as_hex(destination_address) + " in " + + context::detail_::identify(destination_context) ); +} + +} // namespace detail_ + +void copy( + void * destination, + const context_t& destination_context, 
+ const void * source_address, + const context_t& source_context, + size_t num_bytes); + +inline void copy( + void * destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context) +{ + copy(destination, destination_context, source.start(), source_context, source.size()); +} + +inline void copy( + region_t destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context) +{ +#ifndef NDEBUG + if (destination.size() < destination.size()) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); + } +#endif + copy(destination.start(), destination_context, source, source_context); +} + +template +inline void copy( + array_t destination, + array_t source) +{ + // for arrays, a single mechanism handles both intra- and inter-context copying + return memory::copy(destination, source); +} + +namespace async { + +namespace detail_ { + +inline void copy( + void *destination, + context::handle_t destination_context_handle, + const void *source, + context::handle_t source_context_handle, + size_t num_bytes, + stream::handle_t stream_handle) +{ + auto result = cuMemcpyPeerAsync( + device::address(destination), + destination_context_handle, + device::address(source), + source_context_handle, + num_bytes, stream_handle); + + // TODO: Determine whether it was from host to device, device to host etc and + // add this information to the error string + throw_if_error(result, "Scheduling an inter-context memory copy from " + + context::detail_::identify(source_context_handle) + " to " + + context::detail_::identify(destination_context_handle) + " on " + + stream::detail_::identify(stream_handle)); +} + +/** + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param stream_handle The handle of a stream on which to schedule the copy operation + */ +inline void copy( + region_t destination, + context::handle_t destination_context_handle, + const_region_t source, + context::handle_t source_context_handle, + stream::handle_t stream_handle) +{ +#ifndef NDEBUG + if (destination.size() < source.size()) { + throw ::std::logic_error("Can't copy a large region into a smaller one"); + } +#endif + copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), + stream_handle); +} +///@} + +} // namespace detail_ + +void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes, + stream_t stream); + +void copy( + void * destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream); + +inline void copy( + region_t destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream); + +template +inline void copy( + array_t destination, + array_t source, + const stream_t& stream) +{ + // for arrays, a single mechanism handles both intra- and inter-context copying + return memory::async::copy(destination, source, stream); +} + + +} // namespace async + +} // namespace inter_context + /** * @namespace host * Host-side (= system) memory which is "pinned", i.e. 
resides in @@ -968,12 +1360,13 @@ namespace host { /** * Allocate pinned host memory * - * @note "Pinned" memory is excepted from virtual memory swapping-out, - * and is allocated in contiguous physical RAM addresses, making it - * possible to copy to and from it to the the GPU using DMA without - * assistance from the GPU. Typically for PCIe 3.0, the effective - * bandwidth is twice as fast as copying from or to naively-allocated - * host memory. + * @note This function will fail if + * + * @note "Pinned" memory is allocated in contiguous physical RAM + * addresses, making it possible to copy to and from it to the the + * GPU using DMA without assistance from the GPU. This improves + * the copying bandwidth significantly over naively-allocated + * host memory, and reduces overhead for the CPU. * * @throws cuda::runtime_error if allocation fails for any reason * @@ -985,20 +1378,10 @@ namespace host { * * @return a pointer to the allocated stretch of memory */ -inline void* allocate( +void* allocate( size_t size_in_bytes, - allocation_options options) -{ - void* allocated = nullptr; - auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options); - auto result = cudaHostAlloc(&allocated, size_in_bytes, flags); - if (is_success(result) && allocated == nullptr) { - // Can this even happen? hopefully not - result = cudaErrorUnknown; - } - throw_if_error(result, "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of host memory"); - return allocated; -} + allocation_options options); + inline void* allocate( size_t size_in_bytes, @@ -1018,14 +1401,14 @@ inline void* allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) */ inline void free(void* host_ptr) { - auto result = cudaFreeHost(host_ptr); - throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail_::ptr_as_hex(host_ptr)); + auto result = cuMemFreeHost(host_ptr); + throw_if_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } namespace detail_ { struct allocator { - void* operator()(size_t size_in_bytes) const { return cuda::memory::host::allocate(size_in_bytes); } + void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } @@ -1036,8 +1419,8 @@ struct deleter { * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by - * flags. This memory range also is added to the same tracking mechanism as cudaHostAlloc() to - * automatically accelerate calls to functions such as cudaMemcpy(). + * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to + * automatically accelerate calls to functions such as cuMemcpy(). * * @param ptr A pre-allocated stretch of host memory * @param size the size in bytes the memory region to register/pin @@ -1045,7 +1428,7 @@ struct deleter { */ inline void register_(const void *ptr, size_t size, unsigned flags) { - auto result = cudaHostRegister(const_cast(ptr), size, flags); + auto result = cuMemHostRegister(const_cast(ptr), size, flags); throw_if_error(result, "Could not register and page-lock the region of " + ::std::to_string(size) + " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr)); @@ -1097,9 +1480,9 @@ inline void register_(const void *ptr, size_t size, { detail_::register_( ptr, size, - (register_mapped_io_space ? 
cudaHostRegisterIoMemory : 0) - | (map_into_device_space ? cudaHostRegisterMapped : 0) - | (make_device_side_accesible_to_all ? cudaHostRegisterPortable : 0) + (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0) + | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0) + | (make_device_side_accesible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0) ); } @@ -1120,7 +1503,8 @@ inline void register_( inline void register_(void const *ptr, size_t size) { - detail_::register_(ptr, size, cudaHostRegisterDefault); + unsigned no_flags_set { 0 }; + detail_::register_(ptr, size, no_flags_set); } inline void register_(const_region_t region) @@ -1133,7 +1517,7 @@ inline void register_(const_region_t region) // just ended inline void deregister(const void *ptr) { - auto result = cudaHostUnregister(const_cast(ptr)); + auto result = cuMemHostUnregister(const_cast(ptr)); throw_if_error(result, "Could not unregister the memory segment starting at address *a"); } @@ -1149,9 +1533,9 @@ inline void deregister(const_region_t region) * @note a wrapper for @ref ::std::memset * * @param start starting address of the memory region to set, - * in host memory; can be either CUDA-allocated or otherwise. + * in host memory; can be either CUDA-allocated or otherwise. * @param byte_value value to set the memory region to - * @param num_bytes number of bytes at @p address to be set + * @param num_bytes size of the memory region in bytes */ inline void set(void* start, int byte_value, size_t num_bytes) { @@ -1195,11 +1579,13 @@ struct const_region_t; namespace detail_ { +using advice_t = CUmem_advise; + template -inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute); +inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute); -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise advice, cuda::device::id_t device_id); -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute); +inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id); +// inline void advise(managed::const_region_t region, advice_t attribute); template struct base_region_t : public memory::detail_::base_region_t { @@ -1208,17 +1594,17 @@ struct base_region_t : public memory::detail_::base_region_t { bool is_read_mostly() const { - return get_scalar_range_attribute(*this, cudaMemRangeAttributeReadMostly); + return get_scalar_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } void designate_read_mostly() const { - set_scalar_range_attribute(*this, cudaMemAdviseSetReadMostly); + set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } void undesignate_read_mostly() const { - detail_::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); + unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } device_t preferred_location() const; @@ -1244,69 +1630,89 @@ void advise_expected_access_by(managed::const_region_t region, device_t& device) void advise_no_access_expected_by(managed::const_region_t region, device_t& device); template > -typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); + typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); namespace detail_ { template -inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute) +inline T 
get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute) { uint32_t attribute_value { 0 }; - auto result = cudaMemRangeGetAttribute( - &attribute_value, sizeof(attribute_value), attribute, region.start(), region.size()); + auto result = cuMemRangeGetAttribute( + &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size()); throw_if_error(result, "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); return static_cast(attribute_value); } -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise advice, cuda::device::id_t device_id) +// CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's +// not called cuMemRangeSetAttribute, and uses a different enum. +inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id) { - auto result = cudaMemAdvise(region.start(), region.size(), advice, device_id); - throw_if_error(result, - "Setting an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); + auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id); + throw_if_error(result, "Setting an attribute for a managed memory range at " + + cuda::detail_::ptr_as_hex(region.start())); } -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute) +// inline void set_range_attribute(managed::const_region_t region, range_attribute_t attribute, cuda::device::handle_t device_id) + +inline advice_t as_advice(range_attribute_t attribute, bool set) { - cuda::device::id_t ignored_device_index{}; - set_scalar_range_attribute(region, attribute, ignored_device_index); + switch (attribute) { + case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: + return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY; + case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: + return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION; + case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: + return set ? 
CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY; + default: + throw std::invalid_argument( + "CUDA memory range attribute does not correspond to any range advice value"); + } } -} // namespace detail_ +inline void set_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute, cuda::device::id_t device_id) +{ + constexpr const bool set { true }; + advise(region, as_advice(settable_attribute, set), device_id); +} +inline void unset_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute) +{ + constexpr const bool unset { false }; + constexpr const cuda::device::id_t dummy_device_id { 0 }; + advise(region, as_advice(settable_attribute, unset), dummy_device_id); +} -enum class initial_visibility_t { - to_all_devices, - to_supporters_of_concurrent_managed_access, -}; +} // namespace detail_ -enum class attachment_t { - global = cudaMemAttachGlobal, - host = cudaMemAttachHost, - single_stream = cudaMemAttachSingle, -}; +enum class attachment_t : unsigned { + global = CU_MEM_ATTACH_GLOBAL, + host = CU_MEM_ATTACH_HOST, + single_stream = CU_MEM_ATTACH_SINGLE, + }; namespace detail_ { -inline region_t allocate( - size_t size_in_bytes, +inline region_t allocate_in_current_context( + size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) { - void* allocated = nullptr; + device::address_t allocated = 0; auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ? - cudaMemAttachGlobal : cudaMemAttachHost; + attachment_t::global : attachment_t::host; // Note: Despite the templating by T, the size is still in bytes, // not in number of T's - auto status = cudaMallocManaged(&allocated, size_in_bytes, flags); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAllocManaged(&allocated, num_bytes, (unsigned) flags); + if (is_success(status) && allocated == 0) { // Can this even happen? hopefully not status = (status_t) status::unknown; } - throw_if_error(status, - "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of managed CUDA memory"); - return {allocated, size_in_bytes}; + throw_if_error(status, "Failed allocating " + + ::std::to_string(num_bytes) + " bytes of managed CUDA memory"); + return {as_pointer(allocated), num_bytes}; } /** @@ -1315,7 +1721,7 @@ inline region_t allocate( ///@{ inline void free(void* ptr) { - auto result = cudaFree(ptr); + auto result = cuMemFree(device::address(ptr)); throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) @@ -1326,44 +1732,74 @@ inline void free(region_t region) template struct allocator { - // Allocates on the current device! - void* operator()(size_t size_in_bytes) const + // Allocates in the current context! 
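// A short usage sketch (not part of the header itself) for the managed-memory
// allocation API declared in this namespace: allocate a region for a device,
// touch it from the host through the common address, and free it. The umbrella
// include <cuda/api.hpp> and the buffer size are assumptions.
#include <cuda/api.hpp>

void managed_allocation_sketch()
{
	auto device = cuda::device::current::get();
	// initial visibility defaults to initial_visibility_t::to_all_devices
	auto region = cuda::memory::managed::allocate(device, 1024 * sizeof(float));
	auto* data = static_cast<float*>(region.start());
	data[0] = 1.0f; // host-side access through the same address a kernel would use
	cuda::memory::managed::free(region);
}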
+ void* operator()(size_t num_bytes) const { - return detail_::allocate(size_in_bytes, InitialVisibility).start(); + return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start(); } }; + struct deleter { - void operator()(void* ptr) const { cuda::memory::device::free(ptr); } + void operator()(void* ptr) const { memory::device::free(ptr); } }; inline region_t allocate( - cuda::device::id_t device_id, - size_t size_in_bytes, + context::handle_t context_handle, + size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail_::allocate(size_in_bytes, initial_visibility); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return allocate_in_current_context(num_bytes, initial_visibility); } } // namespace detail_ +/** + * @brief Allocate a a region of managed memory, accessible with the same + * address on the host and on CUDA devices. + * + * @param context the initial context which is likely to access the managed + * memory region (and which will certainly have the region actually allocated + * for it) + * @param num_bytes size of each of the regions of memory to allocate + * @param initial_visibility will the allocated region be visible, using the + * common address, to all CUDA device (= more overhead, more work for the CUDA + * runtime) or just to those devices with some hardware features to assist in + * this task (= less overhead)? + */ +inline region_t allocate( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + /** * @brief Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed - * memory region (and which will certainly have actually allocated for it) - * @param size_in_bytes size of each of the regions of memory to allocate + * memory region (and which will certainly have the region actually allocated + * for it) + * @param num_bytes size of each of the regions of memory to allocate * @param initial_visibility will the allocated region be visible, using the * common address, to all CUDA device (= more overhead, more work for the CUDA * runtime) or just to those devices with some hardware features to assist in * this task (= less overhead)? */ -region_t allocate( - cuda::device_t device, - size_t size_in_bytes, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices -); +inline region_t allocate( + device_t device, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a a region of managed memory, accessible with the same + * address on the host and on all CUDA devices. + * + * @note While the allocated memory should be available universally, the + * allocation itself does require some GPU context. This will be the current + * context, if one exists, or the primary context on the runtime-defined current + * device. 
+ */ +region_t allocate(size_t num_bytes); /** * Free a managed memory region (host-side and device-side regions on all devices @@ -1372,7 +1808,7 @@ region_t allocate( */ inline void free(void* managed_ptr) { - auto result = cudaFree(managed_ptr); + auto result = cuMemFree(device::address(managed_ptr)); throw_if_error(result, "Freeing managed memory (host and device regions) at address 0x" + cuda::detail_::ptr_as_hex(managed_ptr)); @@ -1385,23 +1821,26 @@ inline void free(region_t region) namespace advice { -enum device_inspecific_kind_t { - read_mostly = cudaMemAdviseSetReadMostly, +enum kind_t { + read_mostly = CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + preferred_location = CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + accessor = CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, + // Note: CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION is never set }; -enum device_specific_kind_t { - preferred_location, - accessor, -}; +namespace detail_ { -inline void set(const_region_t region, device_inspecific_kind_t advice) +inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id) { - cuda::device::id_t ignored_device_index{}; - auto result = cudaMemAdvise(region.start(), region.size(), (cudaMemoryAdvise) advice, ignored_device_index); - throw_if_error(result, - "Setting advice on a (managed) memory region at" + cuda::detail_::ptr_as_hex(region.start())); + auto result = cuMemAdvise(device::address(region.start()), region.size(), (managed::detail_::advice_t) advice, device_id); + throw_if_error(result, "Setting advice on a (managed) memory region at" + + cuda::detail_::ptr_as_hex(region.start()) + " w.r.t. " + cuda::device::detail_::identify(device_id)); } +} // namespace detail_ + +void set(const_region_t region, kind_t advice, const device_t& device); + } // namespace advice namespace async { @@ -1411,12 +1850,13 @@ namespace detail_ { inline void prefetch( const_region_t region, cuda::device::id_t destination, - stream::handle_t stream_handle) + stream::handle_t source_stream_handle) { - auto result = cudaMemPrefetchAsync(region.start(), region.size(), destination, stream_handle); + auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle); throw_if_error(result, "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address " - + cuda::detail_::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination)); + + cuda::detail_::ptr_as_hex(region.start()) + " to " + ( + (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) ); } } // namespace detail_ @@ -1427,27 +1867,17 @@ inline void prefetch( * devices. */ void prefetch( - const_region_t region, - cuda::device_t destination, - const stream_t& stream); + const_region_t region, + const cuda::device_t& destination, + const stream_t& stream); /** * @brief Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ -inline void prefetch_to_host(const_region_t managed_region) -{ - auto result = cudaMemPrefetchAsync( - managed_region.start(), - managed_region.size(), - cudaCpuDeviceId, - stream::default_stream_handle); - // The stream handle will be ignored by the CUDA runtime API when this pseudo - // device indicator is used. 
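// A hedged sketch of using the advice and prefetch wrappers declared above:
// mark a managed region as read-mostly, then prefetch it to a device over that
// device's default stream. Assumes region_t converts to const_region_t and that
// <cuda/api.hpp> pulls in the relevant headers.
#include <cuda/api.hpp>

void managed_advice_and_prefetch_sketch()
{
	auto device = cuda::device::current::get();
	auto region = cuda::memory::managed::allocate(device, 4096);
	cuda::memory::managed::advice::set(region, cuda::memory::managed::advice::read_mostly, device);
	cuda::memory::managed::async::prefetch(region, device, device.default_stream());
	cuda::synchronize(device); // ensure the prefetch has completed before freeing
	cuda::memory::managed::free(region);
}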
- throw_if_error(result, - "Prefetching " + ::std::to_string(managed_region.size()) + " bytes of managed memory at address " - + cuda::detail_::ptr_as_hex(managed_region.start()) + " into host memory"); -} +void prefetch_to_host( + const_region_t region, + const stream_t& stream); } // namespace async @@ -1462,107 +1892,95 @@ namespace mapped { template inline T* device_side_pointer_for(T* host_memory_ptr) { - T* device_side_ptr; + device::address_t device_side_ptr; auto get_device_pointer_flags = 0u; // see the CUDA runtime documentation - auto status = cudaHostGetDevicePointer( + auto status = cuMemHostGetDevicePointer( &device_side_ptr, host_memory_ptr, get_device_pointer_flags); throw_if_error(status, "Failed obtaining the device-side pointer for host-memory pointer " + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); - return device_side_ptr; + return as_pointer(device_side_ptr); } namespace detail_ { /** - * Allocates a mapped pair of memory regions - on the current device - * and in host memory. + * Allocates a mapped pair of memory regions - in the current + * context and in host and device memory. * * @param size_in_bytes size of each of the two regions, in bytes. * @param options indication of how the CUDA driver will manage * the region pair * @return the allocated pair (with both regions being non-null) */ -inline region_pair allocate( +inline region_pair allocate_in_current_context( + context::handle_t current_context_handle, size_t size_in_bytes, allocation_options options) { - region_pair allocated; + region_pair allocated {}; + // The default initialization is unnecessary, but let's play it safe allocated.size_in_bytes = size_in_bytes; - auto flags = cudaHostAllocMapped & + auto flags = CU_MEMHOSTALLOC_DEVICEMAP & cuda::memory::detail_::make_cuda_host_alloc_flags(options); - // Note: the typed cudaHostAlloc also takes its size in bytes, apparently, - // not in number of elements - auto status = cudaHostAlloc(&allocated.host_side, size_in_bytes, flags); + auto status = cuMemHostAlloc(&allocated.host_side, size_in_bytes, flags); if (is_success(status) && (allocated.host_side == nullptr)) { // Can this even happen? hopefully not - status = cudaErrorUnknown; + status = (status_t) status::named_t::unknown; } throw_if_error(status, "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes) - + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail_::get_id())); + + " bytes of global memory in " + context::detail_::identify(current_context_handle)); allocated.device_side = device_side_pointer_for(allocated.host_side); return allocated; } -/** - * Allocates a mapped pair of memory regions - on a CUDA device - * and in host memory. - * - * @param device_id The device on which to allocate the device-side region - * @param size_in_bytes size of each of the two regions, in bytes. 
- * @param options indication of how the CUDA driver will manage - * the region pair - * @return the allocated pair (with both regions being non-null) - */ inline region_pair allocate( - cuda::device::id_t device_id, + context::handle_t context_handle, size_t size_in_bytes, allocation_options options) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail_::allocate(size_in_bytes, options); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return detail_::allocate_in_current_context(context_handle, size_in_bytes, options); +} + +inline void free(void* host_side_pair) +{ + auto result = cuMemFreeHost(host_side_pair); + throw_if_error(result, "Freeing a mapped memory region pair with host-side address " + + cuda::detail_::ptr_as_hex(host_side_pair)); } } // namespace detail_ /** - * Allocate a pair of memory regions, on the host and on the device, mapped to each other so - * that changes to one will be reflected in the other. + * Allocate a memory region on the host, which is also mapped to a memory region in + * a context of some CUDA device - so that changes to one will be reflected in the other. * - * @param device The device on which the device-side region in the pair will be allocated + * @param context The device context in which the device-side region in the pair will be + * allocated. * @param size_in_bytes amount of memory to allocate (in each of the regions) * @param options see @ref allocation_options */ region_pair allocate( - cuda::device_t& device, + cuda::context_t& context, size_t size_in_bytes, allocation_options options); /** - * @brief A variant of @ref allocate facilitating only specifying some of the allocation options - */ -inline region_pair allocate( - cuda::device_t& device, - size_t size_in_bytes, - portability_across_contexts portability = portability_across_contexts(false), - cpu_write_combining cpu_wc = cpu_write_combining(false)) -{ - return allocate(device, size_in_bytes, allocation_options{ portability, cpu_wc } ); -} - -/** - * @brief A variant of @ref allocate facilitating only specifying some of the allocation options + * Allocate a memory region on the host, which is also mapped to a memory region in + * the global memory of a CUDA device - so that changes to one will be reflected in the other. + * + * @param device The device on which the device-side region in the pair will be allocated + * @param size_in_bytes amount of memory to allocate (in each of the regions) + * @param options see @ref allocation_options */ -inline region_pair allocate( +region_pair allocate( cuda::device_t& device, size_t size_in_bytes, - cpu_write_combining cpu_wc) -{ - return allocate(device, size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); -} + allocation_options options = allocation_options{}); /** @@ -1573,8 +1991,7 @@ inline region_pair allocate( */ inline void free(region_pair pair) { - auto result = cudaFreeHost(pair.host_side); - throw_if_error(result, "Could not free mapped memory region pair."); + detail_::free(pair.host_side); } /** @@ -1585,15 +2002,21 @@ inline void free(region_pair pair) */ inline void free_region_pair_of(void* ptr) { - auto wrapped_ptr = pointer_t { ptr }; - auto result = cudaFreeHost(wrapped_ptr.get_for_host()); - throw_if_error(result, "Could not free mapped memory region pair."); + // TODO: What if the pointer is not part of a mapped region pair? + // We could check this... 
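// A brief usage sketch for the mapped region-pair API declared above: allocate
// a pinned host buffer mapped into device address space, write through the
// host-side pointer, and free the pair. The include path is an assumption.
#include <cuda/api.hpp>

void mapped_region_pair_sketch()
{
	auto device = cuda::device::current::get();
	auto pair = cuda::memory::mapped::allocate(device, 4096);
	static_cast<unsigned char*>(pair.host_side)[0] = 0xFF;
	// ... a kernel may access the same bytes through pair.device_side ...
	cuda::memory::mapped::free(pair);
}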
+ void* host_side_ptr; + auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr)); + throw_if_error(status, "Failed obtaining the host-side address of supposedly-device-side pointer " + + cuda::detail_::ptr_as_hex(ptr)); + detail_::free(host_side_ptr); } /** * Determine whether a given stretch of memory was allocated as part of * a mapped pair of host and device memory regions * + * @todo What if it's a managed pointer? + * * @param ptr the beginning of a memory region - in either host or device * memory - to check * @return `true` iff the region was allocated as one side of a mapped @@ -1609,24 +2032,25 @@ inline bool is_part_of_a_region_pair(const void* ptr) } // namespace memory - namespace symbol { /** * Locates a CUDA symbol in global or constant device memory * + * @note `symbol_t` symbols are associated with the primary context + * * @return The region of memory CUDA associates with the symbol */ -inline memory::region_t locate(symbol_t symbol) +template +inline memory::region_t locate(T&& symbol) { void *start; size_t symbol_size; - auto api_call_result = cudaGetSymbolAddress(&start, symbol.handle); - throw_if_error(api_call_result, - "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); - api_call_result = cudaGetSymbolSize(&symbol_size, symbol.handle); - throw_if_error(api_call_result, - "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); - return {start, symbol_size}; + auto api_call_result = cudaGetSymbolAddress(&start, std::forward(symbol)); + throw_if_error(api_call_result, "Could not locate the device memory address for a symbol"); + api_call_result = cudaGetSymbolSize(&symbol_size, std::forward(symbol)); + throw_if_error(api_call_result, "Could not locate the device memory address for the symbol at address" + + cuda::detail_::ptr_as_hex(start)); + return { start, symbol_size }; } } // namespace symbol diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index dfaea734..05d4cd0c 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -8,28 +8,37 @@ #ifndef CUDA_API_WRAPPERS_MISCELLANY_HPP_ #define CUDA_API_WRAPPERS_MISCELLANY_HPP_ -#include -#include +#include #include +#include + +#include +#include +#include namespace cuda { /** - * @brief Ensures the CUDA runtime has fully initialized + * Obtains the CUDA Runtime version * - * @note The CUDA runtime uses lazy initialization, so that until you perform - * certain actions, the CUDA driver is not used to create a context, nothing - * is done on the device etc. This function forces this initialization to - * happen immediately, while not having any other effect. + * @note unlike {@ref maximum_supported_by_driver()}, 0 cannot be returned, + * as we are actually using the runtime to obtain the version, so it does + * have _some_ version. 
*/ -inline -void force_runtime_initialization() +inline void initialize_driver() { + constexpr const unsigned dummy_flags { 0 }; // this is the only allowed value for flags + auto status = cuInit(dummy_flags); + throw_if_error(status, "Failed initializing the CUDA driver"); +} + +inline void ensure_driver_is_initialized() { - // nVIDIA's Robin Thoni (https://www.rthoni.com/) guarantees - // the following code "does the trick" - auto status = cudaFree(nullptr); - throw_if_error(status, "Forcing CUDA runtime initialization"); + thread_local bool driver_known_to_be_initialized { false }; + if (not driver_known_to_be_initialized) { + initialize_driver(); + driver_known_to_be_initialized = true; + } } namespace device { @@ -46,15 +55,17 @@ namespace device { * @return the number of CUDA devices on this system * @throws cuda::error if the device count could not be obtained */ -inline device::id_t count() +inline device::id_t count() { + initialize_driver(); + // This function is often called before any device is obtained (which is where we + // expect the driver to be initialized) int device_count = 0; // Initializing, just to be on the safe side - status_t result = cudaGetDeviceCount(&device_count); - if (result == status::no_device) { - return 0; - } - else { - throw_if_error(result, "Failed obtaining the number of CUDA devices on the system"); + status_t result = cuDeviceGetCount(&device_count); + switch(result) { + case status::no_device: return 0; + case status::success: break; + default: throw runtime_error(result, "Failed obtaining the number of CUDA devices on the system"); } if (device_count < 0) { throw ::std::logic_error("cudaGetDeviceCount() reports an invalid number of CUDA devices"); diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp new file mode 100644 index 00000000..69b9c070 --- /dev/null +++ b/src/cuda/api/module.hpp @@ -0,0 +1,355 @@ +/** + * @file module.hpp + * + * @brief Wrappers for working with modules of compiled CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_MODULE_HPP_ +#define CUDA_API_WRAPPERS_MODULE_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if __cplusplus >= 201703L +#include +#endif + +namespace cuda { + +///@cond +class device_t; +class context_t; +class module_t; +class kernel_t; +///@endcond + +namespace module { + +namespace detail_ { + +inline module_t construct( + device::id_t device_id, + context::handle_t context_handle, + handle_t handle, + link::options_t options, + bool take_ownership = false, + bool hold_primary_context_reference = false) noexcept; + +inline ::std::string identify(const module::handle_t &handle) +{ + return std::string("module ") + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(const module::handle_t &handle, context::handle_t context_handle) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(const module::handle_t &handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} + +::std::string identify(const module_t &module); + +} // namespace detail_ + +/** + * Load a module from an appropriate compiled or semi-compiled file, allocating all + * relevant resources for it. + * + * @param path of a cubin, PTX, or fatbin file constituting the module to be loaded. 
+ * @return the loaded module + * + * @note this covers cuModuleLoadFatBinary() even though that's not directly used + */ +module_t load_from_file(const char *path, link::options_t link_options = {}); + +module_t load_from_file(const ::std::string &path, link::options_t link_options = {}); + +#if __cplusplus >= 201703L +module_t load_from_file(const ::std::filesystem::path& path, link::options_t options = {}); +#endif + +/** + * Create a CUDA driver module from raw module image data. + * + * @param[inout] context The CUDA context into which the module data will be loaded (and + * in which the module contents may be used) + * @parem[in + * The pointer may be obtained by mapping a cubin or PTX or fatbin file, passing a cubin or PTX or fatbin file as a NULL-terminated text string, or incorporating a cubin or fatbin object into the executable resources and using operating system calls such as Windows FindResource() to obtain the pointer. + */ +///@{ +module_t create(context_t context, const void* module_data, link::options_t link_options); +module_t create(context_t context, const void* module_data); +module_t create(device_t device, const void* module_data, link::options_t link_options); +module_t create(device_t device, const void* module_data); +template +module_t create(context_t context, ContiguousContainer module_data); +///@} + +} // namespace module + +/** + * Wrapper class for a CUDA code module + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the module is a const-respecting operation on this class. + */ +class module_t { + +public: // getters + + module::handle_t handle() const { return handle_; } + context::handle_t context_handle() const { return context_handle_; } + device::id_t device_id() const { return device_id_; } + context_t context() const; + device_t device() const; + + // These API calls are not really the way you want to work. + cuda::kernel_t get_kernel(const char* name) const + { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + kernel::handle_t kernel_function_handle; + auto result = cuModuleGetFunction(&kernel_function_handle, handle_, name); + throw_if_error(result, ::std::string("Failed obtaining function ") + name + + " from " + module::detail_::identify(*this)); + return kernel::detail_::wrap( + context::detail_::get_device_id(context_handle_), context_handle_, kernel_function_handle); + } + + cuda::memory::region_t get_global_object(const char* name) const; + + // TODO: Implement a surface reference and texture reference class rather than these raw pointers. 
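// A hedged usage sketch for the module-loading functions declared above: load a
// compiled module from a file and obtain one of its kernels by name. The file
// name and kernel name are hypothetical; a C++ kernel would need its mangled
// name (or to be declared extern "C"), and a current context is assumed to exist.
#include <cuda/api.hpp>

void module_from_file_sketch()
{
	auto module = cuda::module::load_from_file("my_kernels.cubin");
	auto kernel = module.get_kernel("my_kernel");
	// ... the kernel may now be launched on a stream within the same context ...
}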
+ + CUsurfref* get_surface(const char* name) const; + CUtexref* get_texture_reference(const char* name) const; + +protected: // constructors + + module_t( + device::id_t device_id, + context::handle_t context, + module::handle_t handle, + link::options_t options, + bool owning, + bool hold_primary_context_reference) +#ifdef NDEBUG + noexcept +#endif + : device_id_(device_id), context_handle_(context), handle_(handle), options_(options), owning_(owning), + holds_primary_context_refcount_unit_(hold_primary_context_reference) + { +#ifndef NDEBUG + if (not owning and hold_primary_context_reference) { + throw std::invalid_argument("A non-owning module proxy should not try to hold its own primary context refcount unit"); + } + if (hold_primary_context_reference and not context::detail_::is_primary(context_handle_)) + { + throw std::invalid_argument("A module in a non-primary context should not presume to hold a primary context refcount unit"); + } +#endif + if (owning and hold_primary_context_reference) { + device::primary_context::detail_::increase_refcount(device_id); + } + } + + module_t(device::id_t device_id, context::handle_t context, module::handle_t handle, link::options_t options, bool owning) noexcept + : module_t(device_id, context, handle, options, owning, false) + { } + +public: // friendship + + friend module_t module::detail_::construct(device::id_t, context::handle_t, module::handle_t, link::options_t, bool, bool) noexcept; + + +public: // constructors and destructor + + module_t(const module_t&) = delete; + + module_t(module_t&& other) noexcept : + module_t(other.device_id_, other.context_handle_, other.handle_, other.options_, other.owning_) + { + other.owning_ = false; + }; + + ~module_t() + { + if (owning_) { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + auto status = cuModuleUnload(handle_); + throw_if_error(status, "Failed unloading " + module::detail_::identify(*this)); + + if (holds_primary_context_refcount_unit_) { + device::primary_context::detail_::decrease_refcount(device_id_); + } + } + } + +public: // operators + + module_t& operator=(const module_t& other) = delete; + module_t& operator=(module_t&& other) = delete; + +protected: // data members + device::id_t device_id_; + context::handle_t context_handle_; + module::handle_t handle_; + link::options_t options_; + bool owning_; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered + bool holds_primary_context_refcount_unit_; +}; + +namespace module { + +using handle_t = CUmodule; + +/** +* Loads a populated module from a file on disk +* +* @param path Filesystem path of a fatbin, cubin or PTX file +* +* @todo: Do we really need link options here? 
+ * @todo: Make this take a context_t; and consider adding load_module methods to context_t +*/ +inline module_t load_from_file(const char* path, link::options_t link_options) +{ + handle_t new_module_handle; + auto status = cuModuleLoad(&new_module_handle, path); + throw_if_error(status, ::std::string("Failed loading a module from file ") + path); + bool do_take_ownership { true }; + auto current_context_handle = context::current::detail_::get_handle(); + auto current_device_id = context::detail_::get_device_id(current_context_handle); + return detail_::construct(current_device_id, current_context_handle, new_module_handle, link_options, + do_take_ownership); +} + +inline module_t load_from_file(const ::std::string& path, link::options_t link_options) +{ + return load_from_file(path.c_str(), link_options); +} + +#if __cplusplus >= 201703L +inline module_t load_from_file(const ::std::filesystem::path& path) +{ + return load_from_file(path.c_str()); +} +#endif + +namespace detail_ { + +// This might have been called "wrap", if we had not needed to take care +// of primary context reference counting +inline module_t construct( + device::id_t device_id, + context::handle_t context_handle, + handle_t module_handle, + link::options_t options, + bool take_ownership, + bool hold_primary_context_reference) noexcept +{ + return module_t{device_id, context_handle, module_handle, options, take_ownership, hold_primary_context_reference}; +} + +template +inline module_t create(const context_t& context, const void* module_data, Creator creator_function, bool hold_pc_reference) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + handle_t new_module_handle; + auto status = creator_function(new_module_handle, module_data); + throw_if_error(status, ::std::string( + "Failed loading a module from memory location ") + cuda::detail_::ptr_as_hex(module_data) + + "within " + context::detail_::identify(context)); + bool do_take_ownership { true }; + // TODO: Make sure the default-constructed options correspond to what cuModuleLoadData uses as defaults + return detail_::construct(context.device_id(), context.handle(), new_module_handle, + link::options_t{}, do_take_ownership, hold_pc_reference); +} + +// TODO: Consider adding create_module() methods to context_t +inline module_t create(const context_t& context, const void* module_data, const link::options_t& link_options, bool hold_pc_reference) +{ + auto creator_function = + [&link_options](handle_t& new_module_handle, const void* module_data) { + auto marshalled_options = link_options.marshal(); + return cuModuleLoadDataEx( + &new_module_handle, + module_data, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + }; + return detail_::create(context, module_data, creator_function, hold_pc_reference); +} + +inline module_t create(const context_t& context, const void* module_data, bool hold_pc_reference) +{ + auto creator_function = + [](handle_t& new_module_handle, const void* module_data) { + return cuModuleLoadData(&new_module_handle, module_data); + }; + return detail_::create(context, module_data, creator_function, hold_pc_reference); +} + +} // namespace detail_ + +// TODO: Use an optional to reduce the number of functions here... when the +// library starts requiring C++14. 
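// A hedged sketch of creating a module from an in-memory image (e.g. PTX text
// produced at run time) using the create() overloads declared in this namespace.
// The ptx_image argument and kernel name are hypothetical placeholders.
#include <cuda/api.hpp>

void module_from_image_sketch(const void* ptx_image)
{
	auto device = cuda::device::current::get();
	auto module = cuda::module::create(device, ptx_image);
	auto kernel = module.get_kernel("my_kernel"); // assumes an extern "C" kernel of this name
}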
+ +inline module_t create(context_t context, const void* module_data) +{ + return detail_::create(context, module_data, false); +} + +inline module_t create(context_t context, const void* module_data, link::options_t link_options) +{ + return detail_::create(context, module_data, link_options, false); +} + +inline module_t create(device::primary_context_t primary_context, const void* module_data) +{ +#ifndef NDEBUG + if (module_data == nullptr) { + throw std::invalid_argument("Attempt to create a module with a null pointer for its data"); + } +#endif + constexpr const bool do_hold_primary_context_reference { true }; + const context_t& context = primary_context; + return detail_::create(context, module_data, do_hold_primary_context_reference); +} + +inline module_t create(device::primary_context_t primary_context, const void* module_data, link::options_t link_options) +{ +#ifndef NDEBUG + if (module_data == nullptr) { + throw std::invalid_argument("Attempt to create a module with a null pointer for its data"); + } +#endif + constexpr const bool do_hold_primary_context_reference {true }; + const context_t& context = primary_context; + return detail_::create(context, module_data, link_options, do_hold_primary_context_reference); +} + +namespace detail_ { + +inline ::std::string identify(const module_t& module) +{ + return identify(module.handle(), module.context_handle(), module.device().id()); +} + +} // namespace detail_ + +} // namespace module + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_MODULE_HPP_ diff --git a/src/cuda/api/multi_wrapper_impls.hpp b/src/cuda/api/multi_wrapper_impls.hpp index a461dc8f..b8b86faf 100644 --- a/src/cuda/api/multi_wrapper_impls.hpp +++ b/src/cuda/api/multi_wrapper_impls.hpp @@ -2,7 +2,7 @@ * @file multi_wrapper_impls.hpp * * @brief Implementations of methods or functions requiring the definitions of - * multiple CUDA entity proxy classes. In some cases these are declared in the + * multiple CUDA entity proxy classes. In most cases these are declared in the * individual proxy class files, with the other classes forward-declared. 
*/ #pragma once @@ -13,12 +13,21 @@ #include #include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include + #include +#include #include #include @@ -26,51 +35,46 @@ namespace cuda { -template -device_t array_t::device() const noexcept -{ - return device::get(device_id_); -} +namespace detail_ { -template -texture_view::texture_view( - const cuda::array_t& arr, - texture::descriptor_t descriptor) - : device_id_(arr.device().id()), owning(true) -{ - cudaResourceDesc resource_descriptor; - memset(&resource_descriptor, 0, sizeof(resource_descriptor)); - resource_descriptor.resType = cudaResourceTypeArray; - resource_descriptor.res.array.array = arr.get(); +template +using void_t = void; - auto status = cudaCreateTextureObject(&raw_handle_, &resource_descriptor, &descriptor, nullptr); - throw_if_error(status, "failed creating a CUDA texture object"); -} +template class, typename = void> +struct is_detected : ::std::false_type {}; -inline device_t texture_view::associated_device() const noexcept -{ - return cuda::device::get(device_id_); -} +template class Op> +struct is_detected>> : ::std::true_type {}; -namespace array { +template< class, class = void > +struct has_data : ::std::false_type { }; -namespace detail_ { +template< class T> +struct has_data().data())>> +: std::is_same().data()), void*>::type { }; + +} // namespace detail_ + +namespace array { template -handle_t create(const device_t& device, dimensions_t dimensions) +array_t create( + const context_t& context, + dimensions_t dimensions) { - return create(device.id(), dimensions); + handle_t handle = detail_::create(context.handle(), dimensions); + return wrap(context.device_id(), context.handle(), handle, dimensions); } -} // namespace detail_ - template -array_t create( - const device_t& device, - dimensions_t dimensions) +array_t create( + device_t device, + dimensions_t dimensions) { - handle_t handle { detail_::create(device, dimensions) }; - return wrap(device.id(), handle, dimensions); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + auto context_handle = set_context_for_this_scope.primary_context_handle; + handle_t handle = detail_::create(context_handle, dimensions); + return wrap(device.id(), context_handle, handle, dimensions); } } // namespace array @@ -78,27 +82,48 @@ array_t create( namespace event { inline event_t create( - device_t device, + const context_t& context, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) +{ + // Yes, we need the ID explicitly even on the current device, + // because event_t's don't have an implicit device ID. + return event::detail_::create(context.device_id(), context.handle(), uses_blocking_sync, records_timing, interprocess); +} + +inline event_t create( + device_t& device, bool uses_blocking_sync, bool records_timing, bool interprocess) { - auto device_id = device.id(); - // Yes, we need the ID explicitly even on the current device, - // because event_t's don't have an implicit device ID. 
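// A short usage sketch for the event-creation overloads defined here: create an
// event for a device, record it on the device's default stream, and wait for it
// by synchronizing that stream. The include path is an assumption.
#include <cuda/api.hpp>

void event_usage_sketch()
{
	auto device = cuda::device::current::get();
	auto stream = device.default_stream();
	auto event = cuda::event::create(device,
		/* uses_blocking_sync */ true,
		/* records_timing     */ false,
		/* interprocess       */ false);
	// ... enqueue asynchronous work on the stream here ...
	event.record(stream);
	stream.synchronize(); // returns once all work up to and including the event is done
}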
- return event::detail_::create(device_id , uses_blocking_sync, records_timing, interprocess); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return event::detail_::create_in_current_context( + device.id(), + context::current::detail_::get_handle(), + uses_blocking_sync, records_timing, interprocess); } namespace ipc { -inline handle_t export_(event_t& event) +inline handle_t export_(const event_t& event) { return detail_::export_(event.handle()); } -inline event_t import(device_t& device, const handle_t& handle) +inline event_t import(const context_t& context, const handle_t& event_ipc_handle) +{ + bool do_not_take_ownership { false }; + return event::detail_::wrap(context.device_id(), context.handle(), detail_::import(event_ipc_handle), do_not_take_ownership); +} + + +inline event_t import(const device_t& device, const handle_t& event_ipc_handle) { - return event::detail_::wrap(device.id(), detail_::import(handle), do_not_take_ownership); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + auto handle = detail_::import(event_ipc_handle); + return event::detail_::wrap(device.id(), context::current::detail_::get_handle(), handle, do_not_take_ownership); } } // namespace ipc @@ -108,30 +133,182 @@ inline event_t import(device_t& device, const handle_t& handle) // device_t methods -inline stream_t device_t::default_stream() const noexcept +inline device::primary_context_t device_t::primary_context(bool scoped) const +{ + auto pc_handle = primary_context_handle(); + auto decrease_refcount_on_destruct = not scoped; + if (not scoped) { + device::primary_context::detail_::increase_refcount(id_); + // Q: Why increase the refcount here, when `primary_context_handle()` + // ensured this has already happened for this object? + // A: Because an unscoped primary_context_t needs its own refcount + // unit (e.g. in case this object gets destructed but the + // primary_context_t is still alive. 
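// A hedged sketch combining several wrappers defined in this file: allocate
// device-global memory, copy into it asynchronously on the device's default
// stream, then synchronize the device. Sizes and the include are assumptions.
#include <cuda/api.hpp>

void async_copy_sketch()
{
	auto device = cuda::device::current::get();
	auto stream = device.default_stream();
	auto device_buffer = cuda::memory::device::allocate(device, 256 * sizeof(float));
	float host_buffer[256] = { };
	cuda::memory::async::copy(device_buffer.start(), host_buffer, sizeof(host_buffer), stream);
	cuda::synchronize(device); // block until the copy (and any preceding work) completes
	cuda::memory::device::free(device_buffer.start());
}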
+ } + return device::primary_context::detail_::wrap(id_, pc_handle, decrease_refcount_on_destruct); +} + +inline stream_t device_t::default_stream() const { - return stream::detail_::wrap(id(), stream::default_stream_handle); + return stream::detail_::wrap(id(), primary_context_handle(), stream::default_stream_handle); } -inline stream_t -device_t::create_stream( +inline stream_t device_t::create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority) const { - device::current::detail_::scoped_override_t set_device_for_this_scope(id_); - return stream::detail_::wrap(id(), stream::detail_::create_on_current_device( - will_synchronize_with_default_stream, priority), do_take_ownership); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(id_); + return stream::detail_::create(id_, primary_context_handle(), will_synchronize_with_default_stream, priority); +} + +inline bool context_t::is_primary() const +{ + auto pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id_); + device::primary_context::detail_::decrease_refcount(device_id_); + return handle_ == pc_handle; +} + +inline module_t context_t::create_module(const void* module_data, link::options_t link_options) const +{ + return module::create(*this, module_data, link_options); +} + +inline module_t context_t::create_module(const void* module_data) const +{ + return module::create(*this, module_data); +} + +template +module_t context_t::create_module(ContiguousContainer module_data) const +{ + return module::create(*this, module_data); +} + +inline void context_t::enable_access_to(const context_t& peer) const +{ + context::peer_to_peer::enable_access(*this, peer); +} + +inline void context_t::disable_access_to(const context_t& peer) const +{ + context::peer_to_peer::disable_access(*this, peer); +} + +inline device_t context_t::device() const +{ + return device::detail_::wrap(device_id_); +} + +inline stream_t context_t::create_stream( + bool will_synchronize_with_default_stream, + stream::priority_t priority) +{ + return stream::detail_::create(device_id_, handle_, will_synchronize_with_default_stream, priority); } namespace device { + +namespace primary_context { + +inline bool is_active(const device_t& device) +{ + return detail_::is_active(device.id()); +} + +inline void destroy(const device_t& device) +{ + auto status = cuDevicePrimaryCtxReset(device.id()); + throw_if_error(status, "Failed destroying/resetting the primary context of device " + ::std::to_string(device.id())); +} + +inline primary_context_t get(const device_t& device) +{ + auto pc_handle = detail_::get_handle(device.id(), true); + return detail_::wrap( device.id(), pc_handle, true); +} + + +} // namespace primary_context + +namespace peer_to_peer { + +inline bool can_access(device_t accessor, device_t peer) +{ + return detail_::can_access(accessor.id(), peer.id()); +} + +inline void enable_access(device_t accessor, device_t peer) +{ + return context::peer_to_peer::enable_access(accessor.primary_context(), peer.primary_context()); +} + +inline void disable_access(device_t accessor, device_t peer) +{ +#ifndef NDEBUG + if (accessor == peer) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::disable_access(accessor.primary_context(), peer.primary_context()); +} + +inline bool can_access_each_other(device_t first, device_t second) +{ + return can_access(first, second) and can_access(second, first); +} + +inline void 
enable_bidirectional_access(device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::enable_bidirectional_access(first.primary_context(), second.primary_context()); +} + +inline void disable_bidirectional_access(device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::disable_bidirectional_access(first.primary_context(), second.primary_context()); +} + +inline attribute_value_t get_attribute(attribute_t attribute, device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + return detail_::get_attribute(attribute, first.id(), second.id()); +} + +} // namespace peer_to_peer + +inline stream_t primary_context_t::default_stream() const noexcept +{ + return stream::detail_::wrap(device_id_, handle_, stream::default_stream_handle); +} + namespace current { -inline scoped_override_t::scoped_override_t(device_t& device) : parent(device.id()) { } + +inline scoped_override_t::scoped_override_t(const device_t& device) : parent(device.id()) { } inline scoped_override_t::scoped_override_t(device_t&& device) : parent(device.id()) { } } // namespace current } // namespace device +inline void synchronize(const device_t& device) +{ + auto pc = device.primary_context(); + context::current::detail_::scoped_override_t set_context_for_this_scope(pc.handle()); + context::current::detail_::synchronize(device.id(), pc.handle()); +} namespace detail_ { @@ -146,10 +323,17 @@ void device_t::launch( kernel_function, launch_configuration, parameters...); } +inline context_t device_t::create_context( + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy, + bool keep_larger_local_mem_after_resize) const +{ + return context::create(*this, synch_scheduling_policy, keep_larger_local_mem_after_resize); +} + inline event_t device_t::create_event( bool uses_blocking_sync, bool records_timing, - bool interprocess) const + bool interprocess) { // The current implementation of event::create is not super-smart, // but it's probably not worth it trying to improve just this function @@ -158,20 +342,27 @@ inline event_t device_t::create_event( // event_t methods -inline device_t event_t::device() const noexcept +inline device_t event_t::device() const +{ + return cuda::device::get(device_id()); +} + +inline context_t event_t::context() const { - return cuda::device::get(device_id_); + constexpr const bool dont_take_ownership { false }; + return context::detail_::wrap(device_id(), context_handle_, dont_take_ownership); } -inline void event_t::record(const stream_t& stream) + + +inline void event_t::record(const stream_t& stream) const { // Note: - // TODO: Perhaps check the device ID here, rather than - // have the Runtime API call fail? + // TODO: Perhaps check the context match here, rather than have the Runtime API call fail? 
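// A short usage sketch for the device-level peer-to-peer functions defined
// above: check whether two devices can access each other's memory and, if so,
// enable access in both directions. Device indices 0 and 1 are arbitrary.
#include <cuda/api.hpp>

void peer_to_peer_sketch()
{
	if (cuda::device::count() < 2) { return; }
	auto first  = cuda::device::get(0);
	auto second = cuda::device::get(1);
	if (cuda::device::peer_to_peer::can_access_each_other(first, second)) {
		cuda::device::peer_to_peer::enable_bidirectional_access(first, second);
	}
}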
event::detail_::enqueue(stream.handle(), handle_); } -inline void event_t::fire(const stream_t& stream) +inline void event_t::fire(const stream_t& stream) const { record(stream); stream.synchronize(); @@ -181,34 +372,42 @@ inline void event_t::fire(const stream_t& stream) inline device_t stream_t::device() const noexcept { - return cuda::device::get(device_id_); + return cuda::device::detail_::wrap(device_id_); } -inline void stream_t::enqueue_t::wait(const event_t& event) +inline context_t stream_t::context() const noexcept +{ + constexpr const bool dont_take_ownership { false }; + return context::detail_::wrap(device_id_, context_handle_, dont_take_ownership); +} + +inline void stream_t::enqueue_t::wait(const event_t& event_) { auto device_id = associated_stream.device_id_; - device::current::detail_::scoped_override_t set_device_for_this_context(device_id); + device::current::detail_::scoped_context_override_t set_device_for_this_context(device_id); // Required by the CUDA runtime API; the flags value is currently unused constexpr const unsigned int flags = 0; - auto status = cudaStreamWaitEvent(associated_stream.handle_, event.handle(), flags); - throw_if_error(status, - ::std::string("Failed scheduling a wait for " + event::detail_::identify(event.handle()) - + " on stream " + stream::detail_::identify(associated_stream.handle_, associated_stream.device_id_))); + auto status = cuStreamWaitEvent(associated_stream.handle_, event_.handle(), flags); + throw_if_error(status, "Failed scheduling a wait for " + event::detail_::identify(event_.handle()) + + " on " + stream::detail_::identify(associated_stream)); } inline event_t& stream_t::enqueue_t::event(event_t& existing_event) { auto device_id = associated_stream.device_id_; - if (existing_event.device_id() != device_id) { - throw ::std::invalid_argument("Attempt to enqueue a CUDA event associated with " - + device::detail_::identify(existing_event.device_id()) + " to be triggered by a stream on " - + device::detail_::identify(device_id)); + auto context_handle = associated_stream.context_handle_; + auto stream_context_handle_ = associated_stream.context_handle_; + if (existing_event.context_handle() != stream_context_handle_) { + throw ::std::invalid_argument("Attempt to enqueue " + + event::detail_::identify(existing_event) + + ", to be triggered by " + stream::detail_::identify(associated_stream)); } - device::current::detail_::scoped_override_t set_device_for_this_context(device_id); - stream::detail_::record_event_on_current_device(device_id, associated_stream.handle_, existing_event.handle()); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + stream::detail_::record_event_in_current_context(device_id, context_handle, + associated_stream.handle_,existing_event.handle()); return existing_event; } @@ -217,21 +416,58 @@ inline event_t stream_t::enqueue_t::event( bool records_timing, bool interprocess) { - auto device_id = associated_stream.device_id_; - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + auto context_handle = associated_stream.context_handle_; + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); - event_t ev { event::detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; + event_t ev { event::detail_::create_in_current_context( + associated_stream.device_id_, context_handle, + uses_blocking_sync, records_timing, interprocess) }; // Note that, at this point, the event 
is not associated with this enqueue object's stream. - stream::detail_::record_event_on_current_device(device_id, associated_stream.handle_, ev.handle()); + this->event(ev); return ev; } namespace memory { template -inline device_t pointer_t::device() const noexcept +inline device_t pointer_t::device() const +{ + cuda::device::id_t device_id = get_attribute(); + return cuda::device::get(device_id); +} +template +inline pointer_t pointer_t::other_side_of_region_pair() const +{ + pointer::attribute_t attributes[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_HOST_POINTER, + CU_POINTER_ATTRIBUTE_DEVICE_POINTER + }; + type_t memory_type; + T* host_ptr; + T* device_ptr; + void* value_ptrs[] = { &memory_type, &host_ptr, &device_ptr }; + pointer::detail_::get_attributes(3, attributes, value_ptrs, ptr_); + +#ifndef NDEBUG + assert(host_ptr == ptr_ or device_ptr == ptr_); +#endif + return { ptr_ == host_ptr ? device_ptr : host_ptr }; +} + + +template +inline context_t pointer_t::context() const { - return cuda::device::get(attributes().device); + pointer::attribute_t attributes[] = { + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + CU_POINTER_ATTRIBUTE_CONTEXT + }; + cuda::device::id_t device_id; + context::handle_t context_handle; + void* value_ptrs[] = {&device_id, &context_handle}; + pointer::detail_::get_attributes(2, attributes, value_ptrs, ptr_); + return context::detail_::wrap(device_id, context_handle); } namespace async { @@ -241,16 +477,23 @@ inline void copy(void *destination, const void *source, size_t num_bytes, const detail_::copy(destination, source, num_bytes, stream.handle()); } +// Note: Assumes the source pointer is valid in the stream's context template inline void copy(array_t& destination, const T* source, const stream_t& stream) { - detail_::copy(destination, source, stream.handle()); + detail_::copy(destination, source, stream.handle()); } +// Note: Assumes the destination, source and stream are all usable on the same content template inline void copy(T* destination, const array_t& source, const stream_t& stream) { - detail_::copy(destination, source, stream.handle()); + if (stream.context_handle() != source.context_handle()) { + throw std::invalid_argument("Attempt to copy an array in" + + context::detail_::identify(source.context_handle()) + " via " + + stream::detail_::identify(stream)); + } + detail_::copy(destination, source, stream.handle()); } template @@ -263,21 +506,29 @@ inline void copy_single(T& destination, const T& source, const stream_t& stream) namespace device { -inline region_t allocate(cuda::device_t device, size_t size_in_bytes) +inline region_t allocate(const context_t& context, size_t size_in_bytes) +{ + return detail_::allocate(context.handle(), size_in_bytes); +} + + +inline region_t allocate(const device_t& device, size_t size_in_bytes) { - return detail_::allocate(device.id(), size_in_bytes); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + return detail_::allocate_in_current_context(size_in_bytes); } namespace async { inline region_t allocate(const stream_t& stream, size_t size_in_bytes) { - return detail_::allocate(stream.device().id(), stream.handle(), size_in_bytes); + return detail_::allocate(stream.context().handle(), stream.handle(), size_in_bytes); } -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +template +inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream) { - detail_::set(start, byte_value, 
num_bytes, stream.handle()); + detail_::set(start, value, num_elements, stream.handle()); } inline void zero(void* start, size_t num_bytes, const stream_t& stream) @@ -287,20 +538,28 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream) } // namespace async + /** * @brief Create a variant of ::std::unique_pointer for an array in - * the current device's global memory + * device-global memory. + * + * @note CUDA's runtime API always has a current device; but - + * there is not necessarily a current context; so a primary context + * for a device may be created through this call. * * @tparam T an array type; _not_ the type of individual elements * + * @param context The CUDA device context in which to make the + * allocation. * @param num_elements the number of elements to allocate + * * @return an ::std::unique_ptr pointing to the constructed T array - */ -template -inline unique_ptr make_unique(size_t num_elements) +*/ +template +inline unique_ptr make_unique(const context_t& context, size_t num_elements) { - static_assert(::std::is_array::value, "make_unique(device, num_elements) can only be invoked for T being an array type, T = U[]"); - return cuda::memory::detail_::make_unique(num_elements); + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + return memory::detail_::make_unique(context.handle(), num_elements); } /** @@ -312,30 +571,54 @@ inline unique_ptr make_unique(size_t num_elements) * @param device on which to construct the array of elements * @param num_elements the number of elements to allocate * @return an ::std::unique_ptr pointing to the constructed T array - */template + */ +template inline unique_ptr make_unique(device_t device, size_t num_elements) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); - return make_unique(num_elements); + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return memory::detail_::make_unique(num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory on the current device. + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array + */ +template +inline unique_ptr make_unique(size_t num_elements) +{ + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + auto device = cuda::device::current::get(); + return make_unique(device, num_elements); } /** * @brief Create a variant of ::std::unique_pointer for a single value - * in the current device's global memory + * in device-global memory. 
 * * @tparam T the type of value to construct in device memory * + * @param context the context in which to construct the T element * @return an ::std::unique_ptr pointing to the allocated memory */ template -inline unique_ptr make_unique() +inline unique_ptr make_unique(const context_t& context) { - return cuda::memory::detail_::make_unique(); + return cuda::memory::detail_::make_unique(context.handle()); } /** * @brief Create a variant of ::std::unique_pointer for a single value - * in device-global memory + * in device-global memory. * * @tparam T the type of value to construct in device memory * * @@ -345,12 +628,94 @@ inline unique_ptr make_unique() template inline unique_ptr make_unique(device_t device) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); - return make_unique(); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return memory::detail_::make_unique(); +} + +/** + * @brief Create a variant of ::std::unique_pointer for a single value + * in device-global memory, on the current device + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T the type of value to construct in device memory + * + * @return an ::std::unique_ptr pointing to the allocated memory + */ +template +inline unique_ptr make_unique() +{ + auto device = cuda::device::current::get(); + return make_unique(device); } } // namespace device +namespace inter_context { + +inline void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes) +{ + return detail_::copy( + destination_address, destination_context.handle(), + source_address, source_context.handle(), num_bytes); +} + +namespace async { + +inline void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes, + stream_t stream) +{ + return detail_::copy( + destination_address, destination_context.handle(), source_address, + source_context.handle(), num_bytes, stream.handle()); +} + +inline void copy( + region_t destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream) +{ +#ifndef NDEBUG + if (destination.size() < source.size()) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); + } +#endif + copy(destination.start(), destination_context, source, source_context, stream); +} + + +inline void copy( + void * destination, + context_t destination_context, + const_region_t source, + context_t source_context, + const stream_t& stream) +{ + copy(destination, destination_context, source.start(), source_context, source.size(), stream); +} + +} // namespace async + +} // namespace inter_context + + namespace managed { namespace detail_ { @@ -358,49 +723,102 @@ namespace detail_ { template inline device_t base_region_t::preferred_location() const { - auto device_id = detail_::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); + auto device_id = detail_::get_scalar_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION); return cuda::device::get(device_id); } template inline void base_region_t::set_preferred_location(device_t& device) 
const { - detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id()); + detail_::set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, device.id()); } template inline void base_region_t::clear_preferred_location() const { - detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation); + detail_::unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION); } } // namespace detail_ +template +inline unique_ptr make_unique( + const context_t& context, + size_t n, + initial_visibility_t initial_visibility) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + return detail_::make_unique_in_current_context(n, initial_visibility); +} + +template +inline unique_ptr make_unique( + const device_t& device, + size_t n, + initial_visibility_t initial_visibility) +{ + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return detail_::make_unique_in_current_context(n, initial_visibility); +} + +template +inline unique_ptr make_unique( + size_t n, + initial_visibility_t initial_visibility) +{ + auto device = cuda::device::current::get(); + return make_unique(device, n, initial_visibility); +} + +template +inline unique_ptr make_unique( + const context_t& context, + initial_visibility_t initial_visibility) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + return detail_::make_unique_in_current_context(initial_visibility); +} + +template +inline unique_ptr make_unique( + device_t device, + initial_visibility_t initial_visibility) +{ + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return detail_::make_unique_in_current_context(initial_visibility); +} + +template +inline unique_ptr make_unique( + initial_visibility_t initial_visibility) +{ + auto device = cuda::device::current::get(); + return make_unique(device, initial_visibility); +} + inline void advise_expected_access_by(const_region_t region, device_t& device) { - detail_::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id()); + detail_::advise(region, CU_MEM_ADVISE_SET_ACCESSED_BY, device.id()); } inline void advise_no_access_expected_by(const_region_t region, device_t& device) { - detail_::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id()); + detail_::advise(region, CU_MEM_ADVISE_UNSET_ACCESSED_BY, device.id()); } template ::std::vector accessors(const_region_t region, const Allocator& allocator) { - static_assert(sizeof(cuda::device::id_t) == sizeof(device_t), "Unexpected size difference between device IDs and their wrapper class, device_t"); - auto num_devices = cuda::device::count(); ::std::vector devices(num_devices, allocator); auto device_ids = reinterpret_cast(devices.data()); - - auto status = cudaMemRangeGetAttribute( + auto status = cuMemRangeGetAttribute( device_ids, sizeof(device_t) * devices.size(), - cudaMemRangeAttributeAccessedBy, region.start(), region.size()); - throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, device::address(region.start()), region.size()); + throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + + cuda::detail_::ptr_as_hex(region.start())); auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, 
cudaInvalidDeviceId); // We may have gotten less results that the set of all devices, so let's whittle that down @@ -414,41 +832,43 @@ ::std::vector accessors(const_region_t region, const Alloca namespace async { inline void prefetch( + const_region_t region, + const cuda::device_t& destination, + const stream_t& stream) +{ + detail_::prefetch(region, destination.id(), stream.handle()); +} + +inline void prefetch_to_host( const_region_t region, - cuda::device_t destination, const stream_t& stream) { - detail_::prefetch(region, destination.id(), stream.handle()); + detail_::prefetch(region, CU_DEVICE_CPU, stream.handle()); } } // namespace async - inline region_t allocate( - cuda::device_t device, - size_t size_in_bytes, + const context_t& context, + size_t num_bytes, initial_visibility_t initial_visibility) { - return detail_::allocate(device.id(), size_in_bytes, initial_visibility); + return detail_::allocate(context.handle(), num_bytes, initial_visibility); } -template -inline unique_ptr make_unique( - device_t device, - size_t num_elements, - initial_visibility_t initial_visibility) +inline region_t allocate( + device_t device, + size_t num_bytes, + initial_visibility_t initial_visibility) { - cuda::device::current::detail_::scoped_override_t(device.id()); - return make_unique(num_elements, initial_visibility); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + return detail_::allocate_in_current_context(num_bytes, initial_visibility); } -template -inline unique_ptr make_unique( - device_t device, - initial_visibility_t initial_visibility) +inline region_t allocate(size_t num_bytes) { - cuda::device::current::detail_::scoped_override_t(device.id()); - return make_unique(initial_visibility); + auto context_handle = context::current::detail_::get_with_fallback_push(); + return detail_::allocate(context_handle, num_bytes, initial_visibility_t::to_all_devices); } } // namespace managed @@ -460,7 +880,17 @@ inline region_pair allocate( size_t size_in_bytes, allocation_options options) { - return cuda::memory::mapped::detail_::allocate(device.id(), size_in_bytes, options); + auto pc = device.primary_context(); + return cuda::memory::mapped::detail_::allocate(pc.handle(), size_in_bytes, options); +} + + +inline region_pair allocate( + cuda::context_t& context, + size_t size_in_bytes, + allocation_options options) +{ + return cuda::memory::mapped::detail_::allocate(context.handle(), size_in_bytes, options); } } // namespace mapped @@ -469,312 +899,591 @@ inline region_pair allocate( // kernel_t methods -inline device_t kernel_t::device() const noexcept { return device::get(device_id_); } +inline context_t kernel_t::context() const noexcept +{ + constexpr bool dont_take_ownership { false }; + return context::detail_::from_handle(context_handle_, dont_take_ownership); +} -inline void kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) +inline device_t kernel_t::device() const noexcept { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetAttribute(ptr_, attribute, value); - throw_if_error(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value)); + return device::get(device_id_); } -inline void kernel_t::opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel) +inline void kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const 
{ - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); #if CUDART_VERSION >= 9000 - auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributeMaxDynamicSharedMemorySize, amount_required_by_kernel); + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetAttribute(handle_, static_cast(attribute), value); throw_if_error(result, - "Trying to opt-in to " + ::std::to_string(amount_required_by_kernel) + " bytes of dynamic shared memory, " - "exceeding the maximum available on device " + ::std::to_string(device_id_) + " (consider the amount of static shared memory" - "in use by the function)."); + "Setting CUDA device function attribute " + +#ifndef NDEBUG + ::std::string(kernel::detail_::attribute_name(attribute)) + #else + ::std::to_string(static_cast::type>(attribute)) + +#endif + " to value " + ::std::to_string(value) ); throw(cuda::runtime_error {cuda::status::not_yet_implemented}); #endif } -#if defined(__CUDACC__) +/* +namespace kernel { -// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy -// from code compiled with a host-side-only compiler! See cuda_runtime.h for details +namespace occupancy { -namespace detail_ { +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + const kernel_t& kernel, + memory::shared::size_t dynamic_shared_memory_size, + grid::block_dimension_t block_size_limit, + bool disable_caching_override) +{ + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} template -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const void * ptr, - device::id_t device_id, +grid::complete_dimensions_t +apriori_compiled_kernel_t::min_grid_params_for_max_occupancy( UnaryFunction block_size_to_dynamic_shared_mem_size, grid::block_dimension_t block_size_limit, - bool disable_caching_override) + bool disable_caching_override) const { -#if CUDART_VERSION <= 10000 - throw(cuda::runtime_error {cuda::status::not_yet_implemented}); -#else - int min_grid_size_in_blocks { 0 }; - int block_size { 0 }; - // Note: only initializing the values her because of a - // spurious (?) compiler warning about potential uninitialized use. - auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( - &min_grid_size_in_blocks, &block_size, - ptr, - block_size_to_dynamic_shared_mem_size, - static_cast(block_size_limit), - disable_caching_override ? 
cudaOccupancyDisableCachingOverride : cudaOccupancyDefault - ); - throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) + - " on device " + ::std::to_string(device_id) + "."); - return { min_grid_size_in_blocks, block_size }; -#endif // CUDART_VERSION <= 10000 + return detail_::min_grid_params_for_max_occupancy( + ptr_, device_id_, block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override); } -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const void * ptr, - device::id_t device_id, - memory::shared::size_t dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) +} // namespace occupancy + +} // namespace kernel +*/ + +namespace kernel { + +template +apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr) { - auto always_need_same_shared_mem_size = - [dynamic_shared_mem_size](::size_t) { return dynamic_shared_mem_size; }; - return min_grid_params_for_max_occupancy( - ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override); + static_assert( + ::std::is_pointer::value + and ::std::is_function::type>::value, + "function_ptr must be a bona fide pointer to a kernel (__global__) function"); + + auto ptr_ = reinterpret_cast(function_ptr); + auto handle = detail_::get_handle(ptr_); + return detail_::wrap(context.device_id(), context.handle(), handle, ptr_); } -} // namespace detail_ +template +apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr) +{ + return get(device.primary_context(), function_ptr); +} +} // namespace kernel -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const kernel_t& kernel, - memory::shared::size_t dynamic_shared_memory_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) + +namespace stream { + +namespace detail_ { + +inline device::id_t device_id_of(stream::handle_t stream_handle) { - return detail_::min_grid_params_for_max_occupancy( - kernel.ptr(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override); + return context::detail_::get_device_id(context_handle_of(stream_handle)); } +inline void record_event_in_current_context( + device::id_t current_device_id, + context::handle_t current_context_handle_, + stream::handle_t stream_handle, + event::handle_t event_handle) +{ + auto status = cuEventRecord(event_handle, stream_handle); + throw_if_error(status, "Failed scheduling " + event::detail_::identify(event_handle) + + " on " + stream::detail_::identify(stream_handle, current_context_handle_, current_device_id)); +} -inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( - memory::shared::size_t dynamic_shared_memory_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) const +} // namespace detail_ + +inline stream_t create( + const device_t& device, + bool synchronizes_with_default_stream, + priority_t priority) { - return detail_::min_grid_params_for_max_occupancy( - ptr_, device_id_, dynamic_shared_memory_size, block_size_limit, disable_caching_override); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + auto stream_handle = detail_::create_in_current_context(synchronizes_with_default_stream, priority); + return stream::detail_::wrap(device.id(), context::current::detail_::get_handle(), stream_handle); } -template 
-grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( - UnaryFunction block_size_to_dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) const +inline stream_t create( + const context_t& context, + bool synchronizes_with_default_stream, + priority_t priority) { - return detail_::min_grid_params_for_max_occupancy( - ptr_, device_id_, block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override); + return detail_::create(context.device_id(), context.handle(), synchronizes_with_default_stream, priority); } -#endif // defined __CUDACC__ +} // namespace stream + +namespace detail_ { -inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_percentage) +template +void enqueue_launch_helper::operator()( + apriori_compiled_kernel_t wrapped_kernel, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters) +{ + using raw_kernel_t = typename kernel::detail_::raw_kernel_typegen::type; + auto unwrapped_kernel_function = reinterpret_cast(const_cast(wrapped_kernel.ptr())); + // Notes: + // 1. The inner cast here is because we store the pointer as const void* - as an extra + // precaution against anybody trying to write through it. Now, function pointers + // can't get written through, but are still for some reason not considered const. + // 2. We rely on the caller providing us with more-or-less the correct parameters - + // corresponding to the compiled kernel function's. I say "more or less" because the + // `KernelParameter` pack may contain some references, arrays and so on - which CUDA + // kernels cannot accept; so we massage those a bit. + + detail_::enqueue_raw_kernel_launch( + unwrapped_kernel_function, + stream.handle(), + launch_configuration, + ::std::forward(parameters)...); +} + +template +std::array +marshal_dynamic_kernel_arguments(KernelParameters&&... parameters) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - if (shared_mem_percentage > 100) { - throw ::std::invalid_argument("Percentage value can't exceed 100"); + return ::std::array { ¶meters... }; +} + +template +struct enqueue_launch_helper { + + void operator()( + const kernel_t& wrapped_kernel, + const stream_t & stream, + launch_configuration_t lc, + KernelParameters &&... parameters) + { + auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward(parameters)...) }; + auto function_handle = wrapped_kernel.handle(); + status_t status; + if (lc.block_cooperation) + status = cuLaunchCooperativeKernel( + function_handle, + lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z, + lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z, + lc.dynamic_shared_memory_size, + stream.handle(), + marshalled_arguments.data() + ); + else { + constexpr const auto no_arguments_in_alternative_format = nullptr; + // TODO: Consider passing arguments in the alternative format + status = cuLaunchKernel( + function_handle, + lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z, + lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z, + lc.dynamic_shared_memory_size, + stream.handle(), + marshalled_arguments.data(), + no_arguments_in_alternative_format + ); + } + throw_if_error(status, + (lc.block_cooperation ? 
"Cooperative " : "") + + ::std::string(" kernel launch failed for ") + kernel::detail_::identify(function_handle) + + " on " + stream::detail_::identify(stream)); } -#if CUDART_VERSION >= 9000 - auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributePreferredSharedMemoryCarveout, shared_mem_percentage); - throw_if_error(result, "Trying to set the carve-out of shared memory/L1 cache memory"); -#else - throw(cuda::runtime_error {cuda::status::not_yet_implemented}); -#endif // CUDART_VERSION <= 9000 + +}; + +template +void enqueue_launch( + ::std::integral_constant, // Got a raw kernel function + RawKernelFunction kernel_function, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) +{ + detail_::enqueue_raw_kernel_launch( + ::std::forward(kernel_function), stream.handle(), launch_configuration, + ::std::forward(parameters)...); } -inline kernel::attributes_t kernel_t::attributes() const +template +void enqueue_launch( + ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + Kernel kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - kernel::attributes_t function_attributes; - auto status = cudaFuncGetAttributes(&function_attributes, ptr_); - throw_if_error(status, "Failed obtaining attributes for a CUDA device function"); - return function_attributes; + enqueue_launch_helper{}( + ::std::forward(kernel), stream, launch_configuration, + ::std::forward(parameters)...); } -inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference) +} // namespace detail_ + +#if CUDA_VERSION >= 10020 +namespace memory { +namespace virtual_ { +namespace physical_allocation { + +inline device_t properties_t::device() const { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference); - throw_if_error(result, - "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " - "CUDA device function"); + return cuda::device::detail_::wrap(raw.location.id); } +template +properties_t create_properties_for(cuda::device_t device) +{ + return detail_::create_properties(device.id()); +} -inline void kernel_t::set_shared_memory_bank_size( - multiprocessor_shared_memory_bank_size_option_t config) +template +inline physical_allocation_t create(size_t size, device_t device) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config); - throw_if_error(result); + auto properties = create_properties_for(device); + return create(size, properties); } -inline grid::dimension_t kernel_t::maximum_active_blocks_per_multiprocessor( - grid::block_dimension_t num_threads_per_block, - memory::shared::size_t dynamic_shared_memory_per_block, - bool disable_caching_override) +} // namespace physical_allocation + +inline void set_access_mode( + region_t fully_mapped_region, + device_t device, + access_mode_t access_mode) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - int result; - unsigned int flags = disable_caching_override ? 
- cudaOccupancyDisableCachingOverride : cudaOccupancyDefault; - auto status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - &result, ptr_, (int) num_threads_per_block, - dynamic_shared_memory_per_block, flags); - throw_if_error(status, "Failed calculating the maximum occupancy " - "of device function blocks per multiprocessor"); - return result; + CUmemAccessDesc desc { { CU_MEM_LOCATION_TYPE_DEVICE, device.id() }, CUmemAccess_flags(access_mode) }; + constexpr const size_t count { 1 }; + auto result = cuMemSetAccess(fully_mapped_region.device_address(), fully_mapped_region.size(), &desc, count); + throw_if_error(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + + ::std::to_string(fully_mapped_region.size()) + " bytes at " + cuda::detail_::ptr_as_hex(fully_mapped_region.data())); } +inline void set_access_mode(mapping_t mapping, device_t device, access_mode_t access_mode) +{ + set_access_mode(mapping.address_range(), device, access_mode); +} -namespace kernel { +template