diff --git a/src/cuda/api/apriori_compiled_kernel.hpp b/src/cuda/api/apriori_compiled_kernel.hpp index a3adaae9..9d66f813 100644 --- a/src/cuda/api/apriori_compiled_kernel.hpp +++ b/src/cuda/api/apriori_compiled_kernel.hpp @@ -36,7 +36,7 @@ inline handle_t get_handle(const void *kernel_function_ptr, const char* name = n handle_t handle; auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr); - throw_if_error(status, "Failed obtaining a CUDA function handle for " + throw_if_error_lazy(status, "Failed obtaining a CUDA function handle for " + ((name == nullptr) ? ::std::string("a kernel function") : ::std::string("kernel function ") + name) + " at " + cuda::detail_::ptr_as_hex(kernel_function_ptr)); return handle; diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 860e7494..95833029 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -70,7 +70,7 @@ handle_t create_in_current_context(dimensions_t<3> dimensions) descriptor.Flags = 0; auto status = cuArray3DCreate(&handle, &descriptor); - throw_if_error(status, "failed allocating 3D CUDA array"); + throw_if_error_lazy(status, "failed allocating 3D CUDA array"); return handle; } @@ -84,7 +84,7 @@ handle_t create_in_current_context(dimensions_t<2> dimensions) descriptor.NumChannels = 1; handle_t handle; auto status = cuArrayCreate(&handle, &descriptor); - throw_if_error(status, "failed allocating 2D CUDA array"); + throw_if_error_lazy(status, "failed allocating 2D CUDA array"); return handle; } @@ -106,7 +106,7 @@ handle_t get_descriptor(context::handle_t context_handle, handle_t handle) auto status = (NumDimensions == 2) ? cuArrayGetDescriptor(&result, handle) : cuArray3DGetDescriptor(&result, handle); - throw_if_error(status, + throw_if_error_lazy(status, ::std::string("Failed obtaining the descriptor of the CUDA ") + (NumDimensions == 2 ? 
"2":"3") + "D array at " + cuda::detail_::ptr_as_hex(handle)); return result; @@ -170,7 +170,7 @@ class array_t { auto status = cuArrayDestroy(handle_); // Note: Throwing in a noexcept destructor; if the free'ing fails, the program // will likely terminate - throw_if_error(status, "Failed destroying CUDA array " + cuda::detail_::ptr_as_hex(handle_)); + throw_if_error_lazy(status, "Failed destroying CUDA array " + cuda::detail_::ptr_as_hex(handle_)); } } diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 7ed00701..78849e05 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -83,7 +83,7 @@ inline limit_value_t get_limit(limit_t limit_id) { limit_value_t limit_value; auto status = cuCtxGetLimit(&limit_value, limit_id); - throw_if_error(status, + throw_if_error_lazy(status, "Failed obtaining CUDA context limit value"); return limit_value; } @@ -91,7 +91,7 @@ inline limit_value_t get_limit(limit_t limit_id) inline void set_limit(limit_t limit_id, limit_value_t new_value) { auto status = cuCtxSetLimit(limit_id, new_value); - throw_if_error(status, "Failed obtaining CUDA context limit value"); + throw_if_error_lazy(status, "Failed obtaining CUDA context limit value"); } constexpr flags_t inline make_flags( @@ -122,7 +122,7 @@ inline size_t total_memory(handle_t handle) { size_t total_mem_in_bytes; auto status = cuMemGetInfo(nullptr, &total_mem_in_bytes); - throw_if_error(status, "Failed determining amount of total memory for " + identify(handle)); + throw_if_error_lazy(status, "Failed determining amount of total memory for " + identify(handle)); return total_mem_in_bytes; } @@ -131,14 +131,14 @@ inline size_t free_memory(handle_t handle) { size_t free_mem_in_bytes; auto status = cuMemGetInfo(&free_mem_in_bytes, nullptr); - throw_if_error(status, "Failed determining amount of free memory for " + identify(handle)); + throw_if_error_lazy(status, "Failed determining amount of free memory for " + identify(handle)); return free_mem_in_bytes; } inline void set_cache_preference(handle_t handle, multiprocessor_cache_preference_t preference) { auto status = cuCtxSetCacheConfig(static_cast(preference)); - throw_if_error(status, + throw_if_error_lazy(status, "Setting the multiprocessor L1/Shared Memory cache distribution preference to " + ::std::to_string((unsigned) preference) + " for " + identify(handle)); } @@ -147,7 +147,7 @@ inline multiprocessor_cache_preference_t cache_preference(handle_t handle) { CUfunc_cache preference; auto status = cuCtxGetCacheConfig(&preference); - throw_if_error(status, + throw_if_error_lazy(status, "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + identify(handle)); return (multiprocessor_cache_preference_t) preference; } @@ -156,14 +156,14 @@ inline shared_memory_bank_size_t shared_memory_bank_size(handle_t handle) { CUsharedconfig bank_size; auto status = cuCtxGetSharedMemConfig(&bank_size); - throw_if_error(status, "Obtaining the multiprocessor shared memory bank size for " + identify(handle)); + throw_if_error_lazy(status, "Obtaining the multiprocessor shared memory bank size for " + identify(handle)); return static_cast(bank_size); } inline void set_shared_memory_bank_size(handle_t handle, shared_memory_bank_size_t bank_size) { auto status = cuCtxSetSharedMemConfig(static_cast(bank_size)); - throw_if_error(status, "Setting the multiprocessor shared memory bank size for " + identify(handle)); + throw_if_error_lazy(status, "Setting the multiprocessor shared memory bank size for " + 
identify(handle)); } inline void synchronize(context::handle_t handle) @@ -181,13 +181,13 @@ inline void synchronize(device::id_t device_id, context::handle_t handle) inline void destroy(handle_t handle) { auto status = cuCtxDestroy(handle); - throw_if_error(status, "Failed destroying " + identify(handle)); + throw_if_error_lazy(status, "Failed destroying " + identify(handle)); } inline void destroy(handle_t handle, device::id_t device_index) { auto status = cuCtxDestroy(handle); - throw_if_error(status, "Failed destroying " + identify(handle, device_index)); + throw_if_error_lazy(status, "Failed destroying " + identify(handle, device_index)); } inline context::flags_t get_flags(handle_t handle) @@ -461,7 +461,7 @@ class context_t { scoped_setter_type set_context_for_this_scope(handle_); context::stream_priority_range_t result; auto status = cuCtxGetStreamPriorityRange(&result.least, &result.greatest); - throw_if_error(status, "Obtaining the priority range for streams within " + + throw_if_error_lazy(status, "Obtaining the priority range for streams within " + context::detail_::identify(*this)); return result; } @@ -476,7 +476,7 @@ class context_t { { unsigned int raw_version; auto status = cuCtxGetApiVersion(handle_, &raw_version); - throw_if_error(status, "Failed obtaining the API version for " + context::detail_::identify(*this)); + throw_if_error_lazy(status, "Failed obtaining the API version for " + context::detail_::identify(*this)); return version_t::from_single_number((int) raw_version); } @@ -540,7 +540,7 @@ class context_t { scoped_setter_type set_context_for_this_scope(handle_); #if (CUDA_VERSION >= 11000) auto status = cuCtxResetPersistingL2Cache(); - throw_if_error(status, "Failed resetting/clearing the persisting L2 cache memory"); + throw_if_error_lazy(status, "Failed resetting/clearing the persisting L2 cache memory"); #endif throw cuda::runtime_error( cuda::status::insufficient_driver, @@ -709,7 +709,7 @@ inline handle_t create_and_push( keep_larger_local_mem_after_resize); handle_t handle; auto status = cuCtxCreate(&handle, flags, device_id); - throw_if_error(status, "failed creating a CUDA context associated with " + throw_if_error_lazy(status, "failed creating a CUDA context associated with " + device::detail_::identify(device_id)); return handle; } diff --git a/src/cuda/api/current_context.hpp b/src/cuda/api/current_context.hpp index 6f074812..79c8337f 100644 --- a/src/cuda/api/current_context.hpp +++ b/src/cuda/api/current_context.hpp @@ -33,7 +33,7 @@ inline bool exists() if (status == cuda::status::not_yet_initialized) { return false; } - throw_if_error(status, "Failed obtaining the current context's handle"); + throw_if_error_lazy(status, "Failed obtaining the current context's handle"); return (handle != context::detail_::none); } @@ -93,7 +93,7 @@ inline status_and_handle_pair get_with_status() inline handle_t get_handle() { auto p = get_with_status(); - throw_if_error(p.status, "Failed obtaining the current context's handle"); + throw_if_error_lazy(p.status, "Failed obtaining the current context's handle"); return p.handle; } @@ -102,7 +102,7 @@ inline context::flags_t get_flags() { context::flags_t result; auto status = cuCtxGetFlags(&result); - throw_if_error(status, "Failed obtaining the current context's flags"); + throw_if_error_lazy(status, "Failed obtaining the current context's flags"); return result; } @@ -110,7 +110,7 @@ inline device::id_t get_device_id() { device::id_t device_id; auto result = cuCtxGetDevice(&device_id); - throw_if_error(result, 
"Failed obtaining the current context's device"); + throw_if_error_lazy(result, "Failed obtaining the current context's device"); return device_id; } @@ -125,7 +125,7 @@ inline device::id_t get_device_id() inline void push(handle_t context_handle) { auto status = cuCtxPushCurrent(context_handle); - throw_if_error(status, "Failed pushing to the top of the context stack: " + throw_if_error_lazy(status, "Failed pushing to the top of the context stack: " + context::detail_::identify(context_handle)); } @@ -152,7 +152,7 @@ inline context::handle_t pop() { handle_t popped_context_handle; auto status = cuCtxPopCurrent(&popped_context_handle); - throw_if_error(status, "Failed popping the current CUDA context"); + throw_if_error_lazy(status, "Failed popping the current CUDA context"); return popped_context_handle; } @@ -162,7 +162,7 @@ inline void set(handle_t context_handle) // if (detail_::get_handle() == context_handle_) { return; } // ... but decided against it. auto status = cuCtxSetCurrent(context_handle); - throw_if_error(status, + throw_if_error_lazy(status, "Failed setting the current context to " + context::detail_::identify(context_handle)); } diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp index a96efcb4..dbb4e6e4 100644 --- a/src/cuda/api/current_device.hpp +++ b/src/cuda/api/current_device.hpp @@ -52,7 +52,7 @@ inline id_t get_id() // Should we activate and push the default device's context? probably not. return default_device_id; } - throw_if_error(status, + throw_if_error_lazy(status, "Failed obtaining the current context for determining which device is active"); if (current_context_handle == context::detail_::none) { @@ -64,7 +64,7 @@ inline id_t get_id() // // handle_t device_id; // auto status = cudaGetDevice(&device_id); -// throw_if_error(status, "Failure obtaining current device id"); +// throw_if_error_lazy(status, "Failure obtaining current device id"); // return device_id; } @@ -154,7 +154,7 @@ inline void set(const id_t *device_ids, size_t num_devices) throw cuda::runtime_error(status::invalid_device, "More devices listed than exist on the system"); } auto result = cudaSetValidDevices(const_cast(device_ids), (int) num_devices); - throw_if_error(result, + throw_if_error_lazy(result, "Failure setting the current device to any of the list of " + ::std::to_string(num_devices) + " devices specified"); } diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index d6454e98..a81f94b4 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -93,7 +93,7 @@ inline ::std::string get_name(id_t id) auto buffer_size = (size_type) (sizeof(stack_buffer) / sizeof(char)); auto try_getting_name = [&](char* buffer, size_type buffer_size) -> size_type { auto status = cuDeviceGetName(buffer, buffer_size-1, id); - throw_if_error(status, "Failed obtaining the CUDA device name"); + throw_if_error_lazy(status, "Failed obtaining the CUDA device name"); buffer[buffer_size-1] = '\0'; return (size_type) ::std::strlen(buffer); }; @@ -163,7 +163,7 @@ class device_t { context_setter_type set_for_this_scope(primary_context_handle()); int result; auto status = cuDeviceCanAccessPeer(&result, id(), peer.id()); - throw_if_error(status, "Failed determining whether " + throw_if_error_lazy(status, "Failed determining whether " + device::detail_::identify(id_) + " can access " + device::detail_::identify(peer.id_)); return (result == 1); @@ -194,7 +194,7 @@ class device_t { uuid_t uuid () const { uuid_t result; auto status = cuDeviceGetUuid(&result, id_); - 
throw_if_error(status, "Failed obtaining UUID for " + device::detail_::identify(id_)); + throw_if_error_lazy(status, "Failed obtaining UUID for " + device::detail_::identify(id_)); return result; } #endif // CUDA_VERSION >= 9020 @@ -228,7 +228,7 @@ class device_t { void set_flags(flags_type new_flags) const { auto status = cuDevicePrimaryCtxSetFlags(id(), new_flags); - throw_if_error(status, "Failed setting (primary context) flags for device " + device::detail_::identify(id_)); + throw_if_error_lazy(status, "Failed setting (primary context) flags for device " + device::detail_::identify(id_)); } public: @@ -242,14 +242,14 @@ class device_t { { properties_t properties; auto status = cudaGetDeviceProperties(&properties, id()); - throw_if_error(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); + throw_if_error_lazy(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); return properties; } static device_t choose_best_match(const properties_t& properties) { device::id_t id; auto status = cudaChooseDevice(&id, &properties); - throw_if_error(status, "Failed choosing a best matching device by a a property set."); + throw_if_error_lazy(status, "Failed choosing a best matching device by a a property set."); return device::wrap(id); } @@ -274,7 +274,7 @@ class device_t { { attribute_value_t attribute_value; auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_); - throw_if_error(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); + throw_if_error_lazy(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); return attribute_value; } @@ -428,7 +428,7 @@ class device_t { primary_context_handle_; context_setter_type set_context_for_this_scope{pc_handle}; auto status = cudaDeviceReset(); - throw_if_error(status, "Resetting " + device::detail_::identify(id_)); + throw_if_error_lazy(status, "Resetting " + device::detail_::identify(id_)); } /** diff --git a/src/cuda/api/error.hpp b/src/cuda/api/error.hpp index 809db553..4a2deee7 100644 --- a/src/cuda/api/error.hpp +++ b/src/cuda/api/error.hpp @@ -299,6 +299,14 @@ class runtime_error : public ::std::runtime_error { status_t code_; }; +#define throw_if_error_lazy(status__, ... 
) \ +do { \ + status_t tie_status__ = static_cast<status_t>(status__); \ + if (is_failure(tie_status__)) { \ + throw runtime_error(tie_status__, (__VA_ARGS__)); \ + } \ +} while(false) + // TODO: The following could use ::std::optional arguments - which would // prevent the need for dual versions of the functions - but we're // not writing C++17 here diff --git a/src/cuda/api/event.hpp b/src/cuda/api/event.hpp index 47c10614..fa9164e2 100644 --- a/src/cuda/api/event.hpp +++ b/src/cuda/api/event.hpp @@ -38,7 +38,7 @@ inline void destroy( inline void enqueue_in_current_context(stream::handle_t stream_handle, handle_t event_handle) { auto status = cuEventRecord(event_handle, stream_handle); - throw_if_error(status, + throw_if_error_lazy(status, "Failed recording " + event::detail_::identify(event_handle) + " on " + stream::detail_::identify(stream_handle)); } @@ -342,7 +342,7 @@ inline duration_t time_elapsed_between(const event_t& start, const event_t& end) { float elapsed_milliseconds; auto status = cuEventElapsedTime(&elapsed_milliseconds, start.handle(), end.handle()); - throw_if_error(status, "determining the time elapsed between events"); + throw_if_error_lazy(status, "determining the time elapsed between events"); return duration_t { elapsed_milliseconds }; } @@ -372,7 +372,7 @@ inline handle_t create_raw_in_current_context(flags_t flags = 0u) { cuda::event::handle_t new_event_handle; auto status = cuEventCreate(&new_event_handle, flags); - throw_if_error(status, "Failed creating a CUDA event"); + throw_if_error_lazy(status, "Failed creating a CUDA event"); return new_event_handle; } @@ -401,7 +401,7 @@ inline void destroy_in_current_context( context::handle_t current_context_handle) { auto status = cuEventDestroy(handle); - throw_if_error(status, "Failed destroying " + + throw_if_error_lazy(status, "Failed destroying " + identify(handle, current_context_handle, current_device_id)); } @@ -478,7 +478,7 @@ inline void wait(event_t event) auto event_handle = event.handle(); context::current::detail_::scoped_override_t context_for_this_scope(context_handle); auto status = cuEventSynchronize(event_handle); - throw_if_error(status, "Failed synchronizing " + event::detail_::identify(event)); + throw_if_error_lazy(status, "Failed synchronizing " + event::detail_::identify(event)); } inline void synchronize(event_t event) diff --git a/src/cuda/api/ipc.hpp b/src/cuda/api/ipc.hpp index 7948aca0..c04460b2 100644 --- a/src/cuda/api/ipc.hpp +++ b/src/cuda/api/ipc.hpp @@ -61,7 +61,7 @@ using handle_t = CUipcMemHandle; inline handle_t export_(void* device_ptr) { handle_t handle; auto status = cuIpcGetMemHandle(&handle, device::address(device_ptr)); - throw_if_error(status, "Failed producing an IPC memory handle for device pointer " + throw_if_error_lazy(status, "Failed producing an IPC memory handle for device pointer " + cuda::detail_::ptr_as_hex(device_ptr)); return handle; } @@ -81,7 +81,7 @@ inline T* import(const handle_t& handle) { CUdeviceptr device_ptr; auto status = cuIpcOpenMemHandle(&device_ptr, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - throw_if_error(status, "Failed obtaining a device pointer from an IPC memory handle"); + throw_if_error_lazy(status, "Failed obtaining a device pointer from an IPC memory handle"); return reinterpret_cast<T*>(device_ptr); } @@ -93,7 +93,7 @@ inline T* import(const handle_t& handle) inline void unmap(void* ipc_mapped_ptr) { auto status = cuIpcCloseMemHandle(device::address(ipc_mapped_ptr)); - throw_if_error(status, "Failed unmapping IPC memory mapped to " +
cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); + throw_if_error_lazy(status, "Failed unmapping IPC memory mapped to " + cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); } /** @@ -162,7 +162,7 @@ inline handle_t export_(event::handle_t event_handle) { handle_t ipc_handle; auto status = cuIpcGetEventHandle(&ipc_handle, event_handle); - throw_if_error(status, "Failed obtaining an IPC event handle for " + + throw_if_error_lazy(status, "Failed obtaining an IPC event handle for " + event::detail_::identify(event_handle)); return ipc_handle; } @@ -171,7 +171,7 @@ inline event::handle_t import(const handle_t& handle) { event::handle_t event_handle; auto status = cuIpcOpenEventHandle(&event_handle, handle); - throw_if_error(status, "Failed obtaining an event handle from an IPC event handle"); + throw_if_error_lazy(status, "Failed obtaining an event handle from an IPC event handle"); return event_handle; } diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp index 4f4b9f6a..0319d6ec 100644 --- a/src/cuda/api/kernel.hpp +++ b/src/cuda/api/kernel.hpp @@ -81,7 +81,7 @@ inline attribute_value_t get_attribute_in_current_context(handle_t handle, attri { kernel::attribute_value_t attribute_value; auto result = cuFuncGetAttribute(&attribute_value, attribute, handle); - throw_if_error(result, + throw_if_error_lazy(result, ::std::string("Failed obtaining attribute ") + attribute_name(attribute) ); return attribute_value; } @@ -301,7 +301,7 @@ class kernel_t { { context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); auto result = cuFuncSetCacheConfig(handle(), (CUfunc_cache) preference); - throw_if_error(result, + throw_if_error_lazy(result, "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " "CUDA device function"); } @@ -317,7 +317,7 @@ class kernel_t { // TODO: Need to set a context, not a device context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) ); - throw_if_error(result, "Failed setting the shared memory bank size"); + throw_if_error_lazy(result, "Failed setting the shared memory bank size"); } protected: // ctors & dtor @@ -396,7 +396,7 @@ inline grid::dimension_t max_active_blocks_per_multiprocessor( auto flags = (unsigned) disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT; status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( &result, handle, (int) block_size_in_threads, dynamic_shared_memory_per_block, flags); - throw_if_error(status, + throw_if_error_lazy(status, "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block"); return result; } @@ -426,7 +426,7 @@ inline grid::composite_dimensions_t min_grid_params_for_max_occupancy( disable_caching_override ?
CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT ); - throw_if_error(result, + throw_if_error_lazy(result, "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id) + " with maximum occupancy given dynamic shared memory and block size data"); return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size }; @@ -448,7 +448,7 @@ inline memory::shared::size_t max_dynamic_shared_memory_per_block( size_t result; auto status = cuOccupancyAvailableDynamicSMemPerBlock( &result, kernel.handle(), (int) blocks_on_multiprocessor, (int) block_size_in_threads); - throw_if_error(status, + throw_if_error_lazy(status, "Determining the available dynamic memory per block, given the number of blocks on a multiprocessor and their size"); return (memory::shared::size_t) result; } diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index 66b5b587..2ab49419 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -214,7 +214,7 @@ void enqueue_raw_kernel_launch( (size_t)launch_configuration.dynamic_shared_memory_size, cudaStream_t(stream_handle)); #endif // CUDA_VERSION >= 11000 - throw_if_error(status, "Cooperative kernel launch failed"); + throw_if_error_lazy(status, "Cooperative kernel launch failed"); #endif // CUDA_VERSION >= 9000 } } diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index 77a65171..83c9f28b 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -97,7 +97,7 @@ class link_t { void* cubin_output_start; size_t cubin_output_size; auto status = cuLinkComplete(handle_, &cubin_output_start, &cubin_output_size); - throw_if_error(status, + throw_if_error_lazy(status, "Failed completing the link with state at address " + cuda::detail_::ptr_as_hex(handle_)); return memory::region_t{cubin_output_start, cubin_output_size}; } @@ -116,7 +116,7 @@ class link_t { const_cast(marshalled_options.options()), const_cast(marshalled_options.values()) ); - throw_if_error(status, + throw_if_error_lazy(status, "Failed adding input " + ::std::string(image.name) + " of type " + ::std::to_string(image.type) + " to a link."); } @@ -131,7 +131,7 @@ class link_t { const_cast(marshalled_options.options()), const_cast(marshalled_options.values()) ); - throw_if_error(status, + throw_if_error_lazy(status, "Failed loading an object of type " + ::std::to_string(file_input.type) + " from file " + file_input.path); } @@ -171,7 +171,7 @@ class link_t { if (owning) { context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); auto status = cuLinkDestroy(handle_); - throw_if_error(status, + throw_if_error_lazy(status, ::std::string("Failed destroying the link ") + detail_::ptr_as_hex(handle_) + " in " + context::detail_::identify(context_handle_, device_id_)); } @@ -212,7 +212,7 @@ inline link_t create(const link::options_t& options = link::options_t{}) const_cast(marshalled_options.values()), &new_link_handle ); - throw_if_error(status, "Failed creating a new link "); + throw_if_error_lazy(status, "Failed creating a new link "); auto do_take_ownership = true; auto context_handle = context::current::detail_::get_handle(); auto device_id = context::current::detail_::get_device_id(); diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 2c3468bd..ac639a5b 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -151,7 +151,7 @@ inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes) // Can this 
even happen? hopefully not status = (status_t) status::unknown; } - throw_if_error(status, "Failed allocating " + ::std::to_string(num_bytes) + + throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) + " bytes of global memory on the current CUDA device"); return {as_pointer(allocated), num_bytes}; } @@ -185,7 +185,7 @@ inline region_t allocate( // Can this even happen? hopefully not status = static_cast(status::unknown); } - throw_if_error(status, + throw_if_error_lazy(status, "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) + " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) ); return {as_pointer(allocated), num_bytes}; @@ -239,7 +239,7 @@ inline void free( void* allocated_region_start) { auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle); - throw_if_error(status, + throw_if_error_lazy(status, "Failed scheduling an asynchronous freeing of the global memory region starting at " + cuda::detail_::ptr_as_hex(allocated_region_start) + " on " + stream::detail_::identify(stream_handle, context_handle) ); @@ -842,7 +842,7 @@ void copy(array_t destination, array_t sourc params.dstPitch = params.srcPitch = dims.width * sizeof(T); auto status = //(source.context() == destination.context()) ? detail_::multidim_copy(source.context_handle(), params); - throw_if_error(status, "Copying from a CUDA array into a regular memory region"); + throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region"); } @@ -901,7 +901,7 @@ inline void copy(void* destination, const void* source, size_t num_bytes, stream // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string - throw_if_error(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); + throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); } /** @@ -1217,7 +1217,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t { // TODO: Double-check that this call doesn't require setting the current device auto result = cuMemsetD8Async(address(start), (unsigned char) byte_value, num_bytes, stream_handle); - throw_if_error(result, "asynchronously memsetting an on-device buffer"); + throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer"); } inline void set(region_t region, int byte_value, stream::handle_t stream_handle) @@ -1251,7 +1251,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han case(2): result = cuMemsetD16Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; case(4): result = cuMemsetD32Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; } - throw_if_error(result, "Setting global device memory bytes"); + throw_if_error_lazy(result, "Setting global device memory bytes"); } } // namespace detail_ @@ -1330,7 +1330,7 @@ inline void copy( destination_context, reinterpret_cast(source_address), source_context, num_bytes); - throw_if_error(status, + throw_if_error_lazy(status, ::std::string("Failed copying data between devices: From address ") + cuda::detail_::ptr_as_hex(source_address) + " in " + context::detail_::identify(source_context) + " to address " @@ -1402,7 +1402,7 @@ inline void copy( // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string - 
throw_if_error(result, "Scheduling an inter-context memory copy from " + throw_if_error_lazy(result, "Scheduling an inter-context memory copy from " + context::detail_::identify(source_context_handle) + " to " + context::detail_::identify(destination_context_handle) + " on " + stream::detail_::identify(stream_handle)); @@ -1559,7 +1559,7 @@ struct deleter { inline void register_(const void *ptr, size_t size, unsigned flags) { auto result = cuMemHostRegister(const_cast(ptr), size, flags); - throw_if_error(result, + throw_if_error_lazy(result, "Could not register and page-lock the region of " + ::std::to_string(size) + " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr)); } @@ -1648,7 +1648,7 @@ inline void register_(const_region_t region) inline void deregister(const void *ptr) { auto result = cuMemHostUnregister(const_cast(ptr)); - throw_if_error(result, + throw_if_error_lazy(result, "Could not unregister the memory segment starting at address *a"); } @@ -1770,7 +1770,7 @@ inline T get_scalar_range_attribute(managed::const_region_t region, range_attrib uint32_t attribute_value { 0 }; auto result = cuMemRangeGetAttribute( &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size()); - throw_if_error(result, + throw_if_error_lazy(result, "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); return static_cast(attribute_value); } @@ -1780,7 +1780,7 @@ inline T get_scalar_range_attribute(managed::const_region_t region, range_attrib inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id) { auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id); - throw_if_error(result, "Setting an attribute for a managed memory range at " + throw_if_error_lazy(result, "Setting an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); } @@ -1847,7 +1847,7 @@ inline region_t allocate_in_current_context( // Can this even happen? hopefully not status = (status_t) status::unknown; } - throw_if_error(status, "Failed allocating " + throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) + " bytes of managed CUDA memory"); return {as_pointer(allocated), num_bytes}; } @@ -1862,7 +1862,7 @@ inline void free(void* ptr) { auto result = cuMemFree(device::address(ptr)); cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id); - throw_if_error(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr)); + throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { @@ -1949,7 +1949,7 @@ region_t allocate(size_t num_bytes); inline void free(void* managed_ptr) { auto result = cuMemFree(device::address(managed_ptr)); - throw_if_error(result, + throw_if_error_lazy(result, "Freeing managed memory (host and device regions) at address " + cuda::detail_::ptr_as_hex(managed_ptr)); } @@ -1973,7 +1973,7 @@ namespace detail_ { inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id) { auto result = cuMemAdvise(device::address(region.start()), region.size(), (managed::detail_::advice_t) advice, device_id); - throw_if_error(result, "Setting advice on a (managed) memory region at" + throw_if_error_lazy(result, "Setting advice on a (managed) memory region at" + cuda::detail_::ptr_as_hex(region.start()) + " w.r.t. 
" + cuda::device::detail_::identify(device_id)); } @@ -1993,7 +1993,7 @@ inline void prefetch( stream::handle_t source_stream_handle) { auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle); - throw_if_error(result, + throw_if_error_lazy(result, "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address " + cuda::detail_::ptr_as_hex(region.start()) + " to " + ( (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) ); @@ -2038,7 +2038,7 @@ inline T* device_side_pointer_for(T* host_memory_ptr) &device_side_ptr, host_memory_ptr, get_device_pointer_flags); - throw_if_error(status, + throw_if_error_lazy(status, "Failed obtaining the device-side pointer for host-memory pointer " + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); return as_pointer(device_side_ptr); @@ -2070,7 +2070,7 @@ inline region_pair allocate_in_current_context( // Can this even happen? hopefully not status = (status_t) status::named_t::unknown; } - throw_if_error(status, + throw_if_error_lazy(status, "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes) + " bytes of global memory in " + context::detail_::identify(current_context_handle)); allocated.device_side = device_side_pointer_for(allocated.host_side); @@ -2089,7 +2089,7 @@ inline region_pair allocate( inline void free(void* host_side_pair) { auto result = cuMemFreeHost(host_side_pair); - throw_if_error(result, "Freeing a mapped memory region pair with host-side address " + throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address " + cuda::detail_::ptr_as_hex(host_side_pair)); } @@ -2146,7 +2146,7 @@ inline void free_region_pair_of(void* ptr) // We could check this... 
void* host_side_ptr; auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr)); - throw_if_error(status, "Failed obtaining the host-side address of supposedly-device-side pointer " + throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer " + cuda::detail_::ptr_as_hex(ptr)); detail_::free(host_side_ptr); } @@ -2186,9 +2186,9 @@ inline memory::region_t locate(T&& symbol) void *start; size_t symbol_size; auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward(symbol)); - throw_if_error(api_call_result, "Could not locate the device memory address for a symbol"); + throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol"); api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward(symbol)); - throw_if_error(api_call_result, "Could not locate the device memory address for the symbol at address" + throw_if_error_lazy(api_call_result, "Could not locate the device memory address for the symbol at address" + cuda::detail_::ptr_as_hex(start)); return { start, symbol_size }; } diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 897f0642..cf7fd530 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,7 +30,7 @@ inline void initialize_driver() { static constexpr const unsigned dummy_flags { 0 }; // this is the only allowed value for flags auto status = cuInit(dummy_flags); - throw_if_error(status, "Failed initializing the CUDA driver"); + throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } inline void ensure_driver_is_initialized() diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index 7348e05a..263d1792 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -132,7 +132,7 @@ class module_t { CUdeviceptr dptr; size_t size; auto result = cuModuleGetGlobal(&dptr, &size, handle_, name); - throw_if_error(result, "Obtaining the address and size of a named global object"); + throw_if_error_lazy(result, "Obtaining the address and size of a named global object"); return { memory::as_pointer(dptr), size }; } @@ -240,7 +240,7 @@ inline module_t load_from_file_in_current_context( { handle_t new_module_handle; auto status = cuModuleLoad(&new_module_handle, path); - throw_if_error(status, ::std::string("Failed loading a module from file ") + path); + throw_if_error_lazy(status, ::std::string("Failed loading a module from file ") + path); bool do_take_ownership{true}; return wrap( current_context_device_id, @@ -366,7 +366,7 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i { context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); auto status = cuModuleUnload(handle); - throw_if_error(status, "Failed unloading " + identify(handle, context_handle, device_id)); + throw_if_error_lazy(status, "Failed unloading " + identify(handle, context_handle, device_id)); } } // namespace detail_ diff --git a/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp b/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp index 4e978ebb..3e8bccb3 100644 --- a/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp +++ b/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp @@ -41,7 +41,7 @@ inline kernel::attributes_t apriori_compiled_kernel_t::attributes() const context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); kernel::attributes_t function_attributes; 
auto status = cudaFuncGetAttributes(&function_attributes, ptr_); - throw_if_error(status, "Failed obtaining attributes for a CUDA device function"); + throw_if_error_lazy(status, "Failed obtaining attributes for a CUDA device function"); return function_attributes; } @@ -50,7 +50,7 @@ inline void apriori_compiled_kernel_t::set_cache_preference(multiprocessor_cache // Note: assuming the primary context is active context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference); - throw_if_error(result, + throw_if_error_lazy(result, "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " "CUDA device function"); } @@ -61,7 +61,7 @@ inline void apriori_compiled_kernel_t::set_shared_memory_bank_size( // Note: assuming the primary context is active context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config); - throw_if_error(result); + throw_if_error_lazy(result, "Failed setting shared memory bank size to " + std::to_string(config)); } inline void apriori_compiled_kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const @@ -81,7 +81,7 @@ inline void apriori_compiled_kernel_t::set_attribute(kernel::attribute_t attribu } }(); auto result = cudaFuncSetAttribute(ptr_, runtime_attribute, value); - throw_if_error(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value)); + throw_if_error_lazy(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value)); } kernel::attribute_value_t apriori_compiled_kernel_t::get_attribute(kernel::attribute_t attribute) const diff --git a/src/cuda/api/multi_wrapper_impls/device.hpp b/src/cuda/api/multi_wrapper_impls/device.hpp index 023fea93..5efcf1b2 100644 --- a/src/cuda/api/multi_wrapper_impls/device.hpp +++ b/src/cuda/api/multi_wrapper_impls/device.hpp @@ -36,7 +36,7 @@ inline bool is_active(const device_t& device) inline void destroy(const device_t& device) { auto status = cuDevicePrimaryCtxReset(device.id()); - throw_if_error(status, "Failed destroying/resetting the primary context of device " + ::std::to_string(device.id())); + throw_if_error_lazy(status, "Failed destroying/resetting the primary context of device " + ::std::to_string(device.id())); } inline primary_context_t get(const device_t& device) diff --git a/src/cuda/api/multi_wrapper_impls/kernel.hpp b/src/cuda/api/multi_wrapper_impls/kernel.hpp index c6e364f8..be2fc8e9 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel.hpp @@ -68,7 +68,7 @@ inline void kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attri #if CUDA_VERSION >= 9000 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); auto result = cuFuncSetAttribute(handle_, static_cast(attribute), value); - throw_if_error(result, + throw_if_error_lazy(result, "Setting CUDA device function attribute " + ::std::string(kernel::detail_::attribute_name(attribute)) + " to value " + ::std::to_string(value) ); diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp index 67950d1e..f4ad7585 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp @@ 
-86,7 +86,7 @@ inline void launch_type_erased_in_current_context( no_arguments_in_alternative_format ); } - throw_if_error(status, + throw_if_error_lazy(status, (lc.block_cooperation ? "Cooperative " : "") + ::std::string(" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle) + " on " + stream::detail_::identify(stream_handle, context_handle, device_id)); @@ -212,7 +212,7 @@ inline grid::composite_dimensions_t min_grid_params_for_max_occupancy( static_cast(block_size_limit), disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault ); - throw_if_error(result, + throw_if_error_lazy(result, "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) + " on device " + ::std::to_string(device_id) + "."); return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size }; diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp index 2fa2d1fd..f421dab6 100644 --- a/src/cuda/api/multi_wrapper_impls/memory.hpp +++ b/src/cuda/api/multi_wrapper_impls/memory.hpp @@ -386,7 +386,7 @@ ::std::vector accessors(const_region_t region, const Alloca auto status = cuMemRangeGetAttribute( device_ids, sizeof(device_t) * devices.size(), CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, device::address(region.start()), region.size()); - throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + throw_if_error_lazy(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId); // We may have gotten less results that the set of all devices, so let's whittle that down @@ -487,7 +487,7 @@ inline void* allocate( // Can this even happen? 
hopefully not result = static_cast(status::named_t::unknown); } - throw_if_error(result, "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of host memory"); + throw_if_error_lazy(result, "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of host memory"); return allocated; } @@ -502,7 +502,7 @@ attribute_value_type_t get_attribute(const void *ptr) context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context; attribute_value_type_t attribute_value; auto status = cuPointerGetAttribute(&attribute_value, attribute, device::address(ptr)); - throw_if_error(status, "Obtaining attribute " + ::std::to_string((int) attribute) + throw_if_error_lazy(status, "Obtaining attribute " + ::std::to_string((int) attribute) + " for pointer " + cuda::detail_::ptr_as_hex(ptr) ); return attribute_value; } @@ -512,7 +512,7 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib { context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context; auto status = cuPointerGetAttributes( num_attributes, attributes, value_ptrs, device::address(ptr) ); - throw_if_error(status, "Obtaining multiple attributes for pointer " + cuda::detail_::ptr_as_hex(ptr)); + throw_if_error_lazy(status, "Obtaining multiple attributes for pointer " + cuda::detail_::ptr_as_hex(ptr)); } } // namespace detail_ @@ -524,7 +524,7 @@ inline void copy(void *destination, const void *source, size_t num_bytes) auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes); // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string - throw_if_error(result, "Synchronously copying data"); + throw_if_error_lazy(result, "Synchronously copying data"); } namespace device { @@ -545,7 +545,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements) case(2): result = cuMemsetD16(address(start), reinterpret_cast(value), num_elements); break; case(4): result = cuMemsetD32(address(start), reinterpret_cast(value), num_elements); break; } - throw_if_error(result, "Setting global device memory bytes"); + throw_if_error_lazy(result, "Setting global device memory bytes"); } } // namespace device @@ -599,7 +599,7 @@ inline void set_access_mode( CUmemAccessDesc desc { { CU_MEM_LOCATION_TYPE_DEVICE, device.id() }, CUmemAccess_flags(access_mode) }; static constexpr const size_t count { 1 }; auto result = cuMemSetAccess(fully_mapped_region.device_address(), fully_mapped_region.size(), &desc, count); - throw_if_error(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + throw_if_error_lazy(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + ::std::to_string(fully_mapped_region.size()) + " bytes at " + cuda::detail_::ptr_as_hex(fully_mapped_region.data())); } @@ -620,7 +620,7 @@ inline void set_access_mode( } auto result = cuMemSetAccess( device::address(fully_mapped_region.start()), fully_mapped_region.size(), descriptors, devices.size()); - throw_if_error(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + throw_if_error_lazy(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + ::std::to_string(fully_mapped_region.size()) + " bytes at " + cuda::detail_::ptr_as_hex(fully_mapped_region.data())); } diff --git a/src/cuda/api/multi_wrapper_impls/module.hpp b/src/cuda/api/multi_wrapper_impls/module.hpp index 
0c2e808b..52dd07fe 100644 --- a/src/cuda/api/multi_wrapper_impls/module.hpp +++ b/src/cuda/api/multi_wrapper_impls/module.hpp @@ -40,7 +40,7 @@ inline cuda::kernel_t module_t::get_kernel(const char* name) const context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); kernel::handle_t kernel_function_handle; auto result = cuModuleGetFunction(&kernel_function_handle, handle_, name); - throw_if_error(result, ::std::string("Failed obtaining function ") + name + throw_if_error_lazy(result, ::std::string("Failed obtaining function ") + name + " from " + module::detail_::identify(*this)); return kernel::wrap(context::detail_::get_device_id(context_handle_), context_handle_, kernel_function_handle); } @@ -56,7 +56,7 @@ module_t create(const context_t& context, const void* module_data, Creator creat context::current::scoped_override_t set_context_for_this_scope(context); handle_t new_module_handle; auto status = creator_function(new_module_handle, module_data); - throw_if_error(status, ::std::string("Failed loading a module from memory location ") + throw_if_error_lazy(status, ::std::string("Failed loading a module from memory location ") + cuda::detail_::ptr_as_hex(module_data) + " within " + context::detail_::identify(context)); bool do_take_ownership { true }; @@ -129,7 +129,7 @@ inline CUsurfref module_t::get_surface(const char* name) const context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); CUsurfref raw_surface_reference; auto status = cuModuleGetSurfRef(&raw_surface_reference, handle_, name); - throw_if_error(status, ::std::string("Failed obtaining a reference to surface \"") + name + "\" from " + throw_if_error_lazy(status, ::std::string("Failed obtaining a reference to surface \"") + name + "\" from " + module::detail_::identify(*this)); return raw_surface_reference; } @@ -139,7 +139,7 @@ inline CUtexref module_t::get_texture_reference(const char* name) const context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); CUtexref raw_texture_reference; auto status = cuModuleGetTexRef(&raw_texture_reference, handle_, name); - throw_if_error(status, ::std::string("Failed obtaining a reference to texture \"") + name + "\" from " + throw_if_error_lazy(status, ::std::string("Failed obtaining a reference to texture \"") + name + "\" from " + module::detail_::identify(*this)); return raw_texture_reference; } diff --git a/src/cuda/api/multi_wrapper_impls/stream.hpp b/src/cuda/api/multi_wrapper_impls/stream.hpp index eab628b1..10691889 100644 --- a/src/cuda/api/multi_wrapper_impls/stream.hpp +++ b/src/cuda/api/multi_wrapper_impls/stream.hpp @@ -47,7 +47,7 @@ inline void record_event_in_current_context( event::handle_t event_handle) { auto status = cuEventRecord(event_handle, stream_handle); - throw_if_error(status, + throw_if_error_lazy(status, "Failed scheduling " + event::detail_::identify(event_handle) + " on " + stream::detail_::identify(stream_handle, current_context_handle_, current_device_id)); } @@ -85,7 +85,7 @@ inline void stream_t::enqueue_t::wait(const event_t& event_) const static constexpr const unsigned int flags = 0; auto status = cuStreamWaitEvent(associated_stream.handle_, event_.handle(), flags); - throw_if_error(status, + throw_if_error_lazy(status, "Failed scheduling a wait for " + event::detail_::identify(event_.handle()) + " on " + stream::detail_::identify(associated_stream)); @@ -152,7 +152,7 @@ inline void copy_attributes(const stream_t &dest, const stream_t &src) #endif 
context::current::detail_::scoped_override_t set_context_for_this_scope(dest.context_handle()); auto status = cuStreamCopyAttributes(dest.handle(), src.handle()); - throw_if_error(status, "Copying attributes from " + stream::detail_::identify(src) + throw_if_error_lazy(status, "Copying attributes from " + stream::detail_::identify(src) + " to " + stream::detail_::identify(src)); } diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp index b0a6a008..16b3fe2d 100644 --- a/src/cuda/api/pci_id.hpp +++ b/src/cuda/api/pci_id.hpp @@ -53,7 +53,7 @@ inline id_t resolve_id(pci_location_t pci_id) ::std::string as_string { pci_id }; id_t cuda_device_id; auto result = cuDeviceGetByPCIBusId(&cuda_device_id, as_string.c_str()); - throw_if_error(result, + throw_if_error_lazy(result, "Failed obtaining a CUDA device ID corresponding to PCI id " + as_string); return cuda_device_id; } diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index cfa71a7a..1a3ceb93 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -42,7 +42,7 @@ inline attribute_value_t get_attribute(attribute_t attribute, id_t source, id_t { attribute_value_t value; auto status = cuDeviceGetP2PAttribute(&value, attribute, source, destination); - throw_if_error(status, "Failed obtaining peer-to-peer device attribute for device pair (" + throw_if_error_lazy(status, "Failed obtaining peer-to-peer device attribute for device pair (" + ::std::to_string(source) + ", " + ::std::to_string(destination) + ')'); return value; } @@ -51,7 +51,7 @@ inline bool can_access(const device::id_t accessor, const device::id_t peer) { int result; auto status = cuDeviceCanAccessPeer(&result, accessor, peer); - throw_if_error(status, "Failed determining whether " + device::detail_::identify(accessor) + throw_if_error_lazy(status, "Failed determining whether " + device::detail_::identify(accessor) + " can access " + device::detail_::identify(peer)); return (result == 1); } @@ -85,13 +85,13 @@ inline void enable_access_to(context::handle_t peer_context) enum : unsigned {fixed_flags = 0 }; // No flags are supported as of CUDA 8.0 auto status = cuCtxEnablePeerAccess(peer_context, fixed_flags); - throw_if_error(status, "Failed enabling access to peer " + context::detail_::identify(peer_context)); + throw_if_error_lazy(status, "Failed enabling access to peer " + context::detail_::identify(peer_context)); } inline void disable_access_to(context::handle_t peer_context) { auto status = cuCtxDisablePeerAccess(peer_context); - throw_if_error(status, "Failed disabling access to peer " + context::detail_::identify(peer_context)); + throw_if_error_lazy(status, "Failed disabling access to peer " + context::detail_::identify(peer_context)); } inline void enable_access(context::handle_t accessor, context::handle_t peer) diff --git a/src/cuda/api/primary_context.hpp b/src/cuda/api/primary_context.hpp index 80be8d9a..11ba809a 100644 --- a/src/cuda/api/primary_context.hpp +++ b/src/cuda/api/primary_context.hpp @@ -58,14 +58,14 @@ inline status_t decrease_refcount_nothrow(device::id_t device_id) noexcept inline void decrease_refcount(device::id_t device_id) { auto status = decrease_refcount_nothrow(device_id); - throw_if_error(status, "Failed releasing the reference to the primary context for " + device::detail_::identify(device_id)); + throw_if_error_lazy(status, "Failed releasing the reference to the primary context for " + device::detail_::identify(device_id)); } inline handle_t obtain_and_increase_refcount(device::id_t 
device_id)
 {
 	handle_t primary_context_handle;
 	auto status = cuDevicePrimaryCtxRetain(&primary_context_handle, device_id);
-	throw_if_error(status,
+	throw_if_error_lazy(status,
 		"Failed obtaining (and possibly creating, and adding a reference count to) the primary context for "
 		+ device::detail_::identify(device_id));
 	return primary_context_handle;
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 26b1331a..251b7591 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -118,7 +118,7 @@ inline handle_t create_raw_in_current_context(
 	auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
 	// We could instead have used an equivalent Driver API call:
 	// cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
-	throw_if_error(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
+	throw_if_error_lazy(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
 	return new_stream_handle;
 }
@@ -127,7 +127,7 @@ inline context::handle_t context_handle_of(stream::handle_t stream_handle)
 {
 	context::handle_t handle;
 	auto result = cuStreamGetCtx(stream_handle, &handle);
-	throw_if_error(result, "Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
+	throw_if_error_lazy(result, "Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
 	return handle;
 }
 #endif // CUDA_VERSION >= 9020
@@ -244,7 +244,7 @@ class stream_t {
 		auto status = cuStreamGetFlags(handle_, &flags);
 		// Could have used the equivalent Driver API call,
 		// cuStreamGetFlags(handle_, &flags);
-		throw_if_error(status, "Failed obtaining flags for a stream in "
+		throw_if_error_lazy(status, "Failed obtaining flags for a stream in "
 			+ context::detail_::identify(context_handle_, device_id_));
 		return flags & CU_STREAM_NON_BLOCKING;
 	}
@@ -255,7 +255,7 @@ class stream_t {
 		auto status = cuStreamGetPriority(handle_, &the_priority);
 		// Could have used the equivalent Runtime API call:
 		// cuStreamGetPriority(handle_, &the_priority);
-		throw_if_error(status, "Failed obtaining priority for a stream in "
+		throw_if_error_lazy(status, "Failed obtaining priority for a stream in "
 			+ context::detail_::identify(context_handle_, device_id_));
 		return the_priority;
 	}
@@ -548,7 +548,7 @@ class stream_t {
 		// Could have used the equivalent Driver API call: cuAddStreamCallback()
 #endif
-		throw_if_error(status, "Failed scheduling a callback to be launched on "
+		throw_if_error_lazy(status, "Failed scheduling a callback to be launched on "
 			+ stream::detail_::identify(associated_stream.handle_,
 				associated_stream.context_handle_, associated_stream.device_id_));
 	}
@@ -623,7 +623,7 @@ class stream_t {
 		auto status = cuStreamAttachMemAsync(
 			associated_stream.handle_, memory::device::address(managed_region_start), length, flags);
 		// Could have used the equivalent Driver API call cuStreamAttachMemAsync
-		throw_if_error(status, "Failed scheduling an attachment of a managed memory region on "
+		throw_if_error_lazy(status, "Failed scheduling an attachment of a managed memory region on "
 			+ stream::detail_::identify(associated_stream.handle_,
 				associated_stream.context_handle_, associated_stream.device_id_));
 	}
@@ -678,7 +678,7 @@ class stream_t {
 			CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
 		auto result = static_cast( stream::detail_::write_value(associated_stream.handle_, address, value, flags));
-		throw_if_error(result, "Failed scheduling a write to global memory on "
+		throw_if_error_lazy(result, "Failed scheduling a write to global memory on "
 			+ stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
 				associated_stream.device_id_));
 	}
@@ -708,7 +708,7 @@ class stream_t {
 			(with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
 		auto result = static_cast( stream::detail_::wait_on_value(associated_stream.handle_, address, value, flags));
-		throw_if_error(result,
+		throw_if_error_lazy(result,
 			"Failed scheduling a wait on global memory address on "
 			+ stream::detail_::identify(
 				associated_stream.handle_,
@@ -795,7 +795,7 @@ class stream_t {
 		context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
 		CUstreamAttrValue wrapped_result{};
 		auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
-		throw_if_error(status, ::std::string("Obtaining the synchronization policy of ") + stream::detail_::identify(*this));
+		throw_if_error_lazy(status, ::std::string("Obtaining the synchronization policy of ") + stream::detail_::identify(*this));
 		return static_cast(wrapped_result.syncPolicy);
 	}
@@ -805,7 +805,7 @@ class stream_t {
 		CUstreamAttrValue wrapped_value{};
 		wrapped_value.syncPolicy = static_cast(policy);
 		auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
-		throw_if_error(status, ::std::string("Setting the synchronization policy of ") + stream::detail_::identify(*this));
+		throw_if_error_lazy(status, ::std::string("Setting the synchronization policy of ") + stream::detail_::identify(*this));
 	}
 #endif
@@ -1019,7 +1019,7 @@ using queue_id_t = stream::handle_t;
 inline void synchronize(const stream_t& stream)
 {
 	auto status = cuStreamSynchronize(stream.handle());
-	throw_if_error(status, "Failed synchronizing " + stream::detail_::identify(stream));
+	throw_if_error_lazy(status, "Failed synchronizing " + stream::detail_::identify(stream));
 }
 #if CUDA_VERSION >= 11000
diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp
index 9d89248f..de79d921 100644
--- a/src/cuda/api/texture_view.hpp
+++ b/src/cuda/api/texture_view.hpp
@@ -116,7 +116,7 @@ class texture_view {
 		resource_descriptor.res.array.hArray = arr.get();
 		auto status = cuTexObjectCreate(&raw_view_handle, &resource_descriptor, &descriptor, nullptr);
-		throw_if_error(status, "failed creating a CUDA texture object");
+		throw_if_error_lazy(status, "failed creating a CUDA texture object");
 	}
 public: // operators
@@ -126,7 +126,7 @@ class texture_view {
 		if (owning) {
 			scoped_context_setter set_context(context_handle_);
 			auto status = cuTexObjectDestroy(raw_view_handle);
-			throw_if_error(status, "failed destroying texture object");
+			throw_if_error_lazy(status, "failed destroying texture object");
 		}
 	}
diff --git a/src/cuda/api/versions.hpp b/src/cuda/api/versions.hpp
index ec087e69..998e4ed6 100644
--- a/src/cuda/api/versions.hpp
+++ b/src/cuda/api/versions.hpp
@@ -133,7 +133,7 @@ inline version_t driver() {
 	combined_version_t version;
 	auto status = cuDriverGetVersion(&version);
 	// The same value would be returned using cuDriverGetVersion()
-	throw_if_error(status, "Failed obtaining the CUDA driver version");
+	throw_if_error_lazy(status, "Failed obtaining the CUDA driver version");
 	return version_t::from_single_number(version);
 }
@@ -146,7 +146,7 @@ inline version_t driver() {
 inline version_t runtime() {
 	combined_version_t version;
 	auto status = cudaRuntimeGetVersion(&version);
-	throw_if_error(status, "Failed obtaining the CUDA runtime version");
+	throw_if_error_lazy(status, "Failed obtaining the CUDA runtime version");
 	return version_t::from_single_number(version);
 }
diff --git a/src/cuda/api/virtual_memory.hpp b/src/cuda/api/virtual_memory.hpp
index d577c0d2..6fbfe77c 100644
--- a/src/cuda/api/virtual_memory.hpp
+++ b/src/cuda/api/virtual_memory.hpp
@@ -25,7 +25,7 @@ namespace detail_ {
 inline void cancel_reservation(memory::region_t reserved)
 {
 	auto status = cuMemAddressFree(memory::device::address(reserved.start()), reserved.size());
-	throw_if_error(status, "Failed freeing a reservation of " + memory::detail_::identify(reserved));
+	throw_if_error_lazy(status, "Failed freeing a reservation of " + memory::detail_::identify(reserved));
 }
 } // namespace detail_
@@ -90,7 +90,7 @@ inline reserved_address_range_t reserve(region_t requested_region, alignment_t a
 	unsigned long flags { 0 };
 	CUdeviceptr ptr;
 	auto status = cuMemAddressReserve(&ptr, requested_region.size(), alignment, requested_region.device_address(), flags);
-	throw_if_error(status, "Failed making a reservation of " + cuda::memory::detail_::identify(requested_region)
+	throw_if_error_lazy(status, "Failed making a reservation of " + cuda::memory::detail_::identify(requested_region)
 		+ " with alignment value " + ::std::to_string(alignment));
 	bool is_owning { true };
 	return detail_::wrap(memory::region_t { ptr, requested_region.size() }, alignment, is_owning);
 }
@@ -149,7 +149,7 @@ struct properties_t {
 		size_t result;
 		auto status = cuMemGetAllocationGranularity(&result, &raw, static_cast(granuality_kind));
-		throw_if_error(status, "Could not determine physical allocation granularity");
+		throw_if_error_lazy(status, "Could not determine physical allocation granularity");
 		return result;
 	}
@@ -219,7 +219,7 @@ class physical_allocation_t {
 	~physical_allocation_t() {
 		if (not holds_refcount_unit_) { return; }
 		auto result = cuMemRelease(handle_);
-		throw_if_error(result, "Failed making a virtual memory physical_allocation of size " + ::std::to_string(size_));
+		throw_if_error_lazy(result, "Failed making a virtual memory physical_allocation of size " + ::std::to_string(size_));
 	}
 public: // non-mutators
@@ -232,7 +232,7 @@ class physical_allocation_t {
 	physical_allocation::properties_t properties() const {
 		CUmemAllocationProp raw_properties;
 		auto status = cuMemGetAllocationPropertiesFromHandle(&raw_properties, handle_);
-		throw_if_error(status, "Obtaining the properties of a virtual memory physical_allocation with handle " + ::std::to_string(handle_));
+		throw_if_error_lazy(status, "Obtaining the properties of a virtual memory physical_allocation with handle " + ::std::to_string(handle_));
 		return { raw_properties };
 	}
@@ -242,7 +242,7 @@ class physical_allocation_t {
 		physical_allocation::shared_handle_t shared_handle_;
 		static constexpr const unsigned long long flags { 0 };
 		auto result = cuMemExportToShareableHandle(&shared_handle_, handle_, (CUmemAllocationHandleType) SharedHandleKind, flags);
-		throw_if_error(result, "Exporting a (generic CUDA) shared memory physical_allocation to a shared handle");
+		throw_if_error_lazy(result, "Exporting a (generic CUDA) shared memory physical_allocation to a shared handle");
 		return shared_handle_;
 	}
@@ -259,7 +259,7 @@ inline physical_allocation_t create(size_t size, properties_t properties)
 	static constexpr const unsigned long long flags { 0 };
 	CUmemGenericAllocationHandle handle;
 	auto result = cuMemCreate(&handle, size, &properties.raw, flags);
-	throw_if_error(result, "Failed making a virtual memory physical_allocation of size " + ::std::to_string(size));
+	throw_if_error_lazy(result, "Failed making a virtual memory physical_allocation of size " + ::std::to_string(size));
 	static constexpr const bool is_owning { true };
 	return detail_::wrap(handle, size, is_owning);
 }
@@ -282,7 +282,7 @@ inline properties_t properties_of(handle_t handle)
 {
 	CUmemAllocationProp prop;
 	auto result = cuMemGetAllocationPropertiesFromHandle (&prop, handle);
-	throw_if_error(result, "Failed obtaining the properties of the virtual memory physical_allocation with handle "
+	throw_if_error_lazy(result, "Failed obtaining the properties of the virtual memory physical_allocation with handle "
 		+ ::std::to_string(handle));
 	return { prop };
 }
@@ -306,7 +306,7 @@ physical_allocation_t import(shared_handle_t shared_handle, si
 	handle_t result_handle;
 	auto result = cuMemImportFromShareableHandle(
 		&result_handle, reinterpret_cast(shared_handle), CUmemAllocationHandleType(SharedHandleKind));
-	throw_if_error(result, "Failed importing a virtual memory physical_allocation from a shared handle ");
+	throw_if_error_lazy(result, "Failed importing a virtual memory physical_allocation from a shared handle ");
 	return physical_allocation::detail_::wrap(result_handle, size, holds_refcount_unit);
 }
@@ -346,7 +346,7 @@ inline access_mode_t get_access_mode(region_t fully_mapped_region, cuda::device:
 	CUmemLocation_st location { CU_MEM_LOCATION_TYPE_DEVICE, device_id };
 	unsigned long long flags;
 	auto result = cuMemGetAccess(&flags, &location, fully_mapped_region.device_address() );
-	throw_if_error(result, "Failed determining the access mode for "
+	throw_if_error_lazy(result, "Failed determining the access mode for "
 		+ cuda::device::detail_::identify(device_id)
 		+ " to the virtual memory mapping to the range of size " + ::std::to_string(fully_mapped_region.size())
 		+ " bytes at " + cuda::detail_::ptr_as_hex(fully_mapped_region.data()));
@@ -460,7 +460,7 @@ class mapping_t {
 	{
 		if (not owning_) { return; }
 		auto result = cuMemUnmap(address_range_.device_address(), address_range_.size());
-		throw_if_error(result, "Failed unmapping " + mapping::detail_::identify(address_range_));
+		throw_if_error_lazy(result, "Failed unmapping " + mapping::detail_::identify(address_range_));
 	}
 public:
@@ -470,7 +470,7 @@ class mapping_t {
 	{
 		CUmemGenericAllocationHandle allocation_handle;
 		auto status = cuMemRetainAllocationHandle(&allocation_handle, address_range_.data());
-		throw_if_error(status, " Failed obtaining/retaining the physical_allocation handle for the virtual memory "
+		throw_if_error_lazy(status, " Failed obtaining/retaining the physical_allocation handle for the virtual memory "
			"range mapped to " + cuda::detail_::ptr_as_hex(address_range_.data())
 			+ " of size " + ::std::to_string(address_range_.size()) + " bytes");
 		constexpr const bool increase_refcount{false};
@@ -508,7 +508,7 @@ inline mapping_t map(region_t region, physical_allocation_t physical_allocation)
 	constexpr const unsigned long long flags { 0 };
 	auto handle = physical_allocation.handle();
 	auto status = cuMemMap(region.device_address(), region.size(), offset_into_allocation, handle, flags);
-	throw_if_error(status, "Failed making a virtual memory mapping of "
+	throw_if_error_lazy(status, "Failed making a virtual memory mapping of "
 		+ physical_allocation::detail_::identify(physical_allocation)
 		+ " to the range of size " + ::std::to_string(region.size())
 		+ " bytes at " + cuda::detail_::ptr_as_hex(region.data()));
diff --git a/src/cuda/nvrtc/compilation_output.hpp b/src/cuda/nvrtc/compilation_output.hpp
index 382b9362..c918d8b9 100644
--- a/src/cuda/nvrtc/compilation_output.hpp
+++ b/src/cuda/nvrtc/compilation_output.hpp
@@ -74,14 +74,14 @@ inline size_t get_log_size(program::handle_t program_handle, const char* program
 {
 	size_t size;
 	auto status = nvrtcGetProgramLogSize(program_handle, &size);
-	throw_if_error(status, "Failed obtaining compilation log size for " + identify(program_handle, program_name));
+	throw_if_error_lazy(status, "Failed obtaining compilation log size for " + identify(program_handle, program_name));
 	return size;
 }
 inline void get_log(char* buffer, program::handle_t program_handle, const char *program_name = nullptr)
 {
 	auto status = nvrtcGetProgramLog(program_handle, buffer);
-	throw_if_error(status, "Failed obtaining NVRTC program compilation log for"
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program compilation log for"
 		+ identify(program_handle, program_name));
 }
@@ -91,7 +91,7 @@ inline size_t get_cubin_size(program::handle_t program_handle, const char* progr
 {
 	size_t size;
 	auto status = nvrtcGetCUBINSize(program_handle, &size);
-	throw_if_error(status, "Failed obtaining NVRTC program output CUBIN size");
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output CUBIN size");
 	if (size == 0) {
 		throw ::std::runtime_error("CUBIN requested for a program compiled for a virtual architecture only: "
 			+ identify(program_handle, program_name));
@@ -102,7 +102,7 @@ inline size_t get_cubin_size(program::handle_t program_handle, const char* progr
 inline void get_cubin(char* buffer, program::handle_t program_handle, const char *program_name = nullptr)
 {
 	auto status = nvrtcGetCUBIN(program_handle, buffer);
-	throw_if_error(status, "Failed obtaining NVRTC program output CUBIN for "
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output CUBIN for "
 		+ identify(program_handle, program_name));
 }
@@ -113,7 +113,7 @@ inline size_t get_ptx_size(program::handle_t program_handle, const char *program
 {
 	size_t size;
 	auto status = nvrtcGetPTXSize(program_handle, &size);
-	throw_if_error(status, "Failed obtaining NVRTC program output PTX size for "
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output PTX size for "
 		+ identify(program_handle, program_name));
 	return size;
 }
@@ -121,7 +121,7 @@ inline size_t get_ptx_size(program::handle_t program_handle, const char *program
 inline void get_ptx(char* buffer, program::handle_t program_handle, const char *program_name = nullptr)
 {
 	auto status = nvrtcGetPTX(program_handle, buffer);
-	throw_if_error(status, "Failed obtaining NVRTC program output PTX for "
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output PTX for "
 		+ identify(program_handle, program_name));
 }
@@ -131,7 +131,7 @@ inline size_t get_nvvm_size(program::handle_t program_handle, const char *progra
 {
 	size_t size;
 	auto status = nvrtcGetNVVMSize(program_handle, &size);
-	throw_if_error(status, "Failed obtaining NVRTC program output NVVM size for "
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output NVVM size for "
 		+ identify(program_handle, program_name));
 	return size;
 }
@@ -139,7 +139,7 @@ inline size_t get_nvvm_size(program::handle_t program_handle, const char *progra
 inline void get_nvvm(char* buffer, program::handle_t program_handle, const char *program_name = nullptr)
 {
 	auto status = nvrtcGetNVVM(program_handle, buffer);
-	throw_if_error(status, "Failed obtaining NVRTC program output NVVM for "
+	throw_if_error_lazy(status, "Failed obtaining NVRTC program output NVVM for "
 		+ identify(program_handle, program_name));
 }
@@ -258,7 +258,7 @@ class compilation_output_t {
 		size_t size;
 		auto status = nvrtcGetPTXSize(program_handle_, &size);
 		if (status == NVRTC_ERROR_INVALID_PROGRAM) { return false; }
-		throw_if_error(status, "Failed determining whether the NVRTC program has a compiled PTX result: "
+		throw_if_error_lazy(status, "Failed determining whether the NVRTC program has a compiled PTX result: "
 			+ compilation_output::detail_::identify(*this));
 		if (size == 0) {
 			throw ::std::logic_error("PTX size reported as 0 by "
@@ -303,7 +303,7 @@ class compilation_output_t {
 		size_t size;
 		auto status = nvrtcGetCUBINSize(program_handle_, &size);
 		if (status == NVRTC_ERROR_INVALID_PROGRAM) { return false; }
-		throw_if_error(status, "Failed determining whether the NVRTC program has a compiled CUBIN result: "
+		throw_if_error_lazy(status, "Failed determining whether the NVRTC program has a compiled CUBIN result: "
 			+ compilation_output::detail_::identify(*this));
 		return (size > 0);
 	}
@@ -349,7 +349,7 @@ class compilation_output_t {
 		size_t size;
 		auto status = nvrtcGetNVVMSize(program_handle_, &size);
 		if (status == NVRTC_ERROR_INVALID_PROGRAM) { return false; }
-		throw_if_error(status, "Failed determining whether the NVRTC program has a compiled NVVM result: "
+		throw_if_error_lazy(status, "Failed determining whether the NVRTC program has a compiled NVVM result: "
 			+ compilation_output::detail_::identify(*this));
 		if (size == 0) {
 			throw ::std::logic_error("NVVM size reported as 0 by NVRTC for program: "
@@ -373,7 +373,7 @@ class compilation_output_t {
 	{
 		const char* result;
 		auto status = nvrtcGetLoweredName(program_handle_, unmangled_name, &result);
-		throw_if_error(status, ::std::string("Failed obtaining the mangled form of name \"")
+		throw_if_error_lazy(status, ::std::string("Failed obtaining the mangled form of name \"")
 			+ unmangled_name + "\" in dynamically-compiled program \"" + program_name_ + '\"');
 		return result;
 	}
@@ -417,7 +417,7 @@ class compilation_output_t {
 	{
 		if (owns_handle_) {
 			auto status = nvrtcDestroyProgram(&program_handle_);
-			throw_if_error(status, "Destroying " + program::detail_::identify(program_handle_, program_name_.c_str()));
+			throw_if_error_lazy(status, "Destroying " + program::detail_::identify(program_handle_, program_name_.c_str()));
 		}
 	}
diff --git a/src/cuda/nvrtc/program.hpp b/src/cuda/nvrtc/program.hpp
index 984087f0..e9a66437 100644
--- a/src/cuda/nvrtc/program.hpp
+++ b/src/cuda/nvrtc/program.hpp
@@ -45,14 +45,14 @@ inline program::handle_t create(
 	program::handle_t program_handle;
 	auto status = nvrtcCreateProgram(
 		&program_handle, program_source, program_name, (int) num_headers, header_sources, header_names);
-	throw_if_error(status, "Failed creating an NVRTC program (named " + ::std::string(program_name) + ')');
+	throw_if_error_lazy(status, "Failed creating an NVRTC program (named " + ::std::string(program_name) + ')');
 	return program_handle;
 }
 inline void register_global(handle_t program_handle, const char *global_to_register)
 {
 	auto status = nvrtcAddNameExpression(program_handle, global_to_register);
-	throw_if_error(status, "Failed registering global entity " + ::std::string(global_to_register)
+	throw_if_error_lazy(status, "Failed registering global entity " + ::std::string(global_to_register)
 		+ " with " + identify(program_handle));
 }
@@ -387,10 +387,10 @@ supported_targets()
 {
 	int num_supported_archs;
 	auto status = nvrtcGetNumSupportedArchs(&num_supported_archs);
-	throw_if_error(status, "Failed obtaining the number of target NVRTC architectures");
+	throw_if_error_lazy(status, "Failed obtaining the number of target NVRTC architectures");
 	auto raw_archs = ::std::unique_ptr(new int[num_supported_archs]);
 	status = nvrtcGetSupportedArchs(raw_archs.get());
-	throw_if_error(status, "Failed obtaining the architectures supported by NVRTC");
+	throw_if_error_lazy(status, "Failed obtaining the architectures supported by NVRTC");
 	dynarray result;
 	result.reserve(num_supported_archs);
 	::std::transform(raw_archs.get(), raw_archs.get() + num_supported_archs, ::std::back_inserter(result),
diff --git a/src/cuda/nvrtc/versions.hpp b/src/cuda/nvrtc/versions.hpp
index 1a8e5d7e..bfa0daff 100644
--- a/src/cuda/nvrtc/versions.hpp
+++ b/src/cuda/nvrtc/versions.hpp
@@ -21,7 +21,7 @@ namespace version_numbers {
 inline version_t nvrtc() {
 	version_t version;
 	auto status = nvrtcVersion(&version.major, &version.minor);
-	throw_if_error(status, "Failed obtaining the NVRTC library version");
+	throw_if_error_lazy(status, "Failed obtaining the NVRTC library version");
 	return version;
 }
diff --git a/src/cuda/nvtx/profiling.hpp b/src/cuda/nvtx/profiling.hpp
index b2f288a1..0bf945f4 100644
--- a/src/cuda/nvtx/profiling.hpp
+++ b/src/cuda/nvtx/profiling.hpp
@@ -194,7 +194,7 @@ inline void range_end(range::handle_t range_handle)
 inline void start()
 {
 	auto status = cuProfilerStart();
-	throw_if_error(status, "Starting CUDA profiling");
+	throw_if_error_lazy(status, "Starting CUDA profiling");
 }
 /**
@@ -203,7 +203,7 @@ inline void start()
 inline void stop()
 {
 	auto status = cuProfilerStop();
-	throw_if_error(status, "Stopping CUDA profiling");
+	throw_if_error_lazy(status, "Stopping CUDA profiling");
 }
 } // namespace profiling
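Note on the overall change: every call site above swaps throw_if_error(status, message) for throw_if_error_lazy(status, message). The apparent motivation is that these messages concatenate ::std::string temporaries, identify() results and ptr_as_hex() output; a lazy variant can defer that work until the status is actually known to indicate failure. The sketch below illustrates the idea only; status_t, is_failure and the exception type are stand-ins I introduce for the example, not the library's actual definitions, which may differ.

#include <stdexcept>
#include <string>

// Stand-ins for the library's status type and helpers (assumptions for this sketch).
using status_t = int;                                   // e.g. CUresult / cudaError_t
constexpr bool is_failure(status_t s) { return s != 0; }

// Eager version: the message is an ordinary function argument, so the caller
// builds the string even when the call succeeded.
inline void throw_if_error(status_t status, const ::std::string& message)
{
	if (is_failure(status)) { throw ::std::runtime_error(message); }
}

// Lazy version: a macro defers evaluating the message expression until the
// status has been checked, so the string is only constructed on failure.
#define throw_if_error_lazy(status_, ...) \
	do { \
		status_t tie_status_ = (status_); \
		if (is_failure(tie_status_)) { \
			throw ::std::runtime_error(__VA_ARGS__); \
		} \
	} while (false)

int main()
{
	status_t status = 1; // pretend some CUDA call failed
	try {
		// The concatenation below is only performed because status is non-zero;
		// on the success path it would be skipped entirely.
		throw_if_error_lazy(status, "Failed doing X: code " + ::std::to_string(status));
	} catch (const ::std::exception& e) {
		// e.what() == "Failed doing X: code 1"
	}
}

At call sites like the ones in this diff, which sit on frequently-executed paths (stream operations, memory mapping, NVRTC queries), deferring the message construction to the failure branch avoids paying for string building on every successful call.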