Skip to content

Commit

Permalink
Fixes #585: Can now use cuda::memory::make_unique() and have it default to device-global memory
Browse files Browse the repository at this point in the history
  • Loading branch information
eyalroz committed Feb 12, 2024
1 parent 51a24a8 commit d7512ab
Show file tree
Hide file tree
Showing 15 changed files with 62 additions and 41 deletions.
4 changes: 2 additions & 2 deletions examples/by_api_module/unified_addressing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ void pointer_properties(const cuda::device_t& device)
cuda::context::create(device)
};
cuda::memory::device::unique_ptr<char[]> regions[2] = {
cuda::memory::device::make_unique<char[]>(contexts[0], fixed_size),
cuda::memory::device::make_unique<char[]>(contexts[1], fixed_size)
cuda::memory::make_unique<char[]>(contexts[0], fixed_size),
cuda::memory::make_unique<char[]>(contexts[1], fixed_size)
};
void* raw_pointers[2] = {
regions[0].get(),
Expand Down
2 changes: 1 addition & 1 deletion examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ int main(int, char **)
auto a = cuda::memory::host::make_unique<datum[]>(n);
cuda::memory::host::zero(a.get(), num_bytes);

auto d_a = cuda::memory::device::make_unique<datum[]>(device, n);
auto d_a = cuda::memory::make_unique<datum[]>(device, n);

auto launch_config = cuda::launch_config_builder()
.overall_size(n)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,9 @@ int main(int argc, const char **argv)

auto stream = device.create_stream(cuda::stream::async);
// Note: With CUDA 11, we could allocate these asynchronously on the stream
auto d_inputArr = cuda::memory::device::make_unique<int[]>(device, arrSize);
auto d_numOfOdds = cuda::memory::device::make_unique<int>(device);
auto d_sumOfOddEvenElems = cuda::memory::device::make_unique<int[]>(device, 2);
auto d_inputArr = cuda::memory::make_unique<int[]>(device, arrSize);
auto d_numOfOdds = cuda::memory::make_unique<int>(device);
auto d_sumOfOddEvenElems = cuda::memory::make_unique<int[]>(device, 2);

// Note: There's some code repetition here; unique pointers don't also keep track of the allocated size.
// Unfortunately, the standard library does not offer an owning dynamically-allocated memory region
Expand Down
6 changes: 3 additions & 3 deletions examples/modified_cuda_samples/clock_nvrtc/clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ int main()
{
const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block;

auto d_input = cuda::memory::device::make_unique<float[]>(device, input_size);
auto d_output = cuda::memory::device::make_unique<float[]>(device, num_blocks);
auto d_input = cuda::memory::make_unique<float[]>(device, input_size);
auto d_output = cuda::memory::make_unique<float[]>(device, num_blocks);
// Note: We won't actually be checking the output...
auto d_timers = cuda::memory::device::make_unique<clock_t []>(device, num_timers);
auto d_timers = cuda::memory::make_unique<clock_t []>(device, num_timers);
cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float));

auto launch_config = cuda::launch_config_builder()
Expand Down
2 changes: 1 addition & 1 deletion examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int main(int, char **)
cuda::device::current::set_to_default();
auto device = cuda::device::current::get();

auto d_ptr = cuda::memory::device::make_unique<int[]>(device, N);
auto d_ptr = cuda::memory::make_unique<int[]>(device, N);
auto h_ptr = cuda::memory::host::make_unique<int[]>(N);

std::cout << "Generating data on CPU\n";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p

for (auto device : cuda::devices()) {
streams.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down Expand Up @@ -308,8 +308,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p)
for (auto device : cuda::devices()) {
streams_0.push_back(device.create_stream(cuda::stream::async));
streams_1.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down Expand Up @@ -417,8 +417,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer

for(auto device : cuda::devices()) {
streams.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,9 @@ int main(int argc, char** argv)
std::generate_n(h_B.get(), N, generator);

// Allocate vectors in device memory
auto d_A = cuda::memory::device::make_unique<float[]>(device, N);
auto d_B = cuda::memory::device::make_unique<float[]>(device, N);
auto d_C = cuda::memory::device::make_unique<float[]>(device, N);
auto d_A = cuda::memory::make_unique<float[]>(device, N);
auto d_B = cuda::memory::make_unique<float[]>(device, N);
auto d_C = cuda::memory::make_unique<float[]>(device, N);


cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream);
Expand Down
4 changes: 2 additions & 2 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ void run_simple_streams_example(

// allocate device memory
// pointers to data and init value in the device memory
auto d_a = cuda::memory::device::make_unique<int[]>(device, params.n);
auto d_c = cuda::memory::device::make_unique<int>(device);
auto d_a = cuda::memory::make_unique<int[]>(device, params.n);
auto d_c = cuda::memory::make_unique<int>(device);
cuda::memory::copy_single(d_c.get(), &c);

std::cout << "\nStarting Test\n";
Expand Down
6 changes: 3 additions & 3 deletions examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ int main()
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();
auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ int main(void)
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ int main(void)
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
6 changes: 3 additions & 3 deletions examples/other/io_compute_overlap_with_streams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ std::vector<buffer_set_t> generate_buffers(
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements)
cuda::memory::make_unique<element_t[]>(device, num_elements),
cuda::memory::make_unique<element_t[]>(device, num_elements),
cuda::memory::make_unique<element_t[]>(device, num_elements)
};
}
);
Expand Down
10 changes: 5 additions & 5 deletions examples/other/jitify/jitify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ void my_kernel(T* data) {
// TODO: A kernel::get(const module_t& module, const char* mangled_name function)
auto kernel = module.get_kernel(mangled_kernel_name);

auto d_data = cuda::memory::device::make_unique<T>(device);
auto d_data = cuda::memory::make_unique<T>(device);
T h_data = 5;
cuda::memory::copy_single<T>(d_data.get(), &h_data);

Expand Down Expand Up @@ -242,8 +242,8 @@ void my_kernel2(float const* indata, float* outdata) {
auto my_kernel1 = module.get_kernel(mangled_kernel_names[0]);
auto my_kernel2 = module.get_kernel(mangled_kernel_names[1]);

auto indata = cuda::memory::device::make_unique<T>(device);
auto outdata = cuda::memory::device::make_unique<T>(device);
auto indata = cuda::memory::make_unique<T>(device);
auto outdata = cuda::memory::make_unique<T>(device);
T inval = 3.14159f;
cuda::memory::copy_single<T>(indata.get(), &inval);

Expand Down Expand Up @@ -308,7 +308,7 @@ __global__ void constant_test(int *x) {
cuda::memory::copy(a, &inval[0]);
cuda::memory::copy(b_a, &inval[1]);
cuda::memory::copy(c_b_a, &inval[2]);
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
Expand Down Expand Up @@ -342,7 +342,7 @@ bool test_constant_2()
int inval[] = {3, 5, 9};
cuda::memory::copy(anon_b_a, inval);
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
auto ptr = outdata.get();
Expand Down
6 changes: 3 additions & 3 deletions examples/other/vectorAdd_profiled.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ int main()
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();
auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
21 changes: 21 additions & 0 deletions src/cuda/api/unique_ptr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,27 @@ inline unique_ptr<T> make_unique();

} // namespace device

/// Allocate device-global memory for `num_elements` values of type `T` within
/// the given context, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const context_t& context, size_t num_elements) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const context_t& context, size_t num_elements)
{
	return cuda::memory::device::make_unique<T>(context, num_elements);
}

/// Allocate device-global memory for `num_elements` values of type `T` on
/// the given device, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const device_t& device, size_t num_elements) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const device_t& device, size_t num_elements)
{
	return cuda::memory::device::make_unique<T>(device, num_elements);
}

/// Allocate device-global memory for a single value of type `T` on the given
/// device, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const device_t& device) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const device_t& device)
{
	return cuda::memory::device::make_unique<T>(device);
}

namespace host {

template<typename T>
Expand Down

0 comments on commit d7512ab

Please sign in to comment.