Skip to content

Commit

Permalink
Fixes #585: Can now use cuda::memory::make_unique() and have it default to device-global memory
Browse files Browse the repository at this point in the history
  • Loading branch information
eyalroz committed Feb 12, 2024
1 parent 51a24a8 commit d7512ab
Show file tree
Hide file tree
Showing 15 changed files with 62 additions and 41 deletions.
4 changes: 2 additions & 2 deletions examples/by_api_module/unified_addressing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ void pointer_properties(const cuda::device_t& device)
cuda::context::create(device)
};
cuda::memory::device::unique_ptr<char[]> regions[2] = {
cuda::memory::device::make_unique<char[]>(contexts[0], fixed_size),
cuda::memory::device::make_unique<char[]>(contexts[1], fixed_size)
cuda::memory::make_unique<char[]>(contexts[0], fixed_size),
cuda::memory::make_unique<char[]>(contexts[1], fixed_size)
};
void* raw_pointers[2] = {
regions[0].get(),
Expand Down
2 changes: 1 addition & 1 deletion examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ int main(int, char **)
auto a = cuda::memory::host::make_unique<datum[]>(n);
cuda::memory::host::zero(a.get(), num_bytes);

auto d_a = cuda::memory::device::make_unique<datum[]>(device, n);
auto d_a = cuda::memory::make_unique<datum[]>(device, n);

auto launch_config = cuda::launch_config_builder()
.overall_size(n)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,9 @@ int main(int argc, const char **argv)

auto stream = device.create_stream(cuda::stream::async);
// Note: With CUDA 11, we could allocate these asynchronously on the stream
auto d_inputArr = cuda::memory::device::make_unique<int[]>(device, arrSize);
auto d_numOfOdds = cuda::memory::device::make_unique<int>(device);
auto d_sumOfOddEvenElems = cuda::memory::device::make_unique<int[]>(device, 2);
auto d_inputArr = cuda::memory::make_unique<int[]>(device, arrSize);
auto d_numOfOdds = cuda::memory::make_unique<int>(device);
auto d_sumOfOddEvenElems = cuda::memory::make_unique<int[]>(device, 2);

// Note: There's some code repetition here; unique pointers don't also keep track of the allocated size.
// Unfortunately, the standard library does not offer an owning dynamically-allocated memory region
Expand Down
6 changes: 3 additions & 3 deletions examples/modified_cuda_samples/clock_nvrtc/clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ int main()
{
const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block;

auto d_input = cuda::memory::device::make_unique<float[]>(device, input_size);
auto d_output = cuda::memory::device::make_unique<float[]>(device, num_blocks);
auto d_input = cuda::memory::make_unique<float[]>(device, input_size);
auto d_output = cuda::memory::make_unique<float[]>(device, num_blocks);
// Note: We won't actually be checking the output...
auto d_timers = cuda::memory::device::make_unique<clock_t []>(device, num_timers);
auto d_timers = cuda::memory::make_unique<clock_t []>(device, num_timers);
cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float));

auto launch_config = cuda::launch_config_builder()
Expand Down
2 changes: 1 addition & 1 deletion examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int main(int, char **)
cuda::device::current::set_to_default();
auto device = cuda::device::current::get();

auto d_ptr = cuda::memory::device::make_unique<int[]>(device, N);
auto d_ptr = cuda::memory::make_unique<int[]>(device, N);
auto h_ptr = cuda::memory::host::make_unique<int[]>(N);

std::cout << "Generating data on CPU\n";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p

for (auto device : cuda::devices()) {
streams.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down Expand Up @@ -308,8 +308,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p)
for (auto device : cuda::devices()) {
streams_0.push_back(device.create_stream(cuda::stream::async));
streams_1.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down Expand Up @@ -417,8 +417,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer

for(auto device : cuda::devices()) {
streams.push_back(device.create_stream(cuda::stream::async));
buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
start.push_back(device.create_event());
stop.push_back(device.create_event());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,9 @@ int main(int argc, char** argv)
std::generate_n(h_B.get(), N, generator);

// Allocate vectors in device memory
auto d_A = cuda::memory::device::make_unique<float[]>(device, N);
auto d_B = cuda::memory::device::make_unique<float[]>(device, N);
auto d_C = cuda::memory::device::make_unique<float[]>(device, N);
auto d_A = cuda::memory::make_unique<float[]>(device, N);
auto d_B = cuda::memory::make_unique<float[]>(device, N);
auto d_C = cuda::memory::make_unique<float[]>(device, N);


cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream);
Expand Down
4 changes: 2 additions & 2 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ void run_simple_streams_example(

// allocate device memory
// pointers to data and init value in the device memory
auto d_a = cuda::memory::device::make_unique<int[]>(device, params.n);
auto d_c = cuda::memory::device::make_unique<int>(device);
auto d_a = cuda::memory::make_unique<int[]>(device, params.n);
auto d_c = cuda::memory::make_unique<int>(device);
cuda::memory::copy_single(d_c.get(), &c);

std::cout << "\nStarting Test\n";
Expand Down
6 changes: 3 additions & 3 deletions examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ int main()
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();
auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ int main(void)
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ int main(void)
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
6 changes: 3 additions & 3 deletions examples/other/io_compute_overlap_with_streams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ std::vector<buffer_set_t> generate_buffers(
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::host::make_unique<element_t[]>(num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements),
cuda::memory::device::make_unique<element_t[]>(device, num_elements)
cuda::memory::make_unique<element_t[]>(device, num_elements),
cuda::memory::make_unique<element_t[]>(device, num_elements),
cuda::memory::make_unique<element_t[]>(device, num_elements)
};
}
);
Expand Down
10 changes: 5 additions & 5 deletions examples/other/jitify/jitify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ void my_kernel(T* data) {
// TODO: A kernel::get(const module_t& module, const char* mangled_name function)
auto kernel = module.get_kernel(mangled_kernel_name);

auto d_data = cuda::memory::device::make_unique<T>(device);
auto d_data = cuda::memory::make_unique<T>(device);
T h_data = 5;
cuda::memory::copy_single<T>(d_data.get(), &h_data);

Expand Down Expand Up @@ -242,8 +242,8 @@ void my_kernel2(float const* indata, float* outdata) {
auto my_kernel1 = module.get_kernel(mangled_kernel_names[0]);
auto my_kernel2 = module.get_kernel(mangled_kernel_names[1]);

auto indata = cuda::memory::device::make_unique<T>(device);
auto outdata = cuda::memory::device::make_unique<T>(device);
auto indata = cuda::memory::make_unique<T>(device);
auto outdata = cuda::memory::make_unique<T>(device);
T inval = 3.14159f;
cuda::memory::copy_single<T>(indata.get(), &inval);

Expand Down Expand Up @@ -308,7 +308,7 @@ __global__ void constant_test(int *x) {
cuda::memory::copy(a, &inval[0]);
cuda::memory::copy(b_a, &inval[1]);
cuda::memory::copy(c_b_a, &inval[2]);
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
Expand Down Expand Up @@ -342,7 +342,7 @@ bool test_constant_2()
int inval[] = {3, 5, 9};
cuda::memory::copy(anon_b_a, inval);
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
auto ptr = outdata.get();
Expand Down
6 changes: 3 additions & 3 deletions examples/other/vectorAdd_profiled.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ int main()
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();
auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
auto d_C = cuda::memory::make_unique<float[]>(device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
Expand Down
21 changes: 21 additions & 0 deletions src/cuda/api/unique_ptr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,27 @@ inline unique_ptr<T> make_unique();

} // namespace device

/// Allocate device-global memory for `num_elements` values of type `T` within
/// the given context, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const context_t& context, size_t num_elements) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const context_t& context, size_t num_elements)
{
	return cuda::memory::device::make_unique<T>(context, num_elements);
}

/// Allocate device-global memory for `num_elements` values of type `T` on
/// the given device, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const device_t& device, size_t num_elements) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const device_t& device, size_t num_elements)
{
	return cuda::memory::device::make_unique<T>(device, num_elements);
}

/// Allocate device-global memory for a single value of type `T` on the given
/// device, owned by a unique pointer.
///
/// Convenience alias for @ref device::make_unique(const device_t& device) ;
/// with no memory-space namespace specified, device-global memory is the default.
template<typename T>
inline device::unique_ptr<T> make_unique(const device_t& device)
{
	return cuda::memory::device::make_unique<T>(device);
}

namespace host {

template<typename T>
Expand Down

0 comments on commit d7512ab

Please sign in to comment.