From bbfd6ea155a26b3bdb07863dd0d19223bd7ffc30 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 6 Jul 2020 11:37:55 +0300 Subject: [PATCH] Fixes #9: A near-complete revamp of the APIs, now taking the driver API into account and exposing most of its functionality. --- .github/workflows/cmake-build-linux.yml | 30 +- .github/workflows/cmake-build-windows.yml | 24 +- CMakeLists.txt | 12 +- examples/CMakeLists.txt | 5 +- .../context_management.cpp | 191 +++ .../device_management.cpp | 43 +- .../by_runtime_api_module/error_handling.cu | 34 +- .../by_runtime_api_module/event_management.cu | 4 +- .../execution_control.cu | 13 +- examples/by_runtime_api_module/ipc.cpp | 7 +- .../stream_management.cu | 2 +- .../unified_addressing.cpp | 135 +- .../version_management.cpp | 6 +- examples/common.hpp | 109 +- .../binaryPartitionCG/binaryPartitionCG.cu | 2 +- .../modified_cuda_samples/helper_cuda.hpp | 1 - .../inlinePTX/inlinePTX.cu | 20 +- .../p2pBandwidthLatencyTest.cu | 4 +- .../simpleDrvRuntimePTX.cpp | 175 ++ .../simpleIPC/simpleIPC.cu | 9 +- .../simpleStreams/simpleStreams.cu | 18 +- .../vectorAdd/vectorAdd.cu | 4 +- .../vectorAddManaged/vectorAddManaged.cu | 3 +- .../vectorAddMapped/vectorAddMapped.cu | 2 +- examples/other/array_management.cu | 2 +- .../main.cpp | 2 +- .../second_tu.cpp | 2 +- .../other/io_compute_overlap_with_streams.cu | 197 +-- examples/other/manipulate_current_device.cu | 59 + src/cuda/api.hpp | 50 + src/cuda/api/apriori_compiled_kernel.hpp | 145 ++ src/cuda/api/array.hpp | 120 +- src/cuda/api/constants.hpp | 12 +- src/cuda/api/context.hpp | 850 +++++++++ src/cuda/api/current_context.hpp | 241 +++ src/cuda/api/current_device.hpp | 149 +- src/cuda/api/detail/device_properties.hpp | 74 +- src/cuda/api/device.hpp | 672 ++++---- src/cuda/api/device_properties.hpp | 10 +- src/cuda/api/devices.hpp | 7 +- src/cuda/api/error.hpp | 281 ++- src/cuda/api/event.hpp | 175 +- src/cuda/api/ipc.hpp | 61 +- src/cuda/api/kernel.hpp | 446 +++-- src/cuda/api/kernel_launch.hpp | 184 +- src/cuda/api/link.hpp | 215 +++ src/cuda/api/link_options.hpp | 339 ++++ src/cuda/api/memory.hpp | 1346 ++++++++++----- src/cuda/api/miscellany.hpp | 51 +- src/cuda/api/module.hpp | 355 ++++ src/cuda/api/multi_wrapper_impls.hpp | 1524 +++++++++++++---- src/cuda/api/pci_id.hpp | 6 +- src/cuda/api/pci_id_impl.hpp | 1 + src/cuda/api/peer_to_peer.hpp | 187 +- src/cuda/api/pointer.hpp | 155 +- src/cuda/api/primary_context.hpp | 328 ++++ src/cuda/api/stream.hpp | 537 ++++-- src/cuda/api/texture_view.hpp | 72 +- src/cuda/{common => api}/types.hpp | 413 +++-- src/cuda/api/unique_ptr.hpp | 106 +- src/cuda/api/versions.hpp | 14 +- src/cuda/api/virtual_memory.hpp | 524 ++++++ src/cuda/nvtx/profiling.hpp | 2 +- src/cuda/runtime_api.hpp | 7 +- 64 files changed, 8434 insertions(+), 2340 deletions(-) create mode 100644 examples/by_driver_api_module/context_management.cpp create mode 100644 examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp create mode 100644 examples/other/manipulate_current_device.cu create mode 100644 src/cuda/api.hpp create mode 100644 src/cuda/api/apriori_compiled_kernel.hpp create mode 100644 src/cuda/api/context.hpp create mode 100644 src/cuda/api/current_context.hpp create mode 100644 src/cuda/api/link.hpp create mode 100644 src/cuda/api/link_options.hpp create mode 100644 src/cuda/api/module.hpp create mode 100644 src/cuda/api/primary_context.hpp rename src/cuda/{common => api}/types.hpp (65%) create mode 100644 src/cuda/api/virtual_memory.hpp diff --git 
a/.github/workflows/cmake-build-linux.yml b/.github/workflows/cmake-build-linux.yml index 922f37c2..d4e8956e 100644 --- a/.github/workflows/cmake-build-linux.yml +++ b/.github/workflows/cmake-build-linux.yml @@ -63,21 +63,21 @@ jobs: gcc: 9 # may fail with gcc-10 due to an internal compiler error shell: "bash" cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.2" - gcc: 8 - shell: "bash" - cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.1" - gcc: 8 - shell: "bash" - cmake-generator: "Unix Makefiles" - - os: ubuntu-18.04 - cuda: "10.0" - gcc: 7 # fails with GCC 8 - no supported in CUDA 10.0 - shell: "bash" - cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.2" +# gcc: 8 +# shell: "bash" +# cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.1" +# gcc: 8 +# shell: "bash" +# cmake-generator: "Unix Makefiles" +# - os: ubuntu-18.04 +# cuda: "10.0" +# gcc: 7 # fails with GCC 8 - no supported in CUDA 10.0 +# shell: "bash" +# cmake-generator: "Unix Makefiles" # GitHub has remoted ubuntu-16.04 runnings, # so we're not testing builds with older CUDA versions # - os: ubuntu-16.04 diff --git a/.github/workflows/cmake-build-windows.yml b/.github/workflows/cmake-build-windows.yml index c24f684f..c9cc16c2 100644 --- a/.github/workflows/cmake-build-windows.yml +++ b/.github/workflows/cmake-build-windows.yml @@ -66,18 +66,18 @@ jobs: shell: "powershell" os-type: "windows" cmake-platform-flag: "-A x64" - - os: windows-2019 - cuda: "10.2.89" - visual-studio: "Visual Studio 16 2019" - shell: "powershell" - os-type: "windows" - cmake-platform-flag: "-A x64" - - os: windows-2019 - cuda: "10.1.243" - visual-studio: "Visual Studio 16 2019" - shell: "powershell" - os-type: "windows" - cmake-platform-flag: "-A x64" +# - os: windows-2019 +# cuda: "10.2.89" +# visual-studio: "Visual Studio 16 2019" +# shell: "powershell" +# os-type: "windows" +# cmake-platform-flag: "-A x64" +# - os: windows-2019 +# cuda: "10.1.243" +# visual-studio: "Visual Studio 16 2019" +# shell: "powershell" +# os-type: "windows" +# cmake-platform-flag: "-A x64" # Windows2016 & VS 2017 supports 10.0+ # - os: windows-2016 diff --git a/CMakeLists.txt b/CMakeLists.txt index f07388c7..d4fcbfb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,14 +17,14 @@ if(WIN32 AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") endif() PROJECT(cuda-api-wrappers - VERSION 0.4.4 + VERSION 0.5.0 DESCRIPTION "Thin C++-flavored wrappers for the CUDA Runtime API" HOMEPAGE_URL https://github.com/eyalroz/cuda-api-wrappers LANGUAGES CUDA CXX) include(GNUInstallDirs) -find_package(CUDAToolkit REQUIRED) +find_package(CUDAToolkit 11.0 REQUIRED) find_package(Threads REQUIRED) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) @@ -35,9 +35,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "lib/") # Our library targets # ------------------- -add_library(runtime-api INTERFACE) # A header-only library! +add_library(runtime-and-driver INTERFACE) # A header-only library! 
add_library(nvtx) -set(wrapper-libraries runtime-api nvtx) +set(wrapper-libraries runtime-and-driver nvtx) foreach(WRAPPER_LIB ${wrapper-libraries}) target_compile_features(${WRAPPER_LIB} INTERFACE cxx_std_11) # This means _at least_ C++11 @@ -47,11 +47,11 @@ foreach(WRAPPER_LIB ${wrapper-libraries}) "$" "$" ) - target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart) # CUDA::cuda_driver) + target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver) endforeach() set_target_properties(nvtx PROPERTIES OUTPUT_NAME "cuda-nvtx-wrappers") -target_link_libraries(nvtx PUBLIC runtime-api) +target_link_libraries(nvtx PUBLIC runtime-and-driver) target_link_libraries(nvtx PRIVATE Threads::Threads CUDA::nvToolsExt) set_property(TARGET nvtx PROPERTY CXX_STANDARD 11) set_property(TARGET nvtx PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b98970e7..b928cdf0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -28,12 +28,13 @@ set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_EXTENSIONS OFF) -link_libraries(runtime-api) +link_libraries(runtime-and-driver) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin") add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu) add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu) add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu) +add_executable(simpleDrvRuntimePTX modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp) add_executable(inlinePTX modified_cuda_samples/inlinePTX/inlinePTX.cu) add_executable(simpleStreams modified_cuda_samples/simpleStreams/simpleStreams.cu) add_executable(simpleIPC modified_cuda_samples/simpleIPC/simpleIPC.cu) @@ -50,12 +51,14 @@ add_dependencies(modified_cuda_samples vectorAdd inlinePTX simpleStreams simpleI add_executable(version_management by_runtime_api_module/version_management.cpp) add_executable(error_handling by_runtime_api_module/error_handling.cu) add_executable(device_management by_runtime_api_module/device_management.cpp) +add_executable(context_management by_driver_api_module/context_management.cpp) add_executable(execution_control by_runtime_api_module/execution_control.cu) add_executable(stream_management by_runtime_api_module/stream_management.cu) add_executable(event_management by_runtime_api_module/event_management.cu) add_executable(unified_addressing by_runtime_api_module/unified_addressing.cpp) add_executable(io_compute_overlap_with_streams other/io_compute_overlap_with_streams.cu) +add_executable(manipulate_current_device other/manipulate_current_device.cu) add_executable(inclusion_in_two_translation_units other/inclusion_in_two_translation_units/main.cpp other/inclusion_in_two_translation_units/second_tu.cpp ) if(NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang") diff --git a/examples/by_driver_api_module/context_management.cpp b/examples/by_driver_api_module/context_management.cpp new file mode 100644 index 00000000..53fe6f31 --- /dev/null +++ b/examples/by_driver_api_module/context_management.cpp @@ -0,0 +1,191 @@ +/** + * An example program utilizing most/all calls from the CUDA + * Driver API module: + * + * Device Management + */ +#include "../common.hpp" + +void current_context_manipulation(const cuda::device_t &device, const cuda::device::primary_context_t &pc, + const cuda::context_t &created_context); + +void test_context( + const cuda::context_t& context, + bool is_primary, + 
cuda::device::id_t device_id) +{ + std::cout << "Testing " << (is_primary ? "" : "non-") << "primary context " << context << '\n'; + if (context.device_id() != device_id) { + die_("The device's primary context's reported ID and the device wrapper's ID differ: " + + std::to_string(context.device_id()) + " !=" + std::to_string(device_id)); + } + + if (context.device().id() != device_id) { + die_("The context's associated device's ID is not the same as that of the device for which we obtained the context: " + + std::to_string(context.device().id()) + " !=" + std::to_string(device_id) ); + } + + if (context.is_primary() != is_primary) { + die_(std::string("The ") + (is_primary ? "" : "non-") + "primary context " + std::to_string(context) + + " \"believes\" it is " + (is_primary ? "not " : "") + "primary."); + } + + // Specific attributes and properties with their own API calls: + // L1/shared mem (CacheConfig), shared memory bank size (SharedMemConfig) + // and stream priority range + // ---------------------------------------------------------------- + + auto cache_preference = context.cache_preference(); + std::cout << "The cache preference for context " << context << " is: " << cache_preference << ".\n"; + + auto new_cache_preference = + cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ? + cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 : + cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory; + context.set_cache_preference(new_cache_preference); + cache_preference = context.cache_preference(); + assert_(cache_preference == new_cache_preference); + std::cout << "The cache preference for context " << context << " has now been set to: " << new_cache_preference << ".\n"; + + auto shared_mem_bank_size = context.shared_memory_bank_size(); + shared_mem_bank_size = + (shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ? + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; + context.set_shared_memory_bank_size(shared_mem_bank_size); + auto stream_priority_range = context.stream_priority_range(); + if (stream_priority_range.is_trivial()) { + std::cout << "Context " << context << " does not support stream priorities. " + "All streams will have the same (default) priority.\n"; + } + else { + std::cout << "Streams in context " << context << " have priorities between " + << stream_priority_range.least << " (highest numeric value, least prioritized) and " + << std::to_string(stream_priority_range.greatest) << "(lowest numeric values, most prioritized).\n"; + assert(stream_priority_range.least > stream_priority_range.greatest); + } + + // Resource limits + // -------------------- + + auto printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + std::cout << "The printf FIFO size for context " << context << " is " << printf_fifo_size << ".\n"; + decltype(printf_fifo_size) new_printf_fifo_size = + (printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512; + context.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size); + printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + assert_(printf_fifo_size == new_printf_fifo_size); + + // Flags - yes, yet another kind of attribute/property + // ---------------------------------------------------- + + std::cout << "Context " << context << " uses a" + << (context.synch_scheduling_policy() ? 
" synchronous" : "n asynchronous") + << " scheduling policy.\n"; + std::cout << "Context " << context << " is set to " + << (context.keeping_larger_local_mem_after_resize() ? "keep" : "discard") + << " shared memory allocation after launch.\n"; + // TODO: Change the settings as well obtaining them + +} + +void current_context_manipulation( + cuda::device_t &device, + cuda::device::primary_context_t &pc, + cuda::context_t &created_context) +{ + cuda::context_t context_0 = pc; + cuda::context_t context_1 = created_context; + cuda::context::current::set(context_0); + assert_(cuda::context::current::get() == context_0); + assert_(cuda::context::current::detail_::get_handle() == context_0.handle()); + cuda::context::current::set(context_1); + assert_(cuda::context::current::get() == context_1); + assert_(cuda::context::current::detail_::get_handle() == context_1.handle()); + + + auto context_2 = cuda::context::create(device); + { + cuda::context::current::scoped_override_t context_for_this_block { context_2 }; + assert_(context_2.handle() == cuda::context::current::get().handle()); + assert_(context_2 == cuda::context::current::get()); + } + auto gotten = cuda::context::current::get(); + assert_(gotten == context_1); + + auto context_3 = cuda::context::create_and_push(device); + +// std::cout << "Contexts:\n"; +// std::cout << "context_0: " << context_0 << '\n'; +// std::cout << "context_1: " << context_1 << '\n'; +// std::cout << "context_2: " << context_2 << '\n'; +// std::cout << "context_3: " << context_3 << '\n'; + + { + cuda::context::current::scoped_override_t context_for_this_block { context_3 }; + assert_(context_3.handle() == cuda::context::current::get().handle()); + assert_(context_3 == cuda::context::current::get()); + } + + { + auto popped = cuda::context::current::pop(); + assert_(popped == context_3); + } + gotten = cuda::context::current::get(); + assert_(gotten == context_1); +} + + +int main(int argc, char **argv) +{ + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + + if (cuda::device::count() <= device_id) { + die_("No CUDA device with ID " + std::to_string(device_id)); + } + + auto device = cuda::device::get(device_id); + + std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n"; + +// report_context_stack("Before anything is done"); + auto pc = device.primary_context(); +// report_context_stack("After getting the primary context"); + + + cuda::context::current::push(pc); + constexpr const bool is_primary = true; + constexpr const bool isnt_primary = false; + test_context(pc, is_primary, device_id); + + { + auto popped = cuda::context::current::pop(); + if (popped != pc) { + die_("After pushing context " + std::to_string(pc) + " and popping it - the pop result is a different context, " + std::to_string(popped)); + } + } + + auto created_context = cuda::context::create(device); + test_context(created_context, isnt_primary, device_id); + current_context_manipulation(device, pc, created_context); + + std::cout << std::endl; +// report_context_stack("After current_context_manipulation"); + cuda::context::current::push(created_context); + cuda::context::current::push(created_context); + // We should have 3 copies of created_context on the stack at this point, and nothing else + cudaSetDevice(device_id); +// report_context_stack("After cudaSetDevice " + std::to_string(device_id)); + // We should have the primary context of the device + + + device.synchronize(); + device.reset(); + + std::cout << "\nSUCCESS\n"; +} diff --git a/examples/by_runtime_api_module/device_management.cpp b/examples/by_runtime_api_module/device_management.cpp index d5fb6d4b..8aa7de52 100644 --- a/examples/by_runtime_api_module/device_management.cpp +++ b/examples/by_runtime_api_module/device_management.cpp @@ -41,7 +41,7 @@ void attributes_and_properties() { auto device = cuda::device::current::get(); - auto max_registers_per_block = device.get_attribute(cudaDevAttrMaxRegistersPerBlock); + auto max_registers_per_block = device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); std::cout << "Maximum number of registers per block on this device: " << max_registers_per_block << "\n"; @@ -55,7 +55,7 @@ void pci_bus_id() auto pci_id = device.pci_id(); std::string pci_id_str(pci_id); - cuda::outstanding_error::ensure_none(cuda::do_clear_errors); + cuda::outstanding_error::ensure_none(); auto re_obtained_device = cuda::device::get(pci_id_str); assert_(re_obtained_device == device); @@ -67,21 +67,19 @@ void global_memory() auto device = cuda::device::current::get(); auto device_global_mem = device.memory(); + auto total_memory = device_global_mem.amount_total(); + auto free_memory = device_global_mem.amount_total(); - assert_(device_global_mem.associated_device() == device); + std::cout + << "Device " << std::to_string(device.id()) << " reports it has " + << free_memory << " bytes free out of " << total_memory << " bytes total global memory " + << "(" << (total_memory - free_memory) << " bytes used).\n"; - if (device.id() != device.memory().associated_device().id()) { - die_("The device's reported ID and the device's memory object's reported device ID differ: " + if (device != device.memory().associated_device()) { + die_("The device's reported ID and the device's memory object's reported devices differ: " + std::to_string(device.id()) + " !=" + std::to_string(device.memory().associated_device().id())); } - auto total_memory = device_global_mem.amount_total(); - auto free_memory = 
device_global_mem.amount_total(); - - std::cout - << "Device " << std::to_string(device.id()) << " reports it has:\n" - << free_memory << " Bytes free out of " << total_memory << " Bytes total global memory.\n"; - assert_(free_memory <= total_memory); } @@ -91,16 +89,21 @@ void global_memory() void shared_memory() { auto device = cuda::device::current::get(); +// auto primary_context = device.primary_context(); +// report_context_stack("After getting the current device (which is " + std::to_string(device.id()) + ')'); auto reported_cache_preference = device.cache_preference(); std::cout << "The cache preference for device " << device.id() << " is: \"" << reported_cache_preference << "\".\n"; +// report_context_stack("After getting the cache preference for device " + std::to_string(device.id())); + auto applied_cache_preference = reported_cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ? cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 : cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory; device.set_cache_preference(applied_cache_preference); +// report_context_stack("After setting cache pref"); reported_cache_preference = device.cache_preference(); if (reported_cache_preference != applied_cache_preference) { std::cerr << "After setting cache preference to \"" @@ -119,8 +122,8 @@ void shared_memory() std::cout << "The reported shared memory bank size for device " << device.id() << " is: " << bank_size_names[reported_shared_mem_bank_size] << '.' << std::endl; auto applied_shared_mem_bank_size = - (reported_shared_mem_bank_size == cudaSharedMemBankSizeFourByte) ? - cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte; + (reported_shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ? + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE; device.set_shared_memory_bank_size(applied_shared_mem_bank_size); // We can't reliably check the bank size setting succeeded, since some devices, which @@ -155,12 +158,12 @@ void limits() { auto device = cuda::device::current::get(); - auto printf_fifo_size = device.get_limit(cudaLimitPrintfFifoSize); + auto printf_fifo_size = device.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); std::cout << "The printf FIFO size for device " << device.id() << " is " << printf_fifo_size << ".\n"; decltype(printf_fifo_size) new_printf_fifo_size = (printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512; - device.set_limit(cudaLimitPrintfFifoSize, new_printf_fifo_size); - printf_fifo_size = device.get_limit(cudaLimitPrintfFifoSize); + device.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size); + printf_fifo_size = device.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); assert_(printf_fifo_size == new_printf_fifo_size); } @@ -187,7 +190,7 @@ void peer_to_peer(std::pair peer_ids) auto peer = cuda::device::get(peer_ids.second); if (device.can_access(peer)) { auto atomics_supported_over_link = cuda::device::peer_to_peer::get_attribute( - cudaDevP2PAttrNativeAtomicSupported, device, peer); + cuda::device::peer_to_peer::native_atomics_support, device, peer); std::cout << "Native atomics are " << (atomics_supported_over_link ? 
"" : "not ") << "supported over the link from device " << device.id() @@ -225,7 +228,9 @@ void current_device_manipulation() (void) e; // This avoids a spurious warning in MSVC 16.11 assert_(e.code() == cuda::status::invalid_device); // We expected to get this exception, just clear it - cuda::outstanding_error::clear(); + cuda::outstanding_error::ensure_none( + "The attempt to set the current device to an invalid value should not " + "create an outstanding error"); } // Iterate over all devices diff --git a/examples/by_runtime_api_module/error_handling.cu b/examples/by_runtime_api_module/error_handling.cu index 117e2c23..ebfd045d 100644 --- a/examples/by_runtime_api_module/error_handling.cu +++ b/examples/by_runtime_api_module/error_handling.cu @@ -27,40 +27,10 @@ int main(int, char **) } try { cuda::outstanding_error::ensure_none(); - die_("An exception should have be thrown when ensuring there were no outstanding errors (as we had just triggered one)"); } - catch(cuda::runtime_error&) { } - - cuda::outstanding_error::ensure_none(); - - // An exception was not thrown, since by default, - // ensure_no_outstanding_error() clears the error it finds - - // ... Let's do the whole thing again, but this time _without_ - // clearing the error - - try { - cuda::device::current::detail_::set(device_count); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); + catch(cuda::runtime_error&) { + die_("An error was outstanding, despite our not having committed any 'sticky' errors)"); } - catch(cuda::runtime_error&) { } - - try { - cuda::outstanding_error::ensure_none(cuda::dont_clear_errors); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); - } - catch(cuda::runtime_error&) { } - - try { - cuda::outstanding_error::ensure_none(cuda::dont_clear_errors); - die_("An exception should have be thrown when setting the current device to one-past-the-last."); - } - catch(cuda::runtime_error&) { } - - // This time around, repeated calls to ensure_no_outstanding_error do throw... - - cuda::outstanding_error::clear(); - cuda::outstanding_error::ensure_none(); // ... 
and that makes them stop std::cout << "SUCCESS\n"; return EXIT_SUCCESS; diff --git a/examples/by_runtime_api_module/event_management.cu b/examples/by_runtime_api_module/event_management.cu index b13f8d5c..30acdd2b 100644 --- a/examples/by_runtime_api_module/event_management.cu +++ b/examples/by_runtime_api_module/event_management.cu @@ -97,8 +97,8 @@ int main(int argc, char **argv) constexpr size_t buffer_size = 12345678; auto buffer = cuda::memory::managed::make_unique( device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices); - auto wrapped_kernel = cuda::kernel::wrap(device, increment); - cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.attributes().maxThreadsPerBlock; + auto wrapped_kernel = cuda::kernel::get(device, increment); + cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.maximum_threads_per_block(); cuda::grid::dimension_t num_blocks = div_rounding_up(buffer_size, threads_per_block); auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block); diff --git a/examples/by_runtime_api_module/execution_control.cu b/examples/by_runtime_api_module/execution_control.cu index 78576131..7e039fe0 100644 --- a/examples/by_runtime_api_module/execution_control.cu +++ b/examples/by_runtime_api_module/execution_control.cu @@ -63,16 +63,15 @@ int main(int argc, char **argv) auto device = cuda::device::get(device_id).make_current(); std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n"; - auto kernel = cuda::kernel::wrap(device, kernel_function); + auto kernel = cuda::kernel::get(device, kernel_function); // ------------------------------------------ // Attributes without a specific API call // ------------------------------------------ - auto attributes = kernel.attributes(); std::cout << "The PTX version used in compiling device function " << kernel_name - << " is " << attributes.ptx_version() << ".\n"; + << " is " << kernel.ptx_version() << ".\n"; std::string cache_preference_names[] = { "No preference", @@ -102,7 +101,7 @@ int main(int argc, char **argv) const int bar = 123; const unsigned num_blocks = 3; - auto max_threads_per_block = attributes.maxThreadsPerBlock; + auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block); std::cout << "Launching kernel " << kernel_name @@ -175,10 +174,12 @@ int main(int argc, char **argv) if (not (e.code() == cuda::status::not_supported)) { throw e; } - cuda::outstanding_error::clear(); + // We should really not have a sticky error at this point, but lets' make + // extra sure. + cuda::outstanding_error::ensure_none(); } #endif - auto non_cooperative_kernel = cuda::kernel::wrap(device, kernel_function); + auto non_cooperative_kernel = cuda::kernel::get(device, kernel_function); auto non_cooperative_config = launch_config; non_cooperative_config.block_cooperation = true; std::cout diff --git a/examples/by_runtime_api_module/ipc.cpp b/examples/by_runtime_api_module/ipc.cpp index 6c7f4058..7cc5614a 100644 --- a/examples/by_runtime_api_module/ipc.cpp +++ b/examples/by_runtime_api_module/ipc.cpp @@ -17,16 +17,15 @@ * about from the other process. 
* */ -#include -#include -#include - #include #include #include #include #include +#include +#include +#include [[noreturn]] void die_(const std::string& message) diff --git a/examples/by_runtime_api_module/stream_management.cu b/examples/by_runtime_api_module/stream_management.cu index af92a93d..0418fb3a 100644 --- a/examples/by_runtime_api_module/stream_management.cu +++ b/examples/by_runtime_api_module/stream_management.cu @@ -172,7 +172,7 @@ int main(int argc, char **argv) print_first_char(buffer.get()); } ); - auto threads_per_block = cuda::kernel::wrap(device, increment).attributes().maxThreadsPerBlock; + auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); auto num_blocks = div_rounding_up(buffer_size, threads_per_block); auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block); // TODO: The following doesn't have much of a meaningful effect; we should modify this example diff --git a/examples/by_runtime_api_module/unified_addressing.cpp b/examples/by_runtime_api_module/unified_addressing.cpp index 5a796f9b..520dc68d 100644 --- a/examples/by_runtime_api_module/unified_addressing.cpp +++ b/examples/by_runtime_api_module/unified_addressing.cpp @@ -8,33 +8,91 @@ * one kernel, wait for the other process' kernel to * complete execution, and inspect each other's kernel's * output - in an output buffer that each of them learns - * about from the other process. + * about from the other process + * + * TODO: Mostly unimplemented for now. * */ -#include -#include -#include -#include +#include "../common.hpp" -#include -#include #include #include -[[noreturn]] bool die_(const std::string& message) -{ - std::cerr << message << "\n"; - exit(EXIT_FAILURE); -} +namespace tests { -int main(int argc, char **argv) +void pointer_properties(const cuda::device_t& device) { - cuda::device::id_t device_id = (argc > 1) ? 
- std::stoi(argv[1]) : cuda::device::default_device_id; - auto device = cuda::device::get(device_id); + constexpr const cuda::size_t fixed_size { 123 }; + cuda::context_t contexts[2] = { + cuda::context::create(device), + cuda::context::create(device) + }; + cuda::memory::device::unique_ptr regions[2] = { + cuda::memory::device::make_unique(contexts[0], fixed_size), + cuda::memory::device::make_unique(contexts[1], fixed_size) + }; + void* raw_pointers[2] = { + regions[0].get(), + regions[1].get() + }; + cuda::memory::pointer_t pointers[2] = { + cuda::memory::pointer::wrap(raw_pointers[0]), + cuda::memory::pointer::wrap(raw_pointers[1]), + }; + auto primary_context = device.primary_context(); + cuda::context::current::push(primary_context); // so that we check from a different context + for(size_t i = 0; i < 2; i++) { + auto reported_device_id = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(reported_device_id == device.id()); + auto context_handle = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(context_handle == contexts[i].handle()); + auto ptr_mem_type = cuda::memory::type_of(raw_pointers[i]); + assert_(ptr_mem_type == cuda::memory::type_t::device_ or ptr_mem_type == cuda::memory::type_t::unified_); + if (i == 0) { + std::cout << "The memory type reported for pointers to memory allocated on the device is: " << memory_type_name(ptr_mem_type) << "\n"; + } + assert_(pointers[i].get_for_device() == raw_pointers[i]); + try { + [[maybe_unused]] auto host_ptr = pointers[i].get_for_host(); + die_("Was expecting the host_ptr() method to fail for a device-side pointer"); + } catch(cuda::runtime_error& e) { + if (e.code() != cuda::status::named_t::invalid_value) { + throw e; + } + } + auto ptr_reported_as_managed = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(ptr_reported_as_managed == 0); +// auto ptr_reported_as_mapped = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); +// assert_(ptr_reported_as_mapped == 0); +#if CUDA_VERSION >= 11030 + auto mempool_handle = cuda::memory::pointer::detail_::get_attribute(raw_pointers[i]); + assert_(mempool_handle == nullptr); +#endif + auto raw_offset_ptr = cuda::memory::as_pointer(cuda::memory::device::address(raw_pointers[i]) + 17); - std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")" << std::endl; + cuda::memory::region_t range = pointers[i].containing_range(); + cuda::memory::pointer_t offset_ptr { raw_offset_ptr }; + cuda::memory::region_t range_for_offset_ptr = offset_ptr.containing_range(); + assert_(range == range_for_offset_ptr); + assert_(range_for_offset_ptr.start() == raw_pointers[i]); +// std::cout << "range_for_offset_ptr.start() == " << range_for_offset_ptr.start() << '\n'; +// std::cout << "range_for_offset_ptr.size() == " << range_for_offset_ptr.size() << '\n'; +// std::cout << "offset_ptr == " << offset_ptr.get() << '\n'; + + // Consider testing: + // CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE + // CU_POINTER_ATTRIBUTE_MAPPED + // CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES + // CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE + // CU_POINTER_ATTRIBUTE_ACCESS_FLAGS + + } + +} + +void wrapped_pointers_and_regions(const cuda::device_t& device) +{ static const size_t allocation_size { 1024 }; auto memory_region = device.memory().allocate(allocation_size); @@ -44,19 +102,21 @@ int main(int argc, char **argv) << "Verifying a wrapper for raw pointer " << memory_region.start() << " allocated on the CUDA 
device." << std::endl; - switch (ptr.attributes().memory_type()) { + switch (cuda::memory::type_of(ptr)) { using namespace cuda::memory; - case host_memory: die_("Pointer incorrectly reported to point into host memory"); break; - case managed_memory: die_("Pointer incorrectly reported not to point to managed memory"); break; - case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; - case device_memory: break; + case host_: die_("Pointer incorrectly reported to point into host memory"); break; + case array: die_("Pointer incorrectly reported to point to array memory"); break; +// case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; + case unified_: std::cout << "Allocated global-device-memory pointer reported to be of unified memory type."; + // die_("Pointer incorrectly reported not to point to managed memory"); break; + case device_: break; } { auto ptr_device = ptr.device(); auto ptr_device_id = ptr_device.id(); - (ptr_device_id == device_id) or die_( - "Pointer incorrectly reported as associated with device ID " + std::to_string(ptr_device_id) + - " rather than " + std::to_string(device_id) + "\n"); + (ptr_device_id == device.id()) or die_( + "Pointer incorrectly reported as associated with " + cuda::device::detail_::identify(device.id()) + + " rather than + " + cuda::device::detail_::identify(device.id())); } (ptr.get() == memory_region.start()) or die_("Invalid get() output"); if (ptr.get_for_device() != memory_region.start()) { @@ -66,7 +126,30 @@ int main(int argc, char **argv) << ptr.get_for_device() << " != " << memory_region.start(); die_(ss.str()); } - (ptr.get_for_host() == nullptr) or die_("Unexpected non-nullptr host-side address reported"); + try { + auto host_side_ptr = ptr.get_for_host(); + std::stringstream ss; + ss << "Unexpected success getting a host-side pointer for a device-only allocation; allocated pointer: " + << ptr.get() << ", " << " host-side pointer: " << host_side_ptr; + } + catch(cuda::runtime_error& e) { + if (e.code() != cuda::status::invalid_value) { throw e; } + } +} + +} // namespace tests + +int main(int argc, char **argv) +{ + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")" << std::endl; + + tests::wrapped_pointers_and_regions(device); + + tests::pointer_properties(device); std::cout << "\nSUCCESS\n"; return EXIT_SUCCESS; diff --git a/examples/by_runtime_api_module/version_management.cpp b/examples/by_runtime_api_module/version_management.cpp index 6a0aeb27..9da77095 100644 --- a/examples/by_runtime_api_module/version_management.cpp +++ b/examples/by_runtime_api_module/version_management.cpp @@ -5,11 +5,11 @@ * Version Management * */ -#include -#include #include #include #include +#include +#include [[noreturn]] void die_(const std::string& message) { @@ -27,7 +27,7 @@ int main(int, char **) auto runtime_version = cuda::version_numbers::runtime(); std::cout << "Using CUDA runtime version " << runtime_version << ".\n"; - auto driver_supported_version = cuda::version_numbers::maximum_supported_by_driver(); + auto driver_supported_version = cuda::version_numbers::driver(); if (driver_supported_version == cuda::version_numbers::none()) { std::cout << "There is no CUDA driver installed, so no CUDA runtime version is supported\n"; } diff --git a/examples/common.hpp b/examples/common.hpp index 61a27cd4..c3e05155 100644 --- a/examples/common.hpp +++ b/examples/common.hpp @@ -6,10 +6,18 @@ #ifndef EXAMPLES_COMMON_HPP_ #define EXAMPLES_COMMON_HPP_ -#include +// These next few lines allow for reporting the context +// stack contents within API code during debugging, but otherwise +// are not used. #include #include + +void report_current_context(const std::string& prefix); +void report_context_stack(const std::string& prefix); + +#include + #include #include #include @@ -19,7 +27,6 @@ #include #include #include -#include #include #include @@ -34,7 +41,7 @@ const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref) return cache_preference_names[(off_t) pref]; } -const char* host_thread_synch_scheduling_policy_name(cuda::host_thread_synch_scheduling_policy_t policy) +const char* host_thread_synch_scheduling_policy_name(cuda::context::host_thread_synch_scheduling_policy_t policy) { static const char *names[] = { "heuristic", @@ -47,6 +54,17 @@ const char* host_thread_synch_scheduling_policy_name(cuda::host_thread_synch_sch return names[(off_t) policy]; } +const char* memory_type_name(cuda::memory::type_t mem_type) +{ + static const char* memory_type_names[] = { + "N/A", + "host", + "device", + "array", + "unified" + }; + return memory_type_names[mem_type]; +} namespace std { @@ -60,11 +78,21 @@ std::ostream& operator<<(std::ostream& os, cuda::multiprocessor_cache_preference return (os << cache_preference_name(pref)); } -std::ostream& operator<<(std::ostream& os, cuda::host_thread_synch_scheduling_policy_t pref) +std::ostream& operator<<(std::ostream& os, cuda::context::host_thread_synch_scheduling_policy_t pref) { return (os << host_thread_synch_scheduling_policy_name(pref)); } +std::ostream& operator<<(std::ostream& os, cuda::context::handle_t handle) +{ + return (os << cuda::detail_::ptr_as_hex(handle)); +} + +std::ostream& operator<<(std::ostream& os, const cuda::context_t& context) +{ + return os << "[device " << context.device_id() << " handle " << context.handle() << ']'; +} + std::ostream& operator<<(std::ostream& os, const cuda::device_t& device) { return os << cuda::device::detail_::identify(device.id()); @@ -75,6 +103,14 @@ std::ostream& 
operator<<(std::ostream& os, const cuda::stream_t& stream) return os << cuda::stream::detail_::identify(stream.handle(), stream.device().id()); } +std::string to_string(const cuda::context_t& context) +{ + std::stringstream ss; + ss.clear(); + ss << context; + return ss.str(); +} + } // namespace std [[noreturn]] bool die_(const std::string& message) @@ -90,6 +126,70 @@ std::ostream& operator<<(std::ostream& os, const cuda::stream_t& stream) die_("Assertion failed at line " + std::to_string(__LINE__) + ": " #cond); \ } + +void report_current_context(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", the current context is: "; } + else std::cout << "The current context is: "; + if (not cuda::context::current::exists()) { + std::cout << "(None)" << std::endl; + } + else { + auto cc = cuda::context::current::get(); + std::cout << cc << std::endl; + } +} + + +void print_context_stack() +{ + if (not cuda::context::current::exists()) { + std::cout << "(Context stack is empty)" << std::endl; + return; + } + std::vector contexts; + while(cuda::context::current::exists()) { + contexts.push_back(cuda::context::current::detail_::pop()); + } +// std::cout << "" << contexts.size() << " contexts; top to bottom:\n"; + for (auto handle : contexts) { + auto device_id = cuda::context::detail_::get_device_id(handle); + std::cout << handle << " for device " << device_id; + if (cuda::context::detail_::is_primary(handle)) { + std::cout << " (primary, " + << (cuda::device::primary_context::detail_::is_active(device_id) ? "active" : "inactive") + << ')'; + } + std::cout << '\n'; + } + for (auto it = contexts.rbegin(); it != contexts.rend(); it++) { + cuda::context::current::detail_::push(*it); + } +} + +void report_primary_context_activity(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", "; } + std::cout << "Device primary contexts activity: "; + for(auto device : cuda::devices()) { + std::cout << device.id() << ": " + << (cuda::device::primary_context::detail_::is_active(device.id()) ? 
"ACTIVE" : "inactive") + << " "; + } + std::cout << '\n'; +} + +void report_context_stack(const std::string& prefix = "") +{ + if (not prefix.empty()) { std::cout << prefix << ", the context stack is (top to bottom):\n"; } + std::cout << "-----------------------------------------------------\n"; + print_context_stack(); + std::cout << "---\n"; + report_primary_context_activity(); + std::cout << "-----------------------------------------------------\n" << std::flush; +} + + // Note: This will only work correctly for positive values template typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) @@ -97,4 +197,5 @@ typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) return dividend / divisor + !!(dividend % divisor); } + #endif // EXAMPLES_COMMON_HPP_ diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu index e89aa22c..6a9afffd 100644 --- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu @@ -117,7 +117,7 @@ int main(int argc, const char **argv) stream.enqueue.memzero(d_numOfOdds.get(), sizeof(int)); stream.enqueue.memzero(d_sumOfOddEvenElems.get(), sizeof(int)*2); - auto kernel = cuda::kernel::wrap(device, oddEvenCountAndSumCG); + auto kernel = cuda::kernel::get(device, oddEvenCountAndSumCG); auto dims = kernel.min_grid_params_for_max_occupancy(); auto launch_config = cuda::make_launch_config(dims); // Note: While the kernel uses the "cooperative groups" CUDA-C++ headers, diff --git a/examples/modified_cuda_samples/helper_cuda.hpp b/examples/modified_cuda_samples/helper_cuda.hpp index 21ce6fea..abb227c5 100644 --- a/examples/modified_cuda_samples/helper_cuda.hpp +++ b/examples/modified_cuda_samples/helper_cuda.hpp @@ -88,7 +88,6 @@ inline int get_device_with_highest_gflops() return *iterator; } - // Initialization code to find the best CUDA Device // Unlike in NVIDIA's original helper_cuda.h, this does _not_ // make the chosen device current. diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu index 5e6fb867..f1760c7e 100644 --- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu +++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu @@ -10,27 +10,10 @@ * contact the author. 
*/ -#include +#include "../../common.hpp" #include "ptx.cuh" -#include -#include - -// Note: This will only work correctly for positive values -template -typename std::common_type::type div_rounding_up(U1 dividend, U2 divisor) -{ - return dividend / divisor + !!(dividend % divisor); -} - -[[noreturn]] void die_(const std::string& message) -{ - std::cerr << message << "\n"; - exit(EXIT_FAILURE); -} - - __global__ void sequence_gpu(int *d_ptr, int length) { int elemID = blockIdx.x * blockDim.x + threadIdx.x; @@ -49,7 +32,6 @@ void sequence_cpu(int *h_ptr, int length) } } - int main(int, char **) { if (cuda::device::count() == 0) { diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index 46572de6..03c3766b 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -137,8 +137,10 @@ void enqueue_p2p_copy( P2PEngine p2p_mechanism, cuda::stream_t& stream) { - auto copy_kernel = cuda::kernel::wrap(stream.device(), copyp2p); + auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p); auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy(); + // Note: We could have alternatively used: + // auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel); auto launch_config = cuda::make_launch_config(grid_and_block_dims); diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp new file mode 100644 index 00000000..da0c3cab --- /dev/null +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -0,0 +1,175 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* Vector addition: C = A + B. + * + * This sample is a very basic sample that implements element by element + * vector addition. It loads a cuda fatbinary and runs vector addition kernel. + * Uses both Driver and Runtime CUDA APIs for different purposes. 
+ */ + +#include "../../common.hpp" + +std::string create_ptx_file() +{ + const char* ptx_file_contents = R"( + .version 6.5 + .target sm_30 + .address_size 64 + + // .globl dummy + + .visible .entry dummy( + +) + { + ret; + } + + + // .globl VecAdd_kernel + + .visible .entry VecAdd_kernel( + .param .u64 VecAdd_kernel_param_0, + .param .u64 VecAdd_kernel_param_1, + .param .u64 VecAdd_kernel_param_2, + .param .u32 VecAdd_kernel_param_3 + ) + { + .reg .pred %p<2>; + .reg .f32 %f<4>; + .reg .b32 %r<6>; + .reg .b64 %rd<11>; + + + ld.param.u64 %rd1, [VecAdd_kernel_param_0]; + ld.param.u64 %rd2, [VecAdd_kernel_param_1]; + ld.param.u64 %rd3, [VecAdd_kernel_param_2]; + ld.param.u32 %r2, [VecAdd_kernel_param_3]; + mov.u32 %r3, %ntid.x; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra BB0_2; + + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + cvta.to.global.u64 %rd7, %rd2; + add.s64 %rd8, %rd7, %rd5; + ld.global.f32 %f1, [%rd8]; + ld.global.f32 %f2, [%rd6]; + add.f32 %f3, %f2, %f1; + cvta.to.global.u64 %rd9, %rd3; + add.s64 %rd10, %rd9, %rd5; + st.global.f32 [%rd10], %f3; + + BB0_2: + ret; + } + )"; + + char temp_filename[] = "caw-simple-drv-runtime-ptx-XXXXXX"; + int file_descriptor = mkstemp(temp_filename); + if (file_descriptor == -1) { + throw std::runtime_error(std::string("Failed creating a temporary file using mkstemp(): ") + std::strerror(errno) + '\n'); + } + FILE* ptx_file = fdopen(file_descriptor, "w"); + if (ptx_file == nullptr) { + throw std::runtime_error(std::string("Failed converting temporay file descriptor into a C library FILE structure: ") + std::strerror(errno) + '\n'); + } + if (fputs(ptx_file_contents, ptx_file) == EOF) { + throw std::runtime_error("Failed writing PTX to temporary file " + std::string(temp_filename) + ": " + std::strerror(errno) + '\n'); + } + if (fclose(ptx_file) == EOF) { + throw std::runtime_error("Failed closing temporary PTX file " + std::string(temp_filename) + ": " + std::strerror(errno) + '\n'); + } + return temp_filename; +} + +// Host code +int main(int argc, char** argv) +{ + std::cout << "simpleDrvRuntime - PTX version..\n"; + int N = 50000; + size_t size = N * sizeof(float); + + // Initialize + cuda::initialize_driver(); + + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
+ std::stoi(argv[1]) : cuda::device::default_device_id; + + auto device = cuda::device::get(device_id); + + // Create context + auto context = cuda::context::create(device); + + cuda::context::current::scoped_override_t context_setter { context }; + +// first search for the module path before we load the results + auto ptx_filename = create_ptx_file(); + + auto module = cuda::module::load_from_file(ptx_filename); + auto vecAdd_kernel = module.get_kernel("VecAdd_kernel"); + auto dummy_kernel = module.get_kernel("dummy"); + + auto stream = cuda::stream::create(context, cuda::stream::async); + + stream.enqueue.kernel_launch(dummy_kernel, cuda::launch_configuration_t{1,1}); + + cuda::outstanding_error::ensure_none(); + + stream.synchronize(); + + auto h_A = std::unique_ptr(new float[N]); + auto h_B = std::unique_ptr(new float[N]); + auto h_C = std::unique_ptr(new float[N]); + + auto generator = []() { return rand() / (float) RAND_MAX; }; + std::generate_n(h_A.get(), N, generator); + std::generate_n(h_B.get(), N, generator); + + // Allocate vectors in device memory + auto d_A = cuda::memory::device::make_unique(device, N); + auto d_B = cuda::memory::device::make_unique(device, N); + auto d_C = cuda::memory::device::make_unique(device, N); + + + cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream); + cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream); + + auto threadsPerBlock = 256; + auto blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + auto launch_config = cuda::make_launch_config( blocksPerGrid, threadsPerBlock ); + + cuda::outstanding_error::ensure_none(); + + stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.get(), d_B.get(), d_C.get(), N); + + cuda::memory::async::copy(h_C.get(), d_C.get(), size, stream); + stream.synchronize(); + + for (int i = 0; i < N; ++i) { + if (std::fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { + std::cerr << "Result verification failed at element " << i << "\n"; + exit(EXIT_FAILURE); + } + } + std::cout << "SUCCESS\n"; + return EXIT_SUCCESS; +} diff --git a/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu b/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu index e4f30c03..a8bfcc80 100644 --- a/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu +++ b/examples/modified_cuda_samples/simpleIPC/simpleIPC.cu @@ -11,7 +11,7 @@ #include "../helper_string.h" -#include +#include #include #include @@ -38,8 +38,8 @@ typedef struct ipcCUDA_st { int device; pid_t pid; - cudaIpcEventHandle_t eventHandle; - cudaIpcMemHandle_t memHandle; + cuda::event::ipc::handle_t eventHandle; + cuda::memory::ipc::handle_t memHandle; } ipcCUDA_t; typedef struct ipcDevices_st @@ -187,7 +187,8 @@ void runTestMultiKernel(ipcCUDA_t *s_mem, int index) h_refData[i] = rand(); } - auto device = cuda::device::get(s_mem[index].device).make_current(); + auto device = cuda::device::get(s_mem[index].device); + cuda::device::current::set(device); if (index == 0) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index 441aa92a..495e9500 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -40,19 +40,18 @@ const char *sEventSyncMethod[] = NULL }; -// System includes - // helper functions and utilities to work with CUDA #include "../helper_cuda.hpp" -#include - #include #include #include #include +using synch_policy_type = 
cuda::context::host_thread_synch_scheduling_policy_t; + + // Macro to aligned up to the memory size in question #define MEMORY_ALIGNMENT 4096 @@ -83,11 +82,11 @@ void printHelp() { std::cout << "Usage: " << sSDKsample << " [options below]\n" - << "\t--sync_method (" << (int) cuda::host_thread_synch_scheduling_policy_t::default_ << ") for CPU thread synchronization with GPU work." - << "\t Possible values: " << (int) cuda::host_thread_synch_scheduling_policy_t::heuristic << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::spin << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::yield << ", " - << (int) cuda::host_thread_synch_scheduling_policy_t::block << ".\n" + << "\t--sync_method (" << (int) synch_policy_type::default_ << ") for CPU thread synchronization with GPU work." + << "\t Possible values: " << (int) synch_policy_type::heuristic << ", " + << (int) synch_policy_type::spin << ", " + << (int) synch_policy_type::yield << ", " + << (int) synch_policy_type::block << ".\n" << "\t--use_generic_memory (default) use generic page-aligned host memory allocation\n" << "\t--use_cuda_malloc_host (optional) use pinned host memory allocation\n"; } @@ -103,7 +102,6 @@ int main(int argc, char **argv) // allocate generic memory and pin it laster instead of using cudaHostAlloc() - using synch_policy_type = cuda::host_thread_synch_scheduling_policy_t; auto synch_policy = synch_policy_type::block; int niterations; // number of iterations for the loop inside the kernel diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu index 23ab5f66..9aea8a0c 100644 --- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu +++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu @@ -10,7 +10,9 @@ * contact the author. */ -#include +#include "../../common.hpp" + +#include #include #include diff --git a/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu b/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu index c16f677d..6eac1e82 100644 --- a/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu +++ b/examples/modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu @@ -13,10 +13,9 @@ * used instead of regular host and device memory. */ -#include +#include #include -#include #include __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) diff --git a/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu b/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu index 6aa11a00..c69fc84b 100644 --- a/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu +++ b/examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu @@ -13,7 +13,7 @@ * used instead of regular host and device memory. 
*/ -#include +#include #include #include diff --git a/examples/other/array_management.cu b/examples/other/array_management.cu index 04c5d0b1..c886d700 100644 --- a/examples/other/array_management.cu +++ b/examples/other/array_management.cu @@ -68,7 +68,7 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) { auto ptr_out = cuda::memory::managed::make_unique(arr.size()); cuda::memory::copy(arr, ptr_in.get()); cuda::texture_view tv(arr); - assert_(tv.associated_device() == device); + assert_(tv.device() == device); constexpr cuda::grid::block_dimension_t block_dim = 10; constexpr auto block_dims = cuda::grid::block_dimensions_t::cube(block_dim); assert(div_rounding_up(w, block_dim) <= std::numeric_limits::max()); diff --git a/examples/other/inclusion_in_two_translation_units/main.cpp b/examples/other/inclusion_in_two_translation_units/main.cpp index 36c05c90..048cc31e 100644 --- a/examples/other/inclusion_in_two_translation_units/main.cpp +++ b/examples/other/inclusion_in_two_translation_units/main.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/examples/other/inclusion_in_two_translation_units/second_tu.cpp b/examples/other/inclusion_in_two_translation_units/second_tu.cpp index 4b3624e0..7bd38c3e 100644 --- a/examples/other/inclusion_in_two_translation_units/second_tu.cpp +++ b/examples/other/inclusion_in_two_translation_units/second_tu.cpp @@ -1,4 +1,4 @@ -#include +#include cuda::device::id_t get_current_device_id() { diff --git a/examples/other/io_compute_overlap_with_streams.cu b/examples/other/io_compute_overlap_with_streams.cu index a827976b..d232c041 100644 --- a/examples/other/io_compute_overlap_with_streams.cu +++ b/examples/other/io_compute_overlap_with_streams.cu @@ -5,7 +5,7 @@ * Stream Management * */ -#include +#include #include #include @@ -27,125 +27,126 @@ __device__ void gpu_sleep(clock_value_t sleep_cycles) template __global__ void add( - const T* __restrict__ lhs, - const T* __restrict__ rhs, - T* __restrict__ result, - size_t length) + const T* __restrict__ lhs, + const T* __restrict__ rhs, + T* __restrict__ result, + size_t length) { - auto global_index = threadIdx.x + blockIdx.x * blockDim.x; - if (global_index < length) { - result[global_index] = lhs[global_index] + rhs[global_index]; - gpu_sleep(200000); - } + auto global_index = threadIdx.x + blockIdx.x * blockDim.x; + if (global_index < length) { + result[global_index] = lhs[global_index] + rhs[global_index]; + gpu_sleep(200000); + } } template constexpr I div_rounding_up(I dividend, const I2 divisor) noexcept { - return (dividend / divisor) + !!(dividend % divisor); + return (dividend / divisor) + !!(dividend % divisor); } /* * Produce a launch configuration with one thread covering each element */ cuda::launch_configuration_t make_linear_launch_config( - const cuda::device_t device, - size_t length) + const cuda::device_t device, + size_t length) { - auto threads_per_block = device.properties().max_threads_per_block(); - auto num_blocks = div_rounding_up(length, threads_per_block); - if (num_blocks > std::numeric_limits::max()) { - throw std::invalid_argument("Specified length exceeds CUDA's support for a linear grid"); - } - return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory); + auto threads_per_block = device.properties().max_threads_per_block(); + auto num_blocks = div_rounding_up(length, threads_per_block); + if (num_blocks > std::numeric_limits::max()) { + throw 
std::invalid_argument("Specified length exceeds CUDA's support for a linear grid"); + } + return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory); } struct buffer_set_t { - cuda::memory::host::unique_ptr host_lhs; - cuda::memory::host::unique_ptr host_rhs; - cuda::memory::host::unique_ptr host_result; - cuda::memory::device::unique_ptr device_lhs; - cuda::memory::device::unique_ptr device_rhs; - cuda::memory::device::unique_ptr device_result; + cuda::memory::host::unique_ptr host_lhs; + cuda::memory::host::unique_ptr host_rhs; + cuda::memory::host::unique_ptr host_result; + cuda::memory::device::unique_ptr device_lhs; + cuda::memory::device::unique_ptr device_rhs; + cuda::memory::device::unique_ptr device_result; }; std::vector generate_buffers( - const cuda::device_t device, - size_t num_kernels, - size_t num_elements) + const cuda::device_t& device, + size_t num_kernels, + size_t num_elements) { - // TODO: This should be an std::array, but generating - // it is a bit tricky and I don't want to burden the example - // with template wizardry - std::vector buffers; - std::generate_n(std::back_inserter(buffers), num_kernels, - [&]() { - return buffer_set_t { - // Sticking to C++11 here... - cuda::memory::host::make_unique(num_elements), - cuda::memory::host::make_unique(num_elements), - cuda::memory::host::make_unique(num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements) - }; - } - ); - - // TODO: Consider actually filling the buffers - - return buffers; + // device.make_current(); + // TODO: This should be an std::array, but generating + // it is a bit tricky and I don't want to burden the example + // with template wizardry + std::vector buffers; + std::generate_n(std::back_inserter(buffers), num_kernels, + [&]() { + return buffer_set_t { + // Sticking to C++11 here... + cuda::memory::host::make_unique(num_elements), + cuda::memory::host::make_unique(num_elements), + cuda::memory::host::make_unique(num_elements), + cuda::memory::device::make_unique(device, num_elements), + cuda::memory::device::make_unique(device, num_elements), + cuda::memory::device::make_unique(device, num_elements) + }; + } + ); + + // TODO: Consider actually filling the buffers + + return buffers; } int main(int, char **) { - constexpr size_t num_kernels = 5; - constexpr size_t num_elements = 1e7; - - auto device = cuda::device::current::get(); - std::cout << "Using CUDA device " << device.name() << " (having ID " << device.id() << ")\n"; - - std::cout << "Generating host buffers... " << std::flush; - auto buffers = generate_buffers(device, num_kernels, num_elements); - std::cout << "done.\n" << std::flush; - - std::vector streams; - streams.reserve(num_kernels); - std::generate_n(std::back_inserter(streams), num_kernels, - [&]() { return device.create_stream(cuda::stream::async); }); - - auto common_launch_config = make_linear_launch_config(device, num_elements); - auto buffer_size = num_elements * sizeof(element_t); - - std::cout - << "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl; - // Unfortunately, we need to use indices here - unless we - // had access to a zip iterator (e.g. 
boost::zip_iterator) - for(size_t k = 0; k < num_kernels; k++) { - auto& stream = streams[k]; - auto& buffer_set = buffers[k]; - stream.enqueue.copy(buffer_set.device_lhs.get(), buffer_set.host_lhs.get(), buffer_size); - stream.enqueue.copy(buffer_set.device_rhs.get(), buffer_set.host_rhs.get(), buffer_size); - stream.enqueue.kernel_launch( - add, - common_launch_config, - buffer_set.device_lhs.get(), - buffer_set.device_rhs.get(), - buffer_set.device_result.get(), - num_elements); - stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size); - stream.enqueue.host_function_call( - [=](cuda::stream_t) { - std::cout - << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << std::endl; - } - ); - } - std::this_thread::sleep_for(std::chrono::microseconds(50000)); - for(auto& stream : streams) { stream.synchronize(); } - cuda::outstanding_error::ensure_none(); - - // TODO: Consider checking for correctness here - - std::cout << "\nSUCCESS" << std::endl; + constexpr size_t num_kernels = 5; + constexpr size_t num_elements = 1e7; + + auto device = cuda::device::current::get(); + std::cout << "Using CUDA device " << device.name() << " (having ID " << device.id() << ")\n"; + + std::cout << "Generating host buffers... " << std::flush; + auto buffers = generate_buffers(device, num_kernels, num_elements); + std::cout << "done.\n" << std::flush; + + std::vector streams; + streams.reserve(num_kernels); + std::generate_n(std::back_inserter(streams), num_kernels, + [&]() { return device.create_stream(cuda::stream::async); }); + + auto common_launch_config = make_linear_launch_config(device, num_elements); + auto buffer_size = num_elements * sizeof(element_t); + + std::cout + << "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl; + // Unfortunately, we need to use indices here - unless we + // had access to a zip iterator (e.g. boost::zip_iterator) + for(size_t k = 0; k < num_kernels; k++) { + auto& stream = streams[k]; + auto& buffer_set = buffers[k]; + stream.enqueue.copy(buffer_set.device_lhs.get(), buffer_set.host_lhs.get(), buffer_size); + stream.enqueue.copy(buffer_set.device_rhs.get(), buffer_set.host_rhs.get(), buffer_size); + stream.enqueue.kernel_launch( + add, + common_launch_config, + buffer_set.device_lhs.get(), + buffer_set.device_rhs.get(), + buffer_set.device_result.get(), + num_elements); + stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size); + stream.enqueue.host_function_call( + [=](cuda::stream_t) { + std::cout + << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << std::endl; + } + ); + } + std::this_thread::sleep_for(std::chrono::microseconds(50000)); + for(auto& stream : streams) { stream.synchronize(); } + cuda::outstanding_error::ensure_none(); + + // TODO: Consider checking for correctness here + + std::cout << "\nSUCCESS" << std::endl; } diff --git a/examples/other/manipulate_current_device.cu b/examples/other/manipulate_current_device.cu new file mode 100644 index 00000000..9da259df --- /dev/null +++ b/examples/other/manipulate_current_device.cu @@ -0,0 +1,59 @@ +/** + * An example program for the CUDA API wrappers library, + * which indirectly manipulates the current device using + * driver API calls. 
+ * + */ +#include "../common.hpp" +#include + +void report_current_device() +{ + std::cout << "Runtime believes the current device index is: " + << cuda::device::current::detail_::get_id() << std::endl; +} + +int main() +{ + namespace context = cuda::context::detail_; + namespace cur_dev = cuda::device::current::detail_; + namespace pc = cuda::device::primary_context::detail_; + namespace cur_ctx = cuda::context::current::detail_; + + cuda::device::id_t dev_idx[2]; + cuda::context::handle_t pc_handle[2]; + + cuda::initialize_driver(); + dev_idx[0] = cur_dev::get_id(); + report_current_device(); + assert_(cur_dev::get_id() == 0); + dev_idx[1] = (dev_idx[0] == 0) ? 1 : 0; + pc_handle[0] = pc::obtain_and_increase_refcount(dev_idx[0]); + std::cout << "Obtained primary context handle for device " << dev_idx[0]<< '\n'; + pc_handle[1] = pc::obtain_and_increase_refcount(dev_idx[1]); + std::cout << "Obtained primary context handle for device " << dev_idx[1]<< '\n'; + report_current_device(); + cur_ctx::push(pc_handle[1]); + std::cout << "Pushed primary context handle for device " << dev_idx[1] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + auto ctx = context::create_and_push(dev_idx[0]); + std::cout << "Created a new context for device " << dev_idx[0] << " and pushed it onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[0]); + cur_ctx::push(ctx); + std::cout << "Pushed primary context handle for device " << dev_idx[0] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[0]); + cur_ctx::push(pc_handle[1]); + std::cout << "Pushed primary context for device " << dev_idx[1] << " onto the stack\n"; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + pc::decrease_refcount(dev_idx[1]); + std::cout << "Deactivated/destroyed primary context for device " << dev_idx[1] << '\n'; + report_current_device(); + assert_(cur_dev::get_id() == dev_idx[1]); + + std::cout << "\nSUCCESS" << std::endl; +} + diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp new file mode 100644 index 00000000..370e6edf --- /dev/null +++ b/src/cuda/api.hpp @@ -0,0 +1,50 @@ +/** + * @file runtime_api.hpp + * + * @brief A single file which includes, in turn, all (joint) + * wrappers for Runtime and Driver APIs, and related headers. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_HPP_ +#define CUDA_API_WRAPPERS_HPP_ + +static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled with C++11 or a later version of the C++ language standard"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#endif // CUDA_API_WRAPPERS_HPP_ diff --git a/src/cuda/api/apriori_compiled_kernel.hpp b/src/cuda/api/apriori_compiled_kernel.hpp new file mode 100644 index 00000000..8276bf57 --- /dev/null +++ b/src/cuda/api/apriori_compiled_kernel.hpp @@ -0,0 +1,145 @@ +/** + * @file apriori_compiled_kernel.hpp + * + * @brief An implementation of a subclass of @ref `kernel_t` for kernels + * compiled together with the host-side program. 
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ +#define CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ + +#include +#include +#include + +namespace cuda { + +///@cond +class device_t; +class apriori_compiled_kernel_t; +///@nocond + +namespace kernel { + +namespace detail_ { + +inline handle_t get_handle(const void *kernel_function_ptr, const char* name = nullptr) +{ + handle_t handle; + auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr); + throw_if_error(status, "Failed obtaining a CUDA function handle for " + + ((name == nullptr) ? ::std::string("a kernel function") : ::std::string("kernel function ") + name) + + " at " + cuda::detail_::ptr_as_hex(kernel_function_ptr)); + return handle; +} + +apriori_compiled_kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f, + const void* ptr); + + +} // namespace detail_ +} // namespace kernel + +/** + * @brief A subclass of the @ref `kernel_t` interface for kernels being + * functions marked as __global__ in source files and compiled apriori. + */ +class apriori_compiled_kernel_t final : public kernel_t { +public: // getters + const void *ptr() const noexcept { return ptr_; } + const void *get() const noexcept { return ptr_; } + +public: // type_conversions + explicit operator const void *() noexcept { return ptr_; } + +public: // non-mutators + + /** + * @brief Calculates the number of grid blocks which may be "active" on a given GPU + * multiprocessor simultaneously (i.e. with warps from any of these block + * being schedulable concurrently) + * + * @param num_threads_per_block + * @param dynamic_shared_memory_per_block + * @param disable_caching_override On some GPUs, the choice of whether to + * cache memory reads affects occupancy. But what if this caching results in 0 + * potential occupancy for a kernel? There are two options, controlled by this flag. + * When it is set to false - the calculator will assume caching is off for the + * purposes of its work; when set to true, it will return 0 for such device functions. + * See also the "Unified L1/Texture Cache" section of the + * Maxwell + * tuning guide. 
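A minimal usage sketch for the occupancy query documented above, assuming a __global__ function named my_kernel and relying on the kernel::get() overload declared further down in this header; the numeric arguments are illustrative only:

	__global__ void my_kernel(float* data);

	void occupancy_example(cuda::device_t device)
	{
		auto kernel = cuda::kernel::get(device, my_kernel);
		auto blocks_per_sm = kernel.maximum_active_blocks_per_multiprocessor(
			256, // threads per block
			0);  // dynamic shared memory per block, in bytes
		(void) blocks_per_sm;
	}
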
+ */ + grid::dimension_t maximum_active_blocks_per_multiprocessor( + grid::block_dimension_t num_threads_per_block, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override = false); + +protected: // ctors & dtor + apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, + kernel::handle_t handle, const void *f) + : kernel_t(device_id, context_handle, handle), ptr_(f) { + // TODO: Consider checking whether this actually is a device function, at all and in this context +#ifndef NDEBUG + assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer"); +#endif + } + apriori_compiled_kernel_t(device::id_t device_id, context::handle_t context_handle, const void *f) + : apriori_compiled_kernel_t(device_id, context_handle, kernel::detail_::get_handle(f), f) { } + +public: // ctors & dtor + apriori_compiled_kernel_t(const apriori_compiled_kernel_t&) = default; + apriori_compiled_kernel_t(apriori_compiled_kernel_t&&) = default; + +public: // friends + friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*); + +protected: // data members + const void *const ptr_; +}; + +inline grid::dimension_t apriori_compiled_kernel_t::maximum_active_blocks_per_multiprocessor( + grid::block_dimension_t num_threads_per_block, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override) +{ + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + int result; + unsigned int flags = disable_caching_override ? + cudaOccupancyDisableCachingOverride : cudaOccupancyDefault; + auto status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &result, ptr_, (int) num_threads_per_block, + dynamic_shared_memory_per_block, flags); + throw_if_error(status, "Failed calculating the maximum occupancy " + "of device function blocks per multiprocessor"); + return result; +} + +namespace kernel { +namespace detail_ { + +inline apriori_compiled_kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f, + const void *ptr) +{ + return {device_id, context_id, f, ptr}; +} + +} // namespace detail + +template +apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr); + +template +apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr); + +} // namespace kernel + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_APRIORI_COMPILED_KERNEL_HPP_ diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 8608640f..876a63f5 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -12,10 +12,12 @@ #ifndef CUDA_API_WRAPPERS_ARRAY_HPP_ #define CUDA_API_WRAPPERS_ARRAY_HPP_ -#include +#include #include #include +#include +#include namespace cuda { @@ -26,48 +28,88 @@ class array_t; namespace array { -using handle_t = cudaArray*; +using handle_t = CUarray; +template +using descriptor_t = typename ::std::conditional::type; /** * @brief Wrap an existing CUDA array in an @ref array_t instance. 
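A minimal creation sketch, assuming the create() overloads declared near the end of this header take the element type and dimensionality as template arguments, and that array::dimensions_t<3> is aggregate-initializable with width, height and depth:

	void array_example(cuda::device_t device)
	{
		cuda::array::dimensions_t<3> dims { 64, 64, 64 };
		auto arr = cuda::array::create<float, 3>(device, dims);
		// arr.size_bytes() should equal 64 * 64 * 64 * sizeof(float)
	}
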
*/ template array_t wrap( - device::id_t device_id, + device::id_t device_id, + context::handle_t context_handle, handle_t handle, dimensions_t dimensions) noexcept; namespace detail_ { +template struct format_specifier {}; + +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT8; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT16; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_UNSIGNED_INT32; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT8; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT16; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_SIGNED_INT32; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_HALF; }; +template <> struct format_specifier { static constexpr const CUarray_format value = CU_AD_FORMAT_FLOAT; }; + template -handle_t create_on_current_device(dimensions_t<3> dimensions) +handle_t create_in_current_context(dimensions_t<3> dimensions) { - auto channel_descriptor = cudaCreateChannelDesc(); - cudaExtent extent = dimensions; handle_t handle; - auto status = cudaMalloc3DArray(&handle, &channel_descriptor, extent); - throw_if_error(status, "Failed allocating 3D CUDA array"); + CUDA_ARRAY3D_DESCRIPTOR descriptor; + descriptor.Width = dimensions.width; + descriptor.Height = dimensions.height; + descriptor.Depth = dimensions.depth; + descriptor.Format = format_specifier::value; + descriptor.NumChannels = 1; + // We don't currently support an array of packed pairs or quadruplets; if you want this, + // file an issue. + descriptor.Flags = 0; + + auto status = cuArray3DCreate(&handle, &descriptor); + throw_if_error(status, "failed allocating 3D CUDA array"); return handle; } template -handle_t create_on_current_device(dimensions_t<2> dimensions) +handle_t create_in_current_context(dimensions_t<2> dimensions) { - auto channel_desc = cudaCreateChannelDesc(); + CUDA_ARRAY_DESCRIPTOR descriptor; + descriptor.Width = dimensions.width; + descriptor.Height = dimensions.height; + descriptor.Format = format_specifier::value; + descriptor.NumChannels = 1; handle_t handle; - auto status = cudaMallocArray(&handle, &channel_desc, dimensions.width, dimensions.height); - throw_if_error(status, "Failed allocating 2D CUDA array"); + auto status = cuArrayCreate(&handle, &descriptor); + throw_if_error(status, "failed allocating 2D CUDA array"); return handle; } template -handle_t create(const device_t& device, dimensions_t dimensions); +handle_t create(context::handle_t context_handle, dimensions_t dimensions) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return create_in_current_context(dimensions); +} template -handle_t create(device::id_t device_id, dimensions_t dimensions) +handle_t create(const context_t& context, dimensions_t dimensions); + +template +handle_t get_descriptor(context::handle_t context_handle, handle_t handle) { - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return create_on_current_device(dimensions); + cuda::context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + descriptor_t result; + auto status = (NumDimensions == 2) ? 
+ cuArrayGetDescriptor(&result, handle) : + cuArray3DGetDescriptor(&result, handle); + throw_if_error(status, + ::std::string("Failed obtaining the descriptor of the CUDA ") + + (NumDimensions == 2 ? "2":"3") + "D array at " + cuda::detail_::ptr_as_hex(handle)); + return result; } } // namespace detail_ @@ -99,64 +141,78 @@ class array_t { public: using handle_type = array::handle_t; + using descriptor_type = array::descriptor_t; using dimensions_type = array::dimensions_t; /** * Constructs a CUDA array wrapper from the raw type used by the CUDA * Runtime API - and takes ownership of the array */ - array_t(device::id_t device_id, handle_type handle, dimensions_type dimensions) : - device_id_(device_id), dimensions_(dimensions), handle_(handle) + array_t(device::id_t device_id, context::handle_t context_handle, handle_type handle, dimensions_type dimensions) : + device_id_(device_id), context_handle_(context_handle), dimensions_(dimensions), handle_(handle) { assert(handle != nullptr); } array_t(const array_t& other) = delete; - array_t(array_t&& other) noexcept : array_t(other.device_id_, other.handle_, other.dimensions_) + array_t(array_t&& other) noexcept : array_t(other.device_id_, other.context_handle_, other.handle_, other.dimensions_) { other.handle_ = nullptr; } ~array_t() noexcept { + cuda::context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); if (handle_) { - auto status = cudaFreeArray(handle_); + auto status = cuArrayDestroy(handle_); // Note: Throwing in a noexcept destructor; if the free'ing fails, the program // will likely terminate - throw_if_error(status, "Failed freeing CUDA array"); + throw_if_error(status, "Failed destroying CUDA array " + cuda::detail_::ptr_as_hex(handle_)); } } - friend array_t array::wrap(device::id_t, handle_type, dimensions_type) noexcept; + friend array_t array::wrap(device::id_t, context::handle_t, handle_type, dimensions_type) noexcept; - handle_type get() const noexcept { return handle_; } - device_t device() const noexcept; + handle_type get() const noexcept { return handle_; } + device::id_t device_id() const noexcept { return device_id_; } + context::handle_t context_handle() const noexcept { return context_handle_; } dimensions_type dimensions() const noexcept { return dimensions_; } + device_t device() const noexcept; + context_t context() const; ::std::size_t size() const noexcept { return dimensions().size(); } ::std::size_t size_bytes() const noexcept { return size() * sizeof(T); } + descriptor_type descriptor() const { return array::detail_::get_descriptor(context_handle_, handle_); } protected: - dimensions_type dimensions_; - handle_type handle_; - device::id_t device_id_; + dimensions_type dimensions_; + device::id_t device_id_; + context::handle_t context_handle_; + handle_type handle_; }; namespace array { template -inline array_t wrap( - device::id_t device_id, +array_t wrap( + device::id_t device_id, + context::handle_t context_handle, handle_t handle, dimensions_t dimensions) noexcept { - return array_t { device_id, handle, dimensions }; + return { device_id, context_handle, handle, dimensions }; } template -array_t create( - const device_t& device, +array_t create( + const context_t& context, dimensions_t dimensions); +template +array_t create( + device_t device, + dimensions_t dimensions); + + } // namespace array } // namespace cuda diff --git a/src/cuda/api/constants.hpp b/src/cuda/api/constants.hpp index 87a8c62d..aae8825e 100644 --- a/src/cuda/api/constants.hpp +++ 
b/src/cuda/api/constants.hpp @@ -10,7 +10,7 @@ #ifndef CUDA_API_WRAPPERS_CONSTANTS_HPP_ #define CUDA_API_WRAPPERS_CONSTANTS_HPP_ -#include +#include namespace cuda { @@ -116,6 +116,16 @@ enum : bool { do_not_take_ownership = false, }; +namespace context { + +namespace detail_ { + +constexpr const CUcontext none { 0 }; + +} // namespace detail_ + +} // namespace context + } // namespace cuda #endif // CUDA_API_WRAPPERS_CONSTANTS_HPP_ diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp new file mode 100644 index 00000000..077a99ce --- /dev/null +++ b/src/cuda/api/context.hpp @@ -0,0 +1,850 @@ +/** + * @file context.hpp + * + * @brief Contains a proxy class for CUDA execution contexts. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_CONTEXT_HPP_ +#define CUDA_API_WRAPPERS_CONTEXT_HPP_ + +#include +#include +#include +#include +#include + +#include +#include + +namespace cuda { + +///@cond +class device_t; +class event_t; +class context_t; +class stream_t; +class module_t; +///@endcond + +namespace link { +class options_t; +} // namespace link + +namespace context { + +using limit_t = CUlimit; +using limit_value_t = size_t; +using shared_memory_bank_size_t = CUsharedconfig; + +/** + * A range of priorities supported by a CUDA context; ranges from the + * higher numeric value to the lower. + */ +struct stream_priority_range_t { + stream::priority_t least; /// Higher numeric value, lower priority + stream::priority_t greatest; /// Lower numeric value, higher priority + + /** + * When true, stream prioritization is not supported, i.e. all streams have + * "the same" priority - the default one. + */ + constexpr bool is_trivial() const { + return least == stream::default_priority and greatest == stream::default_priority; + } +}; + +namespace detail_ { + +::std::string identify(const context_t& context); + +inline limit_value_t get_limit(limit_t limit_id) +{ + limit_value_t limit_value; + auto status = cuCtxGetLimit(&limit_value, limit_id); + throw_if_error(status, + "Failed obtaining CUDA context limit value"); + return limit_value; +} + +inline void set_limit(limit_t limit_id, limit_value_t new_value) +{ + auto status = cuCtxSetLimit(limit_id, new_value); + throw_if_error(status, "Failed obtaining CUDA context limit value"); +} + +constexpr flags_t inline make_flags( + host_thread_synch_scheduling_policy_t synch_scheduling_policy, + bool keep_larger_local_mem_after_resize) +{ + return( + synch_scheduling_policy // this enum value is also a valid bitmask + | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0) ); +} + +inline device::id_t get_device_id(handle_t context_handle) +{ + auto needed_push = current::detail_::push_if_not_on_top(context_handle); + auto device_id = current::detail_::get_device_id(); + if (needed_push) { + current::detail_::pop(); + } + return device_id; +} + + + +/** + * @brief Wrap an existing CUDA context in a @ref context_t instance + * + * @param device_id ID of the device for which the context is defined + * @param context_id + * @param take_ownership When set to `false`, the CUDA context + * will not be destroyed along with its proxy. When set to `true`, + * the proxy class will destroy the context when itself being destructed. + * @return The constructed `cuda::context_t`. 
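A minimal sketch of how the priority range above might be used, via the context_t::stream_priority_range() getter appearing further down in this file:

	void stream_priority_example(const cuda::context_t& ctx)
	{
		auto range = ctx.stream_priority_range();
		if (range.is_trivial()) {
			// stream prioritization is not supported in this context;
			// all streams get the default priority
		}
		else {
			// range.greatest is the numerically-lowest value, i.e. the highest priority
		}
	}
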
+ */ +context_t wrap( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership = false) noexcept; + +context_t from_handle( + context::handle_t context_handle, + bool take_ownership = false); + +inline size_t total_memory(handle_t handle) +{ + size_t total_mem_in_bytes; + auto status = cuMemGetInfo(nullptr, &total_mem_in_bytes); + throw_if_error(status, "Failed determining amount of total memory for " + identify(handle)); + return total_mem_in_bytes; + +} + +inline size_t free_memory(handle_t handle) +{ + size_t free_mem_in_bytes; + auto status = cuMemGetInfo(&free_mem_in_bytes, nullptr); + throw_if_error(status, "Failed determining amount of free memory for " + identify(handle)); + return free_mem_in_bytes; +} + +inline void set_cache_preference(handle_t handle, multiprocessor_cache_preference_t preference) +{ + auto status = cuCtxSetCacheConfig(static_cast(preference)); + throw_if_error(status, + "Setting the multiprocessor L1/Shared Memory cache distribution preference to " + + ::std::to_string((unsigned) preference) + " for " + identify(handle)); +} + +inline multiprocessor_cache_preference_t cache_preference(handle_t handle) +{ + CUfunc_cache preference; + auto status = cuCtxGetCacheConfig(&preference); + throw_if_error(status, + "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + identify(handle)); + return (multiprocessor_cache_preference_t) preference; +} + +inline shared_memory_bank_size_t shared_memory_bank_size(handle_t handle) +{ + CUsharedconfig bank_size; + auto status = cuCtxGetSharedMemConfig(&bank_size); + throw_if_error(status, "Obtaining the multiprocessor shared memory bank size for " + identify(handle)); + return static_cast(bank_size); +} + +inline void set_shared_memory_bank_size(handle_t handle, shared_memory_bank_size_t bank_size) +{ + auto status = cuCtxSetSharedMemConfig(static_cast(bank_size)); + throw_if_error(status, "Setting the multiprocessor shared memory bank size for " + identify(handle)); +} + +inline void synchronize(context::handle_t handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(handle); + context::current::detail_::synchronize(handle); +} + +inline void synchronize(device::id_t device_id, context::handle_t handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(handle); + context::current::detail_::synchronize(device_id, handle); +} + +inline void destroy(handle_t handle) +{ + auto status = cuCtxDestroy(handle); + throw_if_error(status, "Failed destroying " + identify(handle)); +} + +inline void destroy(handle_t handle, device::id_t device_index) +{ + auto status = cuCtxDestroy(handle); + throw_if_error(status, "Failed destroying " + identify(handle, device_index)); +} + +inline context::flags_t get_flags(handle_t handle) +{ + current::detail_::scoped_override_t set_context_for_this_scope{handle}; + return context::current::detail_::get_flags(); +} + +} // namespace detail_ + +} // namespace context + +inline void synchronize(const context_t& context); + +/** + * @brief Wrapper class for a CUDA context + * + * Use this class - built around a context id - to perform all + * context-related operations the CUDA Driver (or, in fact, Runtime) API is capable of. + * + * @note By default this class has RAII semantics, i.e. 
it creates a + * context on construction and destroys it on destruction, and isn't merely + * an ephemeral wrapper one could apply and discard; but this second kind of + * semantics is also supported, through the @ref context_t::holds_refcount_unit_ field. + * + * @note A context is a specific to a device; see, therefore, also @ref device_t . + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to properties of the context is a const-respecting operation on this class. + */ +class context_t { +public: // types + using scoped_setter_type = context::current::detail_::scoped_override_t; + using flags_type = context::flags_t; + + static_assert( + ::std::is_same< ::std::underlying_type::type, ::std::underlying_type::type >::value, + "Unexpected difference between enumerators used for the same purpose by the CUDA runtime and the CUDA driver"); + +public: // inner classes + + /** + * @brief A class to create a faux member in a @ref device_t, in lieu of an in-class + * namespace (which C++ does not support); whenever you see a function + * `my_dev.memory::foo()`, think of it as a `my_dev::memory::foo()`. + * + * TODO: Should this be made context-specific? + */ + class global_memory_type { + protected: // data members + const device::id_t device_id_; + const context::handle_t context_handle_; + + public: + global_memory_type(device::id_t device_id, context::handle_t context_handle) + : device_id_(device_id), context_handle_(context_handle) { } + ///@endcond + + device_t associated_device() const; + context_t associated_context() const; + + /** + * Allocate a region of memory on the device + * + * @param size_in_bytes size in bytes of the region of memory to allocate + * @return a non-null (device-side) pointer to the allocated memory + */ + memory::region_t allocate(size_t size_in_bytes); + + /** + * Allocates memory on the device whose pointer is also visible on the host, + * and possibly on other devices as well - with the same address. This is + * nVIDIA's "managed memory" mechanism. + * + * @note Managed memory isn't as "strongly associated" with a single device + * as the result of allocate(), since it can be read or written from any + * device or from the host. However, the actual space is allocated on + * some device, so its creation is a device (device_t) object method. + * + * @note for a more complete description see the + * CUDA Runtime API + * reference) + * + * @param size_in_bytes Size of memory region to allocate + * @param initial_visibility if this equals ,to_supporters_of_concurrent_managed_access\ only the host (and the + * allocating device) will be able to utilize the pointer returned; if false, + * it will be made usable on all CUDA devices on the systems. + * @return the allocated pointer; never returns null (throws on failure) + */ + memory::region_t allocate_managed( + size_t size_in_bytes, + cuda::memory::managed::initial_visibility_t initial_visibility = + cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access); + + /** + * Amount of total global memory on the CUDA device's primary context. + */ + size_t amount_total() const + { + scoped_setter_type set_context_for_this_scope(context_handle_); + return context::detail_::total_memory(context_handle_); + } + + /** + * Amount of free global memory on the CUDA device's primary context. 
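A minimal sketch of using the memory proxy object, relying only on the members shown in this class:

	void context_memory_example(const cuda::context_t& ctx)
	{
		auto mem = ctx.memory();
		auto free_bytes  = mem.amount_free();
		auto total_bytes = mem.amount_total();
		auto region = mem.allocate(1024 * 1024); // 1 MiB of device-global memory
		(void) free_bytes; (void) total_bytes; (void) region;
	}
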
+ */ + size_t amount_free() const { + scoped_setter_type set_context_for_this_scope(context_handle_); + return context::detail_::free_memory(context_handle_); + } + }; // class global_memory_type + + +public: // data member non-mutator getters + + /** + * The CUDA context ID this object is wrapping + */ + context::handle_t handle() const noexcept { return handle_; } + + /** + * The device with which this context is associated + */ + device::id_t device_id() const noexcept { return device_id_; } + device_t device() const; + + /** + * Is this wrapper responsible for having the wrapped CUDA context destroyed on destruction? + */ + bool is_owning() const noexcept { return owning_; } + + /** + * The amount of total global device memory available to this context, including + * memory already allocated. + */ + size_t total_memory() const { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::total_memory(handle_); + } + + /** + * The amount of unallocated global device memory available to this context + * and not yet allocated. + * + * @note It is not guaranteed that this entire amount can actually be succefully allocated. + */ + size_t free_memory() const { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::free_memory(handle_); + } + +public: // other non-mutator methods + + /** + * Determines the balance between L1 space and shared memory space set + * for kernels executing within this context. + */ + multiprocessor_cache_preference_t cache_preference() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::cache_preference(handle_); + } + + /** + * @return the stack size in bytes of each GPU thread + * + * @todo Is this really a feature of the context? Not of the device? + */ + size_t stack_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_STACK_SIZE); + } + + /** + * @return the size of the FIFO (first-in, first-out) buffer used by the printf() function available to device kernels + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t printf_buffer_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_PRINTF_FIFO_SIZE); + } + + /** + * @return the size in bytes of the heap used by the malloc() and free() device system calls. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t memory_allocation_heap_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_MALLOC_HEAP_SIZE); + } + + /** + * @return the maximum grid depth at which a thread can issue the device + * runtime call `cudaDeviceSynchronize()` / `cuda::device::synchronize()` + * to wait on child grid launches to complete. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t maximum_depth_of_child_grid_synch_calls() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH); + } + + global_memory_type memory() const + { + return { device_id_, handle_ }; + } + + /** + * @return maximum number of outstanding device runtime launches that can be made from this context. + * + * @todo Is this really a feature of the context? Not of the device? 
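A minimal sketch of reading and adjusting per-context limits, using the getters above and the corresponding setters defined later in this class (which are const, since the wrapper is a reference type):

	void context_limits_example(const cuda::context_t& ctx)
	{
		auto stack_size       = ctx.stack_size();          // CU_LIMIT_STACK_SIZE
		auto printf_fifo_size = ctx.printf_buffer_size();  // CU_LIMIT_PRINTF_FIFO_SIZE
		ctx.stack_size(stack_size * 2);
		ctx.printf_buffer_size(printf_fifo_size);
	}
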
+ */ + context::limit_value_t maximum_outstanding_kernel_launches() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT); + } + + /** + * @return maximum granularity of fetching from the L2 cache + * + * @note A value between 0 and 128; it is apparently a "hint" somehow. + * + * @todo Is this really a feature of the context? Not of the device? + */ + context::limit_value_t l2_fetch_granularity() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY); + } + + /** + * @brief Returns the shared memory bank size, as described in + * this Parallel-for-all blog entry + * + * @return the shared memory bank size in bytes + */ + context::shared_memory_bank_size_t shared_memory_bank_size() const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::shared_memory_bank_size(handle_); + } + + /** + * Determine if this context is the system's current CUDA context. + */ + bool is_current() const + { + return context::current::detail_::is_(handle_); + } + + /** + * Determine if this context is the primary context for its associated device. + */ + bool is_primary() const; + + /** + * + * @todo isn't this a feature of devices? + */ + context::stream_priority_range_t stream_priority_range() const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::stream_priority_range_t result; + auto status = cuCtxGetStreamPriorityRange(&result.least, &result.greatest); + throw_if_error(status, "Obtaining the priority range for streams within " + + context::detail_::identify(*this)); + return result; + } + + context::limit_value_t get_limit(context::limit_t limit_id) const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::get_limit(limit_id); + } + + version_t api_version() const + { + unsigned int raw_version; + auto status = cuCtxGetApiVersion(handle_, &raw_version); + throw_if_error(status, "Failed obtaining the API version for " + context::detail_::identify(*this)); + return version_t::from_single_number((int) raw_version); + } + +protected: + context::flags_t flags() const + { + return context::detail_::get_flags(handle_); + } + + +public: // methods which mutate the context, but not its wrapper + /** + * Gets the synchronization policy to be used for threads synchronizing + * with this CUDA context. + * + * @note see @ref host_thread_synch_scheduling_policy_t + * for a description of the various policies. 
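A minimal sketch of inspecting the scheduling policy via the getter documented here, assuming the enumerator names used elsewhere in this patch (spin, yield, block, etc.):

	void synch_policy_example(const cuda::context_t& ctx)
	{
		using policy_t = cuda::context::host_thread_synch_scheduling_policy_t;
		if (ctx.synch_scheduling_policy() == policy_t::block) {
			// host threads synchronizing with this context's work will block
			// on an OS primitive rather than spin
		}
	}
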
+ */ + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy() const + { + return context::host_thread_synch_scheduling_policy_t(flags() & CU_CTX_SCHED_MASK); + } + + bool keeping_larger_local_mem_after_resize() const + { + return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; + } + + /** + * See @ref cuda::stream::create() + */ + stream_t create_stream( + bool will_synchronize_with_default_stream, + stream::priority_t priority = cuda::stream::default_priority); + + /** + * See @ref cuda::event::create() + */ + event_t create_event( + bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default + bool records_timing = event::do_record_timings, + bool interprocess = event::not_interprocess); + + module_t create_module(const void* module_data, link::options_t link_options) const; + module_t create_module(const void* module_data) const; + template + module_t create_module(ContiguousContainer module_data) const; + +public: // Methods which don't mutate the context, but affect the device itself + + + void enable_access_to(const context_t& peer) const; + + void disable_access_to(const context_t& peer) const; + + void reset_persisting_l2_cache() const + { + scoped_setter_type set_context_for_this_scope(handle_); +#if (CUDART_VERSION >= 11000) + auto status = cuCtxResetPersistingL2Cache(); + throw_if_error(status, "Failed resetting/clearing the persisting L2 cache memory"); +#endif + throw cuda::runtime_error( + cuda::status::insufficient_driver, + "Resetting/clearing the persisting L2 cache memory is not supported when compiling CUDA versions lower than 11.0"); + } + +public: // other methods which don't mutate this class as a reference, but do mutate the context + + /** + * @brief Sets the shared memory bank size, described in + * this Parallel-for-all blog entry + * + * @param bank_size the shared memory bank size to set + */ + void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::detail_::set_shared_memory_bank_size(handle_, bank_size); + } + + /** + * Controls the balance between L1 space and shared memory space for + * kernels executing within this context. 
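A minimal sketch of the stream and event factory methods declared above; cuda::stream::async is assumed, as in the examples earlier in this patch, to request a stream which does not synchronize with the default stream:

	void stream_and_event_example(cuda::context_t& ctx)
	{
		auto stream = ctx.create_stream(cuda::stream::async);
		auto event  = ctx.create_event(); // busy-waiting sync, timings recorded, not interprocess - per the defaults above
		// ... enqueue work on the stream, record the event on it ...
		stream.synchronize();
	}
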
+ * + * @param preference the preferred balance between L1 and shared memory + */ + void set_cache_preference(multiprocessor_cache_preference_t preference) const + { + scoped_setter_type set_context_for_this_scope(handle_); + context::detail_::set_cache_preference(handle_, preference); + } + + void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const + { + scoped_setter_type set_context_for_this_scope(handle_); + return context::detail_::set_limit(limit_id, new_value); + } + + void stack_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_STACK_SIZE, new_value); + } + + void printf_buffer_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_value); + } + + void memory_allocation_heap_size(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_MALLOC_HEAP_SIZE, new_value); + } + + void set_maximum_depth_of_child_grid_synch_calls(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH, new_value); + } + + void set_maximum_outstanding_kernel_launches(context::limit_value_t new_value) const + { + return set_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT, new_value); + } + + /** + * Have the calling thread wait - either busy-waiting or blocking - and + * return only after all pending actions within this context have concluded. + */ + void synchronize() const + { + cuda::synchronize(*this); + } + +protected: // constructors + + context_t( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership) noexcept + : device_id_(device_id), handle_(context_id), owning_(take_ownership) { } + +public: // friendship + + friend context_t context::detail_::wrap( + device::id_t device_id, + context::handle_t context_id, + bool take_ownership) noexcept; + +public: // constructors and destructor + + context_t(const context_t& other) : + context_t(other.device_id_, other.handle_, false) { }; + + context_t(context_t&& other) noexcept : + context_t(other.device_id_, other.handle_, other.owning_) + { + other.owning_ = false; + }; + + ~context_t() { + if (owning_) { + cuCtxDestroy(handle_); + // Note: "Swallowing" any potential error to avoid std::terminate(); also, + // because the context cannot possibly exist after this call. + } + } + +public: // operators + + context_t& operator=(const context_t& other) + { + if (owning_) { + context::detail_::destroy(handle_); + } + device_id_ = other.device_id_; + handle_ = other.handle_; + owning_ = false; + return *this; + } + + // Deleted since the handle_t and handle_t are constant + context_t& operator=(context_t&& other) noexcept + { + ::std::swap(device_id_, other.device_id_); + ::std::swap(handle_, other.handle_); + ::std::swap(owning_, other.owning_); + return *this; + } + +protected: // data members + device::id_t device_id_; + context::handle_t handle_; + bool owning_; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered + + // TODO: Should we hold a field indicating whether this context is + // primary or not? 
+}; + +inline bool operator==(const context_t& lhs, const context_t& rhs) +{ + return lhs.handle() == rhs.handle(); +} + +inline bool operator!=(const context_t& lhs, const context_t& rhs) +{ + return lhs.handle() != rhs.handle(); +} + +namespace context { + +namespace detail_ { + +/** + * Obtain a wrapper for an already-existing CUDA context + * + * @note This is a named constructor idiom instead of direct access to the ctor of the same + * signature, to emphase what this construction means - a new context is _not_ + * created. + * + * @param device_id Device with which the context is associated + * @param context_id id of the context to wrap with a proxy + * @param take_ownership when true, the wrapper will have the CUDA driver destroy + * the cuntext when the wrapper itself destruct; otherwise, it is assumed + * that the context is "owned" elsewhere in the code, and that location or entity + * is responsible for destroying it when relevant (possibly after this wrapper + * ceases to exist) + * @return a context wrapper associated with the specified context + */ +inline context_t wrap( + device::id_t device_id, + handle_t context_id, + bool take_ownership) noexcept +{ + return { device_id, context_id, take_ownership }; +} + +inline context_t from_handle( + context::handle_t context_handle, + bool take_ownership) +{ + device::id_t device_id = get_device_id(context_handle); + return wrap(device_id, context_handle, take_ownership); +} + +inline handle_t create_and_push( + device::id_t device_id, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = automatic, + bool keep_larger_local_mem_after_resize = false) +{ + auto flags = context::detail_::make_flags( + synch_scheduling_policy, + keep_larger_local_mem_after_resize); + handle_t handle; + auto status = cuCtxCreate(&handle, flags, device_id); + cuda::throw_if_error(status, "failed creating a CUDA context associated with " + + device::detail_::identify(device_id)); + return handle; +} + +} // namespace detail_ + +/** + * @brief creates a new context on a given device + * + * @param device The device on which to create the new stream + * @param synch_scheduling_policy + * @param keep_larger_local_mem_after_resize + * @return + * @note Until CUDA 11, there used to also be a flag for enabling/disabling + * the ability of mapping pinned host memory to device addresses. However, it was + * being ignored since CUDA 3.2 already, with the minimum CUDA version supported + * by these wrappers being later than that, so - no sense in keeping it. + */ +context_t create( + device_t device, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic, + bool keep_larger_local_mem_after_resize = false); + +context_t create_and_push( + device_t device, + host_thread_synch_scheduling_policy_t synch_scheduling_policy = heuristic, + bool keep_larger_local_mem_after_resize = false); + +namespace current { + +/** + * Determine whether any CUDA context is current, or whether the context stack is empty + */ +inline bool exists() +{ + return (detail_::get_handle() != context::detail_::none); +} + +/** + * Obtain the current CUDA context, if one exists. 
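A minimal sketch of checking for, and obtaining, the current context; the wrapper returned by get() does not take ownership, so nothing is destroyed when it goes out of scope:

	void current_context_example()
	{
		if (cuda::context::current::exists()) {
			auto ctx = cuda::context::current::get();
			auto free_bytes = ctx.free_memory();
			(void) free_bytes;
		}
	}
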
+ * + * @throws ::std::runtime_error in case there is no current context + */ +inline context_t get() +{ + auto handle = detail_::get_handle(); + if (handle == context::detail_::none) { + throw ::std::runtime_error("Attempt to obtain the current CUDA context when no context is current."); + } + return context::detail_::from_handle(handle); +} + +inline void set(const context_t& context) +{ + return detail_::set(context.handle()); +} + +inline bool push_if_not_on_top(const context_t& context) +{ + return context::current::detail_::push_if_not_on_top(context.handle()); +} + +inline void push(const context_t& context) +{ + return context::current::detail_::push(context.handle()); +} + +inline context_t pop() +{ + constexpr const bool do_not_take_ownership { false }; + // Unfortunately, since we don't store the device IDs of contexts + // on the stack, this incurs an extra API call beyond just the popping... + auto handle = context::current::detail_::pop(); + auto device_id = context::detail_::get_device_id(handle); + return context::detail_::wrap(device_id, handle, do_not_take_ownership); +} + +namespace detail_ { + +/** + * If now current context exists, push the current device's primary context onto the stack + */ +handle_t push_default_if_missing(); + +/** + * Ensures that a current context exists by pushing the current device's primary context + * if necessary, and returns the current context + * + * @throws ::std::runtime_error in case there is no current context + */ +inline context_t get_with_fallback_push() +{ + auto handle = push_default_if_missing(); + return context::detail_::from_handle(handle); +} + + +} // namespace detail_ + +} // namespace current + +bool is_primary(const context_t& context); + +namespace detail_ { + +inline ::std::string identify(const context_t& context) +{ + return identify(context.handle(), context.device_id()); +} + +} // namespace detail_ + +} // namespace context + +inline void synchronize(const context_t& context) +{ + context::detail_::synchronize(context.device_id(), context.handle()); +} + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_CONTEXT_HPP_ diff --git a/src/cuda/api/current_context.hpp b/src/cuda/api/current_context.hpp new file mode 100644 index 00000000..33d47f9a --- /dev/null +++ b/src/cuda/api/current_context.hpp @@ -0,0 +1,241 @@ +/** + * @file current_context.hpp + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ +#define CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ + +#include +#include +#include + +#include + +namespace cuda { + +///@cond +class device_t; +class context_t; +///@endcond + +namespace context { + +namespace current { + +namespace detail_ { + +/** + * Returns a raw handle for the current CUDA context + * + * @return the raw handle from the CUDA driver - if one exists; none + * if no context is current/active. + */ +inline bool is_(handle_t handle) +{ + handle_t current_context_handle; + auto status = cuCtxGetCurrent(¤t_context_handle); + switch(status) { + case CUDA_ERROR_NOT_INITIALIZED: + case CUDA_ERROR_INVALID_CONTEXT: + return false; + case CUDA_SUCCESS: + return (handle == current_context_handle); + default: + throw cuda::runtime_error((status_t) status, "Failed determining whether there's a current context, or what it is"); + } +} + +/** + * Returns a raw handle for the current CUDA context + * + * @return the raw handle from the CUDA driver - if one exists; none + * if no context is current/active. 
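A minimal sketch of explicit manipulation of the context stack, using the push() and pop() wrappers defined earlier in this patch (in context.hpp):

	void push_pop_example(const cuda::context_t& ctx)
	{
		cuda::context::current::push(ctx);
		// ... allocate, launch and synchronize within ctx ...
		auto popped = cuda::context::current::pop();
		// popped wraps the same raw context as ctx, without taking ownership of it
		(void) popped;
	}
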
+ */ +inline handle_t get_handle() +{ + handle_t handle; + auto status = cuCtxGetCurrent(&handle); + throw_if_error(status, "Failed obtaining the current context's handle"); + return handle; +} + +// Note: not calling this get_ since flags are read-only anyway +inline context::flags_t get_flags() +{ + context::flags_t result; + auto status = cuCtxGetFlags(&result); + throw_if_error(status, "Failed obtaining the current context's flags"); + return result; +} + +inline device::id_t get_device_id() +{ + device::id_t device_id; + auto result = cuCtxGetDevice(&device_id); + throw_if_error(result, "Failed obtaining the current context's device"); + return device_id; +} + +} // namespace detail_ + +inline bool exists(); +inline context_t get(); +inline void set(const context_t& context); + +namespace detail_ { + +/** + * Push a context handle onto the top of the context stack - if it is not already on the + * top of the stack + * + * @param context_handle A context handle to push + * + * @note behavior undefined if you try to push @ref none + */ +inline void push(handle_t context_handle) +{ + auto status = cuCtxPushCurrent(context_handle); + throw_if_error(status, + "Failed pushing to the top of the context stack: " + context::detail_::identify(context_handle)); +} + +/** + * Push a context handle onto the top of the context stack - if it is not already on the + * top of the stack + * + * @param context_handle A context handle to push + * + * @return true if a push actually occurred + * + * @note behavior undefined if you try to push @ref none + * @note The CUDA context stack is not a proper stack, in that it doesn't allow multiple + * consecutive copes of the same context on the stack; hence there is no `push()` method. + */ +inline bool push_if_not_on_top(handle_t context_handle) +{ + if (detail_::get_handle() == context_handle) { return false; } + push(context_handle); return true; +} + +inline context::handle_t pop() +{ + handle_t popped_context_handle; + auto status = cuCtxPopCurrent(&popped_context_handle); + throw_if_error(status, "Failed popping the current CUDA context"); + return popped_context_handle; +} + +inline void set(handle_t context_handle) +{ + // TODO: Would this help? + // if (detail_::get_handle() == context_handle_) { return; } + auto status = static_cast(cuCtxSetCurrent(context_handle)); + throw_if_error(status, + "Failed setting the current context to " + context::detail_::identify(context_handle)); +} + +/** + * @note See the out-of-`detail_::` version of this class. 
+ * + */ +class scoped_override_t { +protected: +public: + explicit scoped_override_t(handle_t context_handle) { push(context_handle); } + ~scoped_override_t() { pop(); } + +// explicit scoped_context_override_t(handle_t context_handle_) : +// did_push(push_if_not_on_top(context_handle_)) { } +// scoped_context_override_terride_t() { if (did_push) { pop(); } } +// +//protected: +// bool did_push; +}; + +class scoped_ensurer_t { +public: + bool push_needed; + + explicit scoped_ensurer_t(handle_t fallback_context_handle) : push_needed(not exists()) + { + if (push_needed) { push(fallback_context_handle); } + } + ~scoped_ensurer_t() { if (push_needed) { pop(); } } +}; + +class scoped_current_device_fallback_t; + +} // namespace detail_ + +/** + * A RAII-based mechanism for pushing a context onto the context stack + * for what remains of the current (C++ language) scope - making it the + * current context - then popping it back when exiting the scope - + * restoring the stack and the current context to what they had been + * previously. + * + * @note if some other code pushes/pops from the context stack during + * the lifetime of this class, the pop-on-destruction may fail, or + * succeed but pop some other context handle than the one originally. + * pushed. + * + */ +class scoped_override_t : private detail_::scoped_override_t { +protected: + using parent = detail_::scoped_override_t; +public: + explicit scoped_override_t(const context_t& device); + explicit scoped_override_t(context_t&& device); + ~scoped_override_t() = default; +}; + +/** + * This macro will set the current device for the remainder of the scope in which it is + * invoked, and will change it back to the previous value when exiting the scope. Use + * it as an opaque command, which does not explicitly expose the variable defined under + * the hood to effect this behavior. 
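A minimal sketch of scope-limited overriding of the current context, using the RAII class above and the convenience macro documented here (and defined just below):

	void scoped_override_example(const cuda::context_t& ctx)
	{
		{
			cuda::context::current::scoped_override_t context_for_this_scope{ctx};
			// everything in this scope - allocations, launches, synchronization - sees ctx as current
		} // the previous top of the context stack is restored here
		{
			CUDA_CONTEXT_FOR_THIS_SCOPE(ctx);
			// same effect, without explicitly naming the override variable
		}
	}
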
+ */ +#define CUDA_CONTEXT_FOR_THIS_SCOPE(_cuda_context_ctor_argument) \ + ::cuda::context::current::scoped_override_t scoped_device_override( ::cuda::context_t(_cuda_context_ctor_argument) ) + + +inline bool push_if_not_on_top(const context_t& context); +inline void push(const context_t& context); + +inline void synchronize() +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status, "Failed synchronizing current context"); + } +} + +namespace detail_ { + +inline void synchronize(context::handle_t handle) +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status,"Failed synchronizing " + + context::detail_::identify(handle)); + } +} + +inline void synchronize(device::id_t device_id, context::handle_t handle) +{ + auto status = cuCtxSynchronize(); + if (not is_success(status)) { + throw cuda::runtime_error(status, "Failed synchronizing " + + context::detail_::identify(handle, device_id)); + } +} + +} // namespace detail + +} // namespace current + +} // namespace context + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_CURRENT_CONTEXT_HPP_ diff --git a/src/cuda/api/current_device.hpp b/src/cuda/api/current_device.hpp index 78e1b384..597cd101 100644 --- a/src/cuda/api/current_device.hpp +++ b/src/cuda/api/current_device.hpp @@ -21,9 +21,9 @@ #define CUDA_API_WRAPPERS_CURRENT_DEVICE_HPP_ #include -#include #include -#include +#include +#include #include @@ -44,22 +44,71 @@ namespace detail_ { */ inline id_t get_id() { - id_t device; - status_t result = cudaGetDevice(&device); - throw_if_error(result, "Failure obtaining current device index"); - return device; + constexpr const id_t default_device_id { 0 }; + context::handle_t current_context_handle; + auto status = cuCtxGetCurrent(¤t_context_handle); + if (status == CUDA_ERROR_NOT_INITIALIZED) { + initialize_driver(); + // Should we activate and push the default device's context? probably not. + return default_device_id; + } + throw_if_error(status, "Failed obtaining the current context for determining which " + "device is active"); + + if (current_context_handle == context::detail_::none) { + // Should we activate and push the default device's context? probably not. + return default_device_id; + } + return cuda::context::current::detail_::get_device_id(); + // ... which is the equivalent of doing: + // +// handle_t device_id; +// auto status = cudaGetDevice(&device_id); +// throw_if_error(status, "Failure obtaining current device id"); +// return device_id; } /** * Set a device as the current one for the CUDA Runtime API (so that API calls * not specifying a device apply to it.) * + * @note This replaces the current CUDA context (rather than pushing a context + * onto the stack), so use with care. + * + * @note This causes a primary context for the device to be created, if it + * doesn't already exist. I'm not entirely sure regarding the conditions under + * which it will be destroyed, however. + * * @param[in] device Numeric ID of the device to make current */ -inline void set(id_t device) +inline void set(id_t device_id) { - status_t result = cudaSetDevice(device); - throw_if_error(result, "Failure setting current device to " + ::std::to_string(device)); + context::handle_t current_context_handle; + bool have_current_context; + auto status = cuCtxGetCurrent(¤t_context_handle); + if (status == CUDA_ERROR_NOT_INITIALIZED) { + initialize_driver(); + // Should we activate and PUSH the default device's context? probably not. 
+ have_current_context = false; + } + else { + have_current_context = (current_context_handle != context::detail_::none); + } + if (have_current_context) { + auto current_context_device_id = context::detail_::get_device_id(current_context_handle); + if (current_context_device_id == device_id) { + return; + } + } + auto device_pc_is_active = device::primary_context::detail_::is_active(device_id); + bool need_refcount_increase = not device_pc_is_active; + auto dev_pc_handle = device::primary_context::detail_::get_handle(device_id, need_refcount_increase); + context::current::detail_::set(dev_pc_handle); + + + // ... which is the equivalent of doing: + // auto status = cudaSetDevice(device_id); + // throw_if_error(status, "Failure setting current device to " + ::std::to_string(device_id)); } /** @@ -67,6 +116,9 @@ inline void set(id_t device) * * @param[in] device_ids Numeric IDs of the devices to try and make current, in order * @param[in] num_devices The number of device IDs pointed to by @device_ids + * + * @note this replaces the current CUDA context (rather than pushing a context + * onto the stack), so use with care. */ inline void set(const id_t* device_ids, size_t num_devices) { @@ -80,25 +132,55 @@ inline void set(const id_t* device_ids, size_t num_devices) /** * @note See the out-of-`detail_::` version of this class. + * + * @note Perhaps it would be better to keep a copy of the current context ID in a + * member of this class, instead of on the stack? + * + * @note we have no guarantee that the context stack is not altered during + * the lifetime of this object; but - we assume it wasn't, and it's up to the users + * of this class to assure that's the case or face the consequences. + * + * @note We don't want to use the cuda::context::detail_scoped_override_t + * as the implementation, since we're not simply pushing and popping */ -class scoped_override_t { -protected: - static id_t replace(id_t new_device_id) + +class scoped_context_override_t { +public: + explicit scoped_context_override_t(id_t device_id) : + device_id_(device_id), + refcount_was_nonzero(device::primary_context::detail_::is_active(device_id)) { - id_t previous_device_id = device::current::detail_::get_id(); - device::current::detail_::set(new_device_id); - return previous_device_id; + auto top_of_context_stack = context::current::detail_::get_handle(); + if (top_of_context_stack != context::detail_::none) { + context::current::detail_::push(top_of_context_stack); // Yes, we're pushing a copy of the same context + } + device::current::detail_::set(device_id); // ... which now gets overwritten at the top of the stack + primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id); + +// auto top_of_context_stack = context::current::detail_::get_handle(); +// device::current::detail_::set(device_id); // ... 
which now gets overwritten at the top of the stack +// primary_context = device::primary_context::detail_::get_handle(device_id); +// context::current::detail_::push(primary_context); } + ~scoped_context_override_t() { + context::current::detail_::pop(); +//#else +// auto popped_context_handle = context::current::detail_::pop(); +// if (popped_context_handle != primary_context_handle) { +// throw ::std::logic_error("Expected the top of the context stack to hold the primary context of " +// + device::detail_::identify(device_id_)); +// } +//#endif + if (refcount_was_nonzero) { + device::primary_context::detail_::decrease_refcount(device_id_); + // We intentionally "leak" a refcount, as otherwise, the primary context + // gets destroyed after we have created it - and we don't want that happening. + } -public: - scoped_override_t(id_t new_device_id) : previous_device_id(replace(new_device_id)) { } - ~scoped_override_t() { - // Note that we have no guarantee that the current device was not - // already replaced while this object was in scope; but - that's life. - replace(previous_device_id); } -private: - id_t previous_device_id; + device::id_t device_id_; + primary_context::handle_t primary_context_handle; + bool refcount_was_nonzero; }; @@ -109,26 +191,21 @@ class scoped_override_t { */ inline void set_to_default() { return detail_::set(device::default_device_id); } -/** - * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. - */ -inline device_t get(); - -/** - * Tells the CUDA runtime API to consider the specified device as the current one. - */ -inline void set(device_t device); +void set(const device_t& device); /** - * A RAII-based mechanism for setting the CUDA Runtime API's current device for + * A RAII-like mechanism for setting the CUDA Runtime API's current device for * what remains of the current scope, and changing it back to its previous value * when exiting the scope. + * + * @note The description says "RAII-like" because the reality is more complex. The + * runtime API sets a device by overwriting the top of the context stack with that + * device's primary context, rather than pushing onto the stack; the implementation + * above therefore duplicates the previous stack top before setting, so that popping + * on destruction restores the prior state. */ -class scoped_override_t : private detail_::scoped_override_t { +class scoped_override_t : private detail_::scoped_context_override_t { protected: - using parent = detail_::scoped_override_t; + using parent = detail_::scoped_context_override_t; public: - scoped_override_t(device_t& device); + scoped_override_t(const device_t& device); scoped_override_t(device_t&& device); ~scoped_override_t() = default; }; diff --git a/src/cuda/api/detail/device_properties.hpp b/src/cuda/api/detail/device_properties.hpp index 4c847fb9..6ffbeaba 100644 --- a/src/cuda/api/detail/device_properties.hpp +++ b/src/cuda/api/detail/device_properties.hpp @@ -1,5 +1,5 @@ /** - * @file detail_/device_properties.hpp + * @file detail/device_properties.hpp * * @brief Implementation of methods and helper functions for device-property-related classes.
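// Illustrative sketch (editorial addition, not from the patched sources) of the RAII-like
// current-device override described above: within the block, the chosen device's primary
// context is current; on scope exit, the previously-current context is restored.
#include <cuda/api.hpp>

void scoped_device_override_example()
{
	auto device = cuda::device::get(0);
	{
		cuda::device::current::scoped_override_t device_for_this_scope{ device };
		// Calls which implicitly target "the current device" now pertain to device 0
		cuda::device::current::get().synchronize();
	}
}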
* @@ -56,7 +56,7 @@ template inline T ensure_arch_property_validity(T v, const compute_architecture_t& arch) { if (v == detail_::invalid_architecture_return) { - throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); + throw ::std::invalid_argument("No known architecture numbered " + ::std::to_string(arch.major)); } return v; } @@ -65,7 +65,7 @@ template <> inline const char* ensure_arch_property_validity(const char* v, const compute_architecture_t& arch) { if (v == nullptr) { - throw ::std::invalid_argument("No architecture numbered " + ::std::to_string(arch.major)); + throw ::std::invalid_argument("No known architecture numbered " + ::std::to_string(arch.major)); } return v; } @@ -86,25 +86,30 @@ inline constexpr const char* architecture_name(const compute_architecture_t& arc nullptr; } +/** + * @note Remember that regardless of the value you get from this function, + * to use more than 48 KiB per block you may need a call such as: + * + * cudaFuncSetAttribute( + * my_kernel, + * cudaFuncAttributePreferredSharedMemoryCarveout, + * cudaSharedmemCarveoutMaxShared + * ); + * + * for details, see the CUDA Programming Guide, section K.7.3 + */ inline constexpr memory::shared::size_t max_shared_memory_per_block(const compute_architecture_t& arch) { return (arch.major == 1) ? 16 * KiB : (arch.major == 2) ? 48 * KiB : (arch.major == 3) ? 48 * KiB : - // Note: No architecture number 4! + // Note: No architecture number 4! (arch.major == 5) ? 48 * KiB : (arch.major == 6) ? 48 * KiB : - (arch.major == 7) ? 96 * KiB : - // this is the Volta figure, Turing is different. Also, values above 48 require a call such as: - // - // cudaFuncSetAttribute( - // my_kernel, - // cudaFuncAttributePreferredSharedMemoryCarveout, - // cudaSharedmemCarveoutMaxShared - // ); - // - // for details, see the CUDA C Programming Guide. + (arch.major == 7) ? 96 * KiB : // of 128 + // this is the Volta figure, Turing is different. + (arch.major == 8) ? 163 * KiB : // of 192 invalid_architecture_return; } @@ -118,6 +123,7 @@ inline unsigned max_resident_warps_per_processor(const compute_architecture_t& a (arch.major == 5) ? 64 : (arch.major == 6) ? 64 : (arch.major == 7) ? 64 : // this is the Volta figure, Turing is different + (arch.major == 8) ? 64 : invalid_architecture_return; } @@ -131,6 +137,7 @@ inline unsigned max_warp_schedulings_per_processor_cycle(const compute_architect (arch.major == 5) ? 4 : (arch.major == 6) ? 4 : (arch.major == 7) ? 4 : + (arch.major == 8) ? 4 : invalid_architecture_return; } @@ -144,6 +151,7 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_arch (arch.major == 5) ? 128 : (arch.major == 6) ? 128 : (arch.major == 7) ? 128 : // this is the Volta figure, Turing is different + (arch.major == 8) ? 2048 : invalid_architecture_return; } @@ -232,6 +240,7 @@ inline constexpr unsigned max_in_flight_threads_per_processor(const compute_capa cc.as_combined_number() == 21 ? 48 : cc.as_combined_number() == 60 ? 64 : cc.as_combined_number() == 75 ? 64 : + cc.as_combined_number() == 86 ? 
1536 : max_in_flight_threads_per_processor(cc.architecture); } @@ -243,11 +252,24 @@ inline constexpr unsigned max_warp_schedulings_per_processor_cycle(const compute max_warp_schedulings_per_processor_cycle(cc.architecture); } +/** + * @note Remember that regardless of the value you get from this function, + * to use more than 48 KiB per block you may need a call such as: + * + * cudaFuncSetAttribute( + * my_kernel, + * cudaFuncAttributePreferredSharedMemoryCarveout, + * cudaSharedmemCarveoutMaxShared + * ); + * + * for details, see the CUDA Programming Guide, section K.7.3 + */ inline constexpr unsigned max_shared_memory_per_block(const compute_capability_t& cc) { return - cc.as_combined_number() == 37 ? 112 * KiB : - cc.as_combined_number() == 75 ? 64 * KiB : + cc.as_combined_number() == 37 ? 112 * KiB : // of 112 + cc.as_combined_number() == 75 ? 64 * KiB : // of 96 + cc.as_combined_number() == 86 ? 99 * KiB : // of 128 max_shared_memory_per_block(cc.architecture); } @@ -259,6 +281,7 @@ inline constexpr unsigned max_resident_warps_per_processor(const compute_capabil cc.as_combined_number() == 12 ? 32 * KiB : cc.as_combined_number() == 13 ? 32 * KiB : cc.as_combined_number() == 75 ? 32 * KiB : + cc.as_combined_number() == 86 ? 48 * KiB : max_resident_warps_per_processor(cc.architecture); } @@ -295,6 +318,25 @@ inline bool properties_t::usable_for_compute() const noexcept } // namespace device } // namespace cuda +namespace std { + + template <> + struct hash + { + ::std::size_t operator()(const cuda::device::compute_capability_t& cc) const + { + using ::std::hash; + + // Compute individual hash values for first, + // second and third and combine them using XOR + // and bit shifting: + + return hash()(cc.major()) ^ (hash()(cc.minor()) << 1); + } + }; + +} // namespace std + ///@endcond #endif // CUDA_API_WRAPPERS_DETAIL_DEVICE_PROPERTIES_HPP_ diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index fecc6d67..3a80dce7 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -9,15 +9,18 @@ #ifndef CUDA_API_WRAPPERS_DEVICE_HPP_ #define CUDA_API_WRAPPERS_DEVICE_HPP_ +#include #include #include #include #include -#include +#include +#include #include #include +#include #include namespace cuda { @@ -28,8 +31,27 @@ class stream_t; class device_t; ///@endcond +/** + * @brief Waits for all previously-scheduled tasks on all streams (= queues) + * on a specified device to conclude. + * + * Depending on the host_thread_synch_scheduling_policy_t set for this + * device, the thread calling this method will either yield, spin or block + * until all tasks scheduled previously scheduled on this device have been + * concluded. + */ +void synchronize(const device_t& device); + namespace device { +///@cond +class primary_context_t; +///@cendond + +using limit_t = context::limit_t; +using limit_value_t = context::limit_value_t; +using shared_memory_bank_size_t = context::shared_memory_bank_size_t; + namespace detail_ { /** @@ -41,99 +63,48 @@ namespace detail_ { * (see below) - but chose not too for consistency with other wrappers * and to avoid requiring multiple friend functions. */ -device_t wrap(id_t id) noexcept; +device_t wrap(id_t id, primary_context::handle_t primary_context_handle = context::detail_::none) noexcept; } -namespace peer_to_peer { - -/** - * @brief The value of type for all CUDA device "attributes"; see also @ref cuda::device::attribute_t. - */ -using attribute_value_t = int; - -/** - * @brief An identifier of a integral-numeric-value attribute of a CUDA device. 
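// Illustrative sketch (editorial addition, not from the patched sources): the std::hash
// specialization added in detail/device_properties.hpp above lets compute_capability_t
// serve as an unordered-container key - assuming an equality operator for
// compute_capability_t is available, as device_properties.hpp is expected to provide.
#include <cuda/api.hpp>
#include <unordered_set>
#include <vector>

std::unordered_set<cuda::device::compute_capability_t>
distinct_compute_capabilities(const std::vector<cuda::device_t>& devices)
{
	std::unordered_set<cuda::device::compute_capability_t> capabilities;
	for (const auto& device : devices) {
		capabilities.insert(device.compute_capability());
	}
	return capabilities;
}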
- * - * @note Somewhat annoyingly, CUDA devices have attributes, properties and flags. - * Attributes have integral number values; properties have all sorts of values, - * including arrays and limited-length strings (see - * @ref cuda::device::properties_t), and flags are either binary or - * small-finite-domain type fitting into an overall flags value (see - * @ref cuda::device_t::flags_t). Flags and properties are obtained all at once, - * attributes are more one-at-a-time. - */ -using attribute_t = cudaDeviceP2PAttr; +using stream_priority_range_t = context::stream_priority_range_t; -/** - * Aliases for all CUDA device attributes - */ -enum : ::std::underlying_type::type { - link_performance_rank = cudaDevP2PAttrPerformanceRank, /**< A relative value indicating the performance of the link between two devices */ //!< link_performance_rank - access_support = cudaDevP2PAttrAccessSupported, /**< 1 if access is supported, 0 otherwise */ //!< access_support - native_atomics_support = cudaDevP2PAttrNativeAtomicSupported /**< 1 if the first device can perform native atomic operations on the second device, 0 otherwise *///!< native_atomics_support -}; +namespace detail_ { -/** - * @brief Get one of the numeric attributes for a(n ordered) pair of devices, - * relating to their interaction - * - * @note This is the device-pair equivalent of @ref device_t::get_attribute() - * - * @param attribute identifier of the attribute of interest - * @param source source device - * @param destination destination device - * @return the numeric attribute value - */ -inline attribute_value_t get_attribute(attribute_t attribute, id_t source, id_t destination) +inline ::std::string get_name(id_t id) { - attribute_value_t value; - auto status = cudaDeviceGetP2PAttribute(&value, attribute, source, destination); - throw_if_error(status, - "Failed obtaining peer-to-peer device attribute for device pair (" + ::std::to_string(source) + ", " - + ::std::to_string(destination) + ')'); - return value; + initialize_driver(); + using size_type = int; // Yes, an int, that's what cuDeviceName takes + constexpr const size_type initial_size_reservation { 100 }; + constexpr const size_type larger_size { 1000 }; // Just in case + char stack_buffer[initial_size_reservation]; + char* buffer = stack_buffer; + auto buffer_size = (size_type) (sizeof(stack_buffer) / sizeof(char)); + auto try_getting_name = [&](char* buffer, size_type buffer_size) -> size_type { + auto status = cuDeviceGetName(buffer, buffer_size-1, id); + throw_if_error(status, "Failed obtaining the CUDA device name"); + buffer[buffer_size-1] = '\0'; + return (size_type) ::std::strlen(buffer); + }; + auto prospective_name_length = try_getting_name(buffer, initial_size_reservation); + if (prospective_name_length >= buffer_size - 1) { + // This should really not happen, but just for the off chance... + if (buffer != stack_buffer) { delete buffer; } + buffer = new char[larger_size]; + prospective_name_length = try_getting_name(buffer, buffer_size); + } + if (prospective_name_length >= buffer_size - 1) { + throw ::std::runtime_error("CUDA device name longer than expected maximum size " + ::std::to_string(larger_size)); + } + return { buffer, (::std::size_t) prospective_name_length }; } -attribute_value_t get_attribute( - attribute_t attribute, - device_t source, - device_t destination); - -} // namespace peer_to_peer - -/** - * A range of priorities supported by a CUDA device; ranges from the - * higher numeric value to the lower. 
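// Illustrative sketch (editorial addition, not from the patched sources): device_t::name(),
// declared later in this file, is backed by the driver-based detail_::get_name() above.
// Enumerating devices by index and name might look like this; device::count() is assumed
// to be available, as referenced by devices.hpp further down in this patch.
#include <cuda/api.hpp>
#include <iostream>

void list_device_names()
{
	for (cuda::device::id_t id = 0; id < cuda::device::count(); id++) {
		std::cout << "Device " << id << ": " << cuda::device::get(id).name() << '\n';
	}
}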
- */ -struct stream_priority_range_t { - stream::priority_t least; /// Higher numeric value, lower priority - stream::priority_t greatest; /// Lower numeric value, higher priority - - /** - * When true, stream prioritization is not supported, i.e. all streams have - * "the same" priority - the default one. - */ - constexpr bool is_trivial() const { - return least == stream::default_priority and greatest == stream::default_priority; - } -}; +} // namespace detail } // namespace device - /** - * @brief Suspends execution until all previously-scheduled tasks on - * the specified device (all contexts, all streams) have concluded. - * - * Depending on the host_thread_synch_scheduling_policy_t set for this - * device, the thread calling this method will either yield, spin or block - * until this completion. - */ -inline void synchronize(device_t device); - -/** - * @brief Proxy class for a CUDA device + * @brief Wrapper class for a CUDA device * * Use this class - built around a device ID, or for the current device - to * perform almost, if not all, device-related operations, as opposed to passing @@ -141,111 +112,29 @@ inline void synchronize(device_t device); * * @note this is one of the three main classes in the Runtime API wrapper library, * together with @ref cuda::stream_t and @ref cuda::event_t + * + * @note obtaining device LUID's is not supported (those are too graphics-specific) + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to properties of the device is a const-respecting operation on this class. */ class device_t { public: // types using properties_t = device::properties_t; using attribute_value_t = device::attribute_value_t; - using limit_t = size_t; + using flags_type = device::flags_t; - using resource_id_t = cudaLimit; + // TODO: Consider a scoped/unscoped dichotomy + context_t::global_memory_type memory() const { return primary_context(unscoped_).memory(); } protected: // types - using scoped_setter_t = device::current::detail_::scoped_override_t; - using flags_t = unsigned; - -public: // types - - /** - * @brief A class to create a faux member in a @ref device_t, in lieu of an in-class - * namespace (which C++ does not support); whenever you see a function - * `my_dev.memory::foo()`, think of it as a `my_dev::memory::foo()`. - */ - class global_memory_t { - protected: - const device::id_t device_id_; - - using deleter = memory::device::detail_::deleter; - using allocator = memory::device::detail_::allocator; - - public: - ///@cond - explicit global_memory_t(device::id_t id) : device_id_(id) { } - ///@endcond - - cuda::device_t associated_device() const { return device::detail_::wrap(device_id_); } - - /** - * Allocate a region of memory on the device - * - * @param size_in_bytes size in bytes of the region of memory to allocate - * @return a non-null (device-side) pointer to the allocated memory - */ - memory::region_t allocate(size_t size_in_bytes) const - { - scoped_setter_t set_device_for_this_scope(device_id_); - return memory::device::detail_::allocate(size_in_bytes); - } - - // Perhaps drop this? it should really go into a managed namespace - using initial_visibility_t = cuda::memory::managed::initial_visibility_t; - - /** - * Allocates memory on the device whose pointer is also visible on the host, - * and possibly on other devices as well - with the same address. This is - * nVIDIA's "managed memory" mechanism. 
- * - * @note Managed memory isn't as "strongly associated" with a single device - * as the result of allocate(), since it can be read or written from any - * device or from the host. However, the actual space is allocated on - * some device, so its creation is a device (device_t) object method. - * - * @note for a more complete description see the - * CUDA Runtime API - * reference) - * - * @param size_in_bytes Size of memory region to allocate - * @param initial_visibility if this equals ,to_supporters_of_concurrent_managed_access\ only the host (and the - * allocating device) will be able to utilize the pointer returned; if false, - * it will be made usable on all CUDA devices on the systems. - * @return the allocated pointer; never returns null (throws on failure) - */ - memory::region_t allocate_managed( - size_t size_in_bytes, - initial_visibility_t initial_visibility = - initial_visibility_t::to_supporters_of_concurrent_managed_access) const - { - scoped_setter_t set_device_for_this_scope(device_id_); - return cuda::memory::managed::detail_::allocate(size_in_bytes, initial_visibility); - } + using context_setter_type = context::current::detail_::scoped_override_t; + // Note the context setter only affects the _currency_ of a context, not the + // activity of a primary context - /** - * Amount of total global memory on the CUDA device. - */ - size_t amount_total() const - { - scoped_setter_t set_device_for_this_scope(device_id_); - size_t total_mem_in_bytes; - auto status = cudaMemGetInfo(nullptr, &total_mem_in_bytes); - throw_if_error(status, "Failed determining amount of total memory for " + device::detail_::identify(device_id_)); - return total_mem_in_bytes; - } +protected: // constants + enum : bool { scoped_ = true, unscoped_ = false }; - /** - * Amount of memory on the CUDA device which is free and may be - * allocated for other uses. - * - * @note No guarantee of this free memory being contigous. 
- */ - size_t amount_free() const - { - scoped_setter_t set_device_for_this_scope(device_id_); - size_t free_mem_in_bytes; - auto status = cudaMemGetInfo(&free_mem_in_bytes, nullptr); - throw_if_error(status, "Failed determining amount of free memory for CUDA " + device::detail_::identify(device_id_)); - return free_mem_in_bytes; - } - }; // class global_memory_t +public: /** * @brief Determine whether this device can access the global memory @@ -256,10 +145,12 @@ class device_t { */ bool can_access(device_t peer) const { + context_setter_type set_for_this_scope(primary_context_handle()); int result; - auto status = cudaDeviceCanAccessPeer(&result, id(), peer.id()); - throw_if_error(status, "Failed determining whether CUDA device " + device::detail_::identify(id()) + " can access CUDA device " - + device::detail_::identify(peer.id())); + auto status = cuDeviceCanAccessPeer(&result, id(), peer.id()); + throw_if_error(status, "Failed determining whether " + + device::detail_::identify(id_) + " can access " + + device::detail_::identify(peer.id_)); return (result == 1); } @@ -268,14 +159,9 @@ class device_t { * * @param peer the device to which to enable access */ - void enable_access_to(device_t peer) const + void enable_access_to(const device_t& peer) const { - enum : unsigned {fixed_flags = 0 }; - // No flags are supported as of CUDA 8.0 - scoped_setter_t set_device_for_this_scope(id()); - auto status = cudaDeviceEnablePeerAccess(peer.id(), fixed_flags); - throw_if_error(status, - "Failed enabling access of " + device::detail_::identify(id()) + " to " + ::std::to_string(peer.id())); + primary_context(scoped_).enable_access_to(peer.primary_context(scoped_)); } /** @@ -283,48 +169,69 @@ class device_t { * * @param peer the device to which to disable access */ - void disable_access_to(device_t peer) const + void disable_access_to(const device_t& peer) const { - scoped_setter_t set_device_for_this_scope(id()); - auto status = cudaDeviceDisablePeerAccess(peer.id()); - throw_if_error(status, - "Failed disabling access of device " + ::std::to_string(id()) + " to device " + ::std::to_string(peer.id())); + primary_context(scoped_).disable_access_to(peer.primary_context(scoped_)); } -protected: - void set_flags(flags_t new_flags) const - { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaSetDeviceFlags(new_flags); - throw_if_error(status, "Failed setting the flags for " + device::detail_::identify(id_)); - } - void set_flags( - host_thread_synch_scheduling_policy_t synch_scheduling_policy, - bool keep_larger_local_mem_after_resize) const - { - set_flags( (flags_t) - synch_scheduling_policy // this enum value is also a valid bitmask - | (keep_larger_local_mem_after_resize ? cudaDeviceLmemResizeToMax : 0)); + uuid_t uuid () const { + uuid_t result; + auto status = cuDeviceGetUuid(&result, id_); + throw_if_error(status, "Failed obtaining UUID for " + device::detail_::identify(id_)); + return result; } - flags_t flags() const - { - scoped_setter_t set_device_for_this_scope(id_); - flags_t flags; - auto status = cudaGetDeviceFlags(&flags); - throw_if_error(status, "Failed obtaining the flags for " + device::detail_::identify(id_)); - return flags; +protected: + void cache_and_ensure_primary_context_activation() const { + if (primary_context_handle_ == context::detail_::none) { + primary_context_handle_ = device::primary_context::detail_::obtain_and_increase_refcount(id_); + // The refcount should now be non-zero until we destruct this device_t! 
+ } } + context::handle_t primary_context_handle() const + { + cache_and_ensure_primary_context_activation(); + return primary_context_handle_; + } + + +public: + /** + * Produce a proxy for the device's primary context - the one used by runtime API calls. + * + * @param scoped When true, the primary proxy object returned will not perform its + * own reference accounting, and will assume the primary context is active while + * this device object exists. When false, the returned primary context proxy object + * _will_ take care of its own reference count unit, and can outlive this object. + */ + device::primary_context_t primary_context(bool scoped) const; + + public: /** - * @brief Obtains a proxy for the device's global memory + * Produce a proxy for the device's primary context - the one used by runtime API calls. + * + * @note The CUDA driver reference counting for the primary scope is "taken core of" + * with this call, i.e. the caller does not need to add/decrease the refcount, and the + * object can safely outlive the device_t proxy object which created it. */ - global_memory_t memory() const { return global_memory_t{ id_ }; }; + device::primary_context_t primary_context() const { return primary_context(unscoped_); } + + void set_flags(flags_type new_flags) const + { + cache_and_ensure_primary_context_activation(); + auto status = cuDevicePrimaryCtxSetFlags(id(), new_flags); + throw_if_error(status, "Failed setting the flags for " + device::detail_::identify(id_)); + } + +public: /** * Obtains the (mostly) non-numeric properties for this device. + * + * @todo get rid of this in favor of individual properties only. */ properties_t properties() const { @@ -334,8 +241,7 @@ class device_t { return properties; } - static device_t choose_best_match(const properties_t& properties) - { + static device_t choose_best_match(const properties_t& properties) { device::id_t id; auto status = cudaChooseDevice(&id, &properties); throw_if_error(status, "Failed choosing a best matching device by a a property set."); @@ -347,10 +253,24 @@ class device_t { */ ::std::string name() const { - // I could get the name directly, but that would require - // direct use of the driver, and I'm not ready for that - // just yet - return properties().name; + // If I were lazy, I would just write: + // return properties().name; + // and let you wait for all of that to get populated. But not me! + return cuda::device::detail_::get_name(id_); + } + + /** + * Obtain a numeric-value attribute of the device + * + * @note See @ref device::attribute_t for explanation about attributes, + * properties and flags. 
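// Illustrative sketch (editorial addition, not from the patched sources), combining the
// peer-access methods and the primary-context accessors above: enable bidirectional peer
// access when possible, then obtain a non-scoped primary context proxy, which - per the
// note above - manages its own reference-count unit and may outlive this device_t.
#include <cuda/api.hpp>

void peer_access_and_primary_context(cuda::device_t& first, cuda::device_t& second)
{
	if (first.can_access(second) and second.can_access(first)) {
		first.enable_access_to(second);
		second.enable_access_to(first);
	}
	auto primary = first.primary_context(); // owns its own refcount unit
	(void) primary;
}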
+ */ + attribute_value_t get_attribute(device::attribute_t attribute) const + { + attribute_value_t attribute_value; + auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_); + throw_if_error(status, "Failed obtaining device properties for " + device::detail_::identify(id_)); + return attribute_value; } /** @@ -359,44 +279,52 @@ class device_t { */ device::pci_location_t pci_id() const { - auto pci_domain_id = get_attribute(cudaDevAttrPciDomainId); - auto pci_bus_id = get_attribute(cudaDevAttrPciBusId); - auto pci_device_id = get_attribute(cudaDevAttrPciDeviceId); + auto pci_domain_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID); + auto pci_bus_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID); + auto pci_device_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID); return {pci_domain_id, pci_bus_id, pci_device_id}; } + device::multiprocessor_count_t multiprocessor_count() const + { + return get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); + } + /** - * Obtains the device's hardware architecture generation numeric - * designator see @ref cuda::device::compute_architecture_t + * True if the device supports the facilities under namespace @ref memory::virtual_ + * including the separation of memory allocation from address range mapping, and + * the possibility of changing mapping after allocation. */ - device::compute_architecture_t architecture() const + bool supports_virtual_memory_management() const { - unsigned major = get_attribute(cudaDevAttrComputeCapabilityMajor); - return {major}; + return get_attribute( +#if CUDA_VERSION >= 11030 + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED +#else + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#endif + ); + } /** - * Obtains the device's compute capability; see @ref cuda::device::compute_capability_t + * Obtains the device's hardware architecture generation numeric + * designator see @ref cuda::device::compute_architecture_t */ - device::compute_capability_t compute_capability() const + device::compute_architecture_t architecture() const { - auto arch = architecture(); - unsigned minor = get_attribute(cudaDevAttrComputeCapabilityMinor); - return {arch, minor}; + unsigned major = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + return { major }; } /** - * Obtain a numeric-value attribute of the device - * - * @note See @ref device::attribute_t for explanation about attributes, - * properties and flags. 
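// Illustrative sketch (editorial addition, not from the patched sources): querying
// driver-API device attributes through the wrapper methods defined above. As the
// cuDeviceGetAttribute() call above implies, a raw CUdevice_attribute enumerator is
// passed to get_attribute().
#include <cuda/api.hpp>
#include <iostream>

void print_device_info(const cuda::device_t& device)
{
	std::cout
		<< device.name() << ": "
		<< device.multiprocessor_count() << " multiprocessors, "
		<< device.get_attribute(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) << " async engines\n";
}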
+ * Obtains the device's compute capability; see @ref cuda::device::compute_capability_t */ - attribute_value_t get_attribute(device::attribute_t attribute) const + device::compute_capability_t compute_capability() const { - attribute_value_t attribute_value; - auto ret = cudaDeviceGetAttribute(&attribute_value, attribute, id()); - throw_if_error(ret, "Failed obtaining device properties for " + device::detail_::identify(id_)); - return attribute_value; + auto major = architecture(); + unsigned minor = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return {major, minor}; } /** @@ -405,7 +333,7 @@ class device_t { */ bool supports_concurrent_managed_access() const { - return get_attribute(cudaDevAttrConcurrentManagedAccess); + return (get_attribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) != 0); } /** @@ -414,7 +342,7 @@ class device_t { */ bool supports_block_cooperation() const { - return get_attribute(cudaDevAttrCooperativeLaunch); + return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH); } /** @@ -423,22 +351,18 @@ class device_t { * * @param resource which resource's limit to obtain */ - limit_t get_limit(resource_id_t resource) const + device::limit_value_t get_limit(device::limit_t limit) const { - limit_t limit; - auto status = cudaDeviceGetLimit(&limit, resource); - throw_if_error(status, "Failed obtaining a resource limit for " + device::detail_::identify(id_)); - return limit; + return primary_context(scoped_).get_limit(limit); } /** * Set the upper limit of one of the named numeric resources * on this device */ - void set_limit(resource_id_t resource, limit_t new_limit) const + void set_limit(device::limit_t limit, device::limit_value_t new_value) const { - auto status = cudaDeviceSetLimit(resource, new_limit); - throw_if_error(status, "Failed setting a resource limit for " + device::detail_::identify(id_)); + primary_context(scoped_).set_limit(limit, new_value); } /** @@ -450,9 +374,28 @@ class device_t { * until all tasks scheduled previously scheduled on this device have been * concluded. */ - void synchronize() const + const device_t& synchronize() const + { + cuda::synchronize(*this); + return *this; + } + + device_t& synchronize() { cuda::synchronize(*this); + return *this; + } + + const device_t& make_current() const + { + device::current::set(*this); + return *this; + } + + device_t& make_current() + { + device::current::set(*this); + return *this; } /** @@ -463,9 +406,20 @@ class device_t { */ void reset() const { - scoped_setter_t set_device_for_this_scope(id_); - status_t status = cudaDeviceReset(); - throw_if_error(status, "Resetting " + device::detail_::identify(id_)); + // Notes: + // + // 1. We _cannot_ use cuDevicePrimaryCtxReset() - because that one only affects + // the device's primary context, while cudaDeviceReset() destroys _all_ contexts for + // the device. + // 2. We don't need the primary context to be active here, so not using the usual + // primary_context_handle() getter mechanism. + + auto pc_handle = (primary_context_handle_ == context::detail_::none) ? 
+ device::primary_context::detail_::obtain_and_increase_refcount(id_) : + primary_context_handle_; + context_setter_type set_context_for_this_scope{pc_handle}; + auto status = cudaDeviceReset(); + throw_if_error(status, "Resetting " + device::detail_::identify(id_)); } /** @@ -476,10 +430,7 @@ class device_t { */ void set_cache_preference(multiprocessor_cache_preference_t preference) const { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaDeviceSetCacheConfig((cudaFuncCache) preference); - throw_if_error(status, - "Setting the multiprocessor L1/Shared Memory cache distribution preference for " + device::detail_::identify(id_)); + primary_context(scoped_).set_cache_preference(preference); } /** @@ -488,12 +439,7 @@ class device_t { */ multiprocessor_cache_preference_t cache_preference() const { - scoped_setter_t set_device_for_this_scope(id_); - cudaFuncCache raw_preference; - auto status = cudaDeviceGetCacheConfig(&raw_preference); - throw_if_error(status, - "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + device::detail_::identify(id_)); - return (multiprocessor_cache_preference_t) raw_preference; + return primary_context(scoped_).cache_preference(); } /** @@ -502,11 +448,9 @@ class device_t { * * @param new_bank_size the shared memory bank size to set, in bytes */ - void set_shared_memory_bank_size(memory::shared::bank_size_configuration_t new_bank_size) const + void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const { - scoped_setter_t set_device_for_this_scope(id_); - auto status = cudaDeviceSetSharedMemConfig(new_bank_size); - throw_if_error(status, "Setting the multiprocessor shared memory bank size for " + device::detail_::identify(id_)); + primary_context(scoped_).set_shared_memory_bank_size(new_bank_size); } /** @@ -515,13 +459,9 @@ class device_t { * * @return the shared memory bank size in bytes */ - memory::shared::bank_size_configuration_t shared_memory_bank_size() const + device::shared_memory_bank_size_t shared_memory_bank_size() const { - scoped_setter_t set_device_for_this_scope(id_); - memory::shared::bank_size_configuration_t bank_size; - auto status = cudaDeviceGetSharedMemConfig(&bank_size); - throw_if_error(status, "Obtaining the multiprocessor shared memory bank size for " + device::detail_::identify(id_)); - return bank_size; + return primary_context(scoped_).shared_memory_bank_size(); } // For some reason, there is no cudaFuncGetCacheConfig. Weird. @@ -534,21 +474,16 @@ class device_t { * Return the proxied device's ID * */ - device::id_t id() const + device::id_t id() const noexcept { return id_; } - stream_t default_stream() const noexcept; + stream_t default_stream() const; - // I'm a worried about the creation of streams with the assumption - // that theirs is the current device, so I'm just forbidding it - // outright here - even though it's very natural to want to write - // - // cuda::device::curent::get().create_stream() - // - // (sigh)... 
safety over convenience I guess - // + /** + * See @ref cuda::stream::create() + */ stream_t create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority = cuda::stream::default_priority) const; @@ -559,7 +494,11 @@ class device_t { event_t create_event( bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default bool records_timing = event::do_record_timings, - bool interprocess = event::not_interprocess) const; + bool interprocess = event::not_interprocess); + + context_t create_context( + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy = context::heuristic, + bool keep_larger_local_mem_after_resize = false) const; template void launch( @@ -576,97 +515,101 @@ class device_t { */ device::stream_priority_range_t stream_priority_range() const { - scoped_setter_t set_device_for_this_scope(id_); - stream::priority_t least, greatest; - auto status = cudaDeviceGetStreamPriorityRange(&least, &greatest); - throw_if_error(status, "Failed obtaining stream priority range for " + device::detail_::identify(id_)); - return {least, greatest}; + return primary_context(scoped_).stream_priority_range(); } public: - host_thread_synch_scheduling_policy_t synch_scheduling_policy() const + // TODO: Make the primary context do this (when that's even possible) + + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy() const { - return (host_thread_synch_scheduling_policy_t) (flags() & cudaDeviceScheduleMask); + return primary_context(scoped_).synch_scheduling_policy(); } - void set_synch_scheduling_policy(host_thread_synch_scheduling_policy_t new_policy) const + void set_synch_scheduling_policy(context::host_thread_synch_scheduling_policy_t new_policy) { - auto other_flags = flags() & ~cudaDeviceScheduleMask; - set_flags(other_flags | (flags_t) new_policy); + primary_context().set_synch_scheduling_policy(new_policy); } bool keeping_larger_local_mem_after_resize() const { - return flags() & cudaDeviceLmemResizeToMax; + return primary_context().keeping_larger_local_mem_after_resize(); } - void keep_larger_local_mem_after_resize(bool keep = true) const + void keep_larger_local_mem_after_resize(bool keep = true) { - auto flags_ = flags(); - if (keep) { - flags_ |= cudaDeviceLmemResizeToMax; - } else { - flags_ &= ~cudaDeviceLmemResizeToMax; - } - set_flags(flags_); + primary_context().keep_larger_local_mem_after_resize(keep); } - void dont_keep_larger_local_mem_after_resize() const + void dont_keep_larger_local_mem_after_resize() { - keep_larger_local_mem_after_resize(false); + primary_context().keep_larger_local_mem_after_resize(false); } -public: - /** - * @brief Makes this device the CUDA Runtime API's current device - * - * @note a non-current device becoming current will not stop its methods from - * always expressly setting the current device before doing anything(!) 
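// Illustrative sketch (editorial addition, not from the patched sources): creating a stream
// and an event via the factory methods declared above, then waiting for the device. Only
// methods visible in this patch are used; the actual work enqueued on the stream is elided.
#include <cuda/api.hpp>

void stream_and_event_example(cuda::device_t& device)
{
	auto stream = device.create_stream(false /* will_synchronize_with_default_stream */);
	auto event  = device.create_event(cuda::event::sync_by_busy_waiting);
	// ... enqueue kernel launches and copies on `stream` here ...
	event.record();       // recorded on the device's default stream (see event.hpp below)
	device.synchronize(); // wait for all streams on the device, including `stream`
}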
- */ - device_t& make_current() +protected: + void maybe_decrease_primary_context_refcount() const { - device::current::detail_::set(id()); - return *this; + if (primary_context_handle_ != context::detail_::none) { + device::primary_context::detail_::decrease_refcount(id_); + } } - const device_t& make_current() const - { - device::current::detail_::set(id()); - return *this; - } - public: // constructors and destructor - ~device_t() noexcept = default; - device_t(device_t&& other) noexcept = default; - device_t(const device_t& other) noexcept = default; + friend void swap(device_t& lhs, device_t& rhs) noexcept + { + ::std::swap(lhs.id_, rhs.id_); + ::std::swap(lhs.primary_context_handle_, rhs.primary_context_handle_); + } + ~device_t() { maybe_decrease_primary_context_refcount(); } + device_t(device_t&& other) noexcept : id_(other.id_) + { + swap(*this, other); + } + + device_t(const device_t& other) noexcept : id_(other.id_) { } // Device proxies are not owning - as devices aren't allocated nor de-allocated. - // Also, the proxies don't hold any state - it's the devices _themselves_ which - // have state. ; so there's no problem copying the proxies around. This is - // unlike events and streams, which get created and destroyed. - device_t& operator=(const device_t& other) noexcept = default; - device_t& operator=(device_t&& other) noexcept = default; + // Also, the proxies don't hold any state (except for one bit regarding whether + // or not the device proxy has increased the primary context refcount); it's + // the devices _themselves_ which have state; so there's no problem copying + // the proxies around. This is unlike events and streams, which get created + // and destroyed. + + device_t& operator=(const device_t& other) noexcept + { + maybe_decrease_primary_context_refcount(); + id_ = other.id_; + primary_context_handle_ = other.primary_context_handle_; + return *this; + } + + device_t& operator=(device_t&& other) noexcept + { + swap(*this, other); + return *this; + } protected: // constructors /** - * @note Only @ref device::current::get() and @ref device::get() should be + * @note Only @ref device::detail_::wrap() and @ref device::get() should be * calling this one. */ - explicit device_t(device::id_t device_id) noexcept : id_( device_id ) { } + explicit device_t( + device::id_t device_id, + device::primary_context::handle_t primary_context_handle = context::detail_::none) noexcept + : id_(device_id), primary_context_handle_(primary_context_handle) { } public: // friends - friend device_t device::detail_::wrap(device::id_t) noexcept; - -protected: - // data members + friend device_t device::detail_::wrap(device::id_t, device::primary_context::handle_t handle) noexcept; - /** - * The numeric ID of the proxied device. - */ - device::id_t id_; +protected: // data members + device::id_t id_; /// Numeric ID of the proxied device. 
+ mutable device::primary_context::handle_t primary_context_handle_ { context::detail_::none }; + /// Most work involving a device actually occurs using its primary context; we cache the handle + /// to this context here - albeit not necessary on construction }; ///@cond @@ -685,12 +628,13 @@ namespace device { namespace detail_ { -inline device_t wrap(id_t id) noexcept +inline device_t wrap(id_t id, primary_context::handle_t primary_context_handle) noexcept { - return device_t{ id }; + return device_t{ id, primary_context_handle }; } } // namespace detail_ + /** * Returns a proxy for the CUDA device with a given id * @@ -698,21 +642,36 @@ inline device_t wrap(id_t id) noexcept * @note direct constructor access is blocked so that you don't get the * idea you're actually creating devices */ -inline device_t get(id_t device_id) noexcept +inline device_t get(id_t id) noexcept { - return detail_::wrap(device_id); + ensure_driver_is_initialized(); // The device_t class mostly assumes the driver has been initialized + return detail_::wrap(id); } +/** + * A named constructor idiom for a "dummy" CUDA device representing the CPU. + * + * @note Only use this idiom when comparing the results of functions returning + * locations, which can be either a GPU device or the CPU; any other use will likely + * result in a runtime error being thrown. + */ +inline device_t cpu() { return get(CU_DEVICE_CPU); } + namespace current { /** - * Returns the current device in a wrapper which assumes it is indeed - * current, i.e. which will not set the current device before performing any - * other actions. + * Obtains (a proxy for) the device which the CUDA runtime API considers to be current. */ -inline device_t get() { return device::detail_::wrap(detail_::get_id()); } +inline device_t get() +{ + ensure_driver_is_initialized(); + return device::get(detail_::get_id()); +} -inline void set(device_t device) { detail_::set(device.id()); } +/** + * Tells the CUDA runtime API to consider the specified device as the current one. + */ +inline void set(const device_t& device) { detail_::set(device.id()); } } // namespace current @@ -747,15 +706,6 @@ inline device_t get(const ::std::string& pci_id_str) } // namespace device -inline void synchronize(device_t device) -{ - auto device_id = device.id(); - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - auto status = cudaDeviceSynchronize(); - throw_if_error(status, "Failed synchronizing " + ::std::to_string(device_id)); -} - - } // namespace cuda #endif // CUDA_API_WRAPPERS_DEVICE_HPP_ diff --git a/src/cuda/api/device_properties.hpp b/src/cuda/api/device_properties.hpp index 90e6909f..92d708b3 100644 --- a/src/cuda/api/device_properties.hpp +++ b/src/cuda/api/device_properties.hpp @@ -11,7 +11,8 @@ #include #include -#include + +#include #include @@ -33,6 +34,11 @@ namespace cuda { namespace device { +/** + * Type of the number of mutiprocessors within a single GPU. 
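// Illustrative sketch (editorial addition, not from the patched sources): the ways of
// obtaining a device_t shown in device.hpp above - by index, by PCI id string, and via
// the current-device mechanism. The PCI id string used here is purely hypothetical.
#include <cuda/api.hpp>
#include <string>

void device_lookup_examples()
{
	auto by_index  = cuda::device::get(0);
	auto by_pci_id = cuda::device::get(::std::string("0000:01:00.0")); // hypothetical PCI id
	by_index.make_current();
	auto current = cuda::device::current::get();
	(void) by_pci_id; (void) current;
}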
+ */ +using multiprocessor_count_t = int; + /** * A numeric designator of an architectural generation of CUDA devices * @@ -96,8 +102,6 @@ struct compute_capability_t { * setting */ memory::shared::size_t max_shared_memory_per_block() const; - - }; /** diff --git a/src/cuda/api/devices.hpp b/src/cuda/api/devices.hpp index 2b1da1b8..30a1c3d0 100644 --- a/src/cuda/api/devices.hpp +++ b/src/cuda/api/devices.hpp @@ -24,7 +24,7 @@ class all_devices { using reference = value_type; // device_t is already a reference type; and there is no instance-of-device_t here to reference using const_reference = const value_type; // ditto using size_type = decltype(device::count()); - using difference_type = typename std::make_signed::type; + using difference_type = typename ::std::make_signed::type; class index_based_iterator { public: @@ -121,6 +121,11 @@ class all_devices { return index_based_iterator(num_devices_, index_ - n); } + difference_type operator-(const index_based_iterator& other) const + { + return this->index_ - other.index_; + } + size_type index() const { return index_; } size_type num_devices() const { return num_devices_; } diff --git a/src/cuda/api/error.hpp b/src/cuda/api/error.hpp index 2f128085..02e32ddf 100644 --- a/src/cuda/api/error.hpp +++ b/src/cuda/api/error.hpp @@ -1,17 +1,26 @@ /** * @file error.hpp * - * @brief Facilities for exception-based handling of Runtime API - * errors, including a basic exception class wrapping - * `::std::runtime_error`. + * @brief Facilities for exception-based handling of Runtime + * and Driver API errors, including a basic exception class + * wrapping `::std::runtime_error`. + * + * @note Does not - for now - support wrapping errors generated + * by other CUDA-related libraries like NVRTC. + * + * @note Unlike the Runtime API, the driver API has no memory + * of "non-sticky" errors, which do not corrupt the current + * context. 
+ * */ #pragma once #ifndef CUDA_API_WRAPPERS_ERROR_HPP_ #define CUDA_API_WRAPPERS_ERROR_HPP_ -#include - +#include #include +#include + #include #include #include @@ -28,8 +37,13 @@ namespace status { enum named_t : ::std::underlying_type::type { success = cudaSuccess, missing_configuration = cudaErrorMissingConfiguration, - memory_allocation = cudaErrorMemoryAllocation, - initialization_error = cudaErrorInitializationError, + memory_allocation_failure = cudaErrorMemoryAllocation, // == CUDA_ERROR_OUT_OF_MEMORY + initialization_error = cudaErrorInitializationError, // == CUDA_ERROR_NOT_INITIALIZED + already_deinitialized = cudaErrorCudartUnloading, // == CUDA_ERROR_DEINITIALIZED + profiler_disabled = cudaErrorProfilerDisabled, + profiler_not_initialized = cudaErrorProfilerNotInitialized, + profiler_already_started = cudaErrorProfilerAlreadyStarted, + profiler_already_stopped = cudaErrorProfilerAlreadyStopped, launch_failure = cudaErrorLaunchFailure, prior_launch_failure = cudaErrorPriorLaunchFailure, launch_timeout = cudaErrorLaunchTimeout, @@ -55,7 +69,6 @@ enum named_t : ::std::underlying_type::type { invalid_filter_setting = cudaErrorInvalidFilterSetting, invalid_norm_setting = cudaErrorInvalidNormSetting, mixed_device_execution = cudaErrorMixedDeviceExecution, - cuda_runtime_unloading = cudaErrorCudartUnloading, unknown = cudaErrorUnknown, not_yet_implemented = cudaErrorNotYetImplemented, memory_value_too_large = cudaErrorMemoryValueTooLarge, @@ -64,7 +77,7 @@ enum named_t : ::std::underlying_type::type { insufficient_driver = cudaErrorInsufficientDriver, set_on_active_process = cudaErrorSetOnActiveProcess, invalid_surface = cudaErrorInvalidSurface, - no_device = cudaErrorNoDevice, + no_device = cudaErrorNoDevice, // == 100 ecc_uncorrectable = cudaErrorECCUncorrectable, shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound, shared_object_init_failed = cudaErrorSharedObjectInitFailed, @@ -76,13 +89,11 @@ enum named_t : ::std::underlying_type::type { invalid_kernel_image = cudaErrorInvalidKernelImage, no_kernel_image_for_device = cudaErrorNoKernelImageForDevice, incompatible_driver_context = cudaErrorIncompatibleDriverContext, + invalid_context = CUDA_ERROR_INVALID_CONTEXT, + context_already_current = CUDA_ERROR_CONTEXT_ALREADY_CURRENT, peer_access_already_enabled = cudaErrorPeerAccessAlreadyEnabled, peer_access_not_enabled = cudaErrorPeerAccessNotEnabled, device_already_in_use = cudaErrorDeviceAlreadyInUse, - profiler_disabled = cudaErrorProfilerDisabled, - profiler_not_initialized = cudaErrorProfilerNotInitialized, - profiler_already_started = cudaErrorProfilerAlreadyStarted, - profiler_already_stopped = cudaErrorProfilerAlreadyStopped, assert = cudaErrorAssert, too_many_peers = cudaErrorTooManyPeers, host_memory_already_registered = cudaErrorHostMemoryAlreadyRegistered, @@ -126,18 +137,27 @@ constexpr inline bool is_success(status_t status) { return status == (status_t) /** * @brief Determine whether the API call returning the specified status had failed */ -constexpr inline bool is_failure(status_t status) { return status != (status_t) status::success; } +constexpr inline bool is_failure(status_t status) { return not is_success(status); } /** * Obtain a brief textual explanation for a specified kind of CUDA Runtime API status * or error code. 
*/ -inline ::std::string describe(status_t status) { return cudaGetErrorString(status); } +///@{ +inline ::std::string describe(status_t status) +{ + const char* description; + auto description_lookup_status = cuGetErrorString(status, &description); + return (description_lookup_status != CUDA_SUCCESS) ? nullptr : description; +} +inline ::std::string describe(cudaError_t status) { return cudaGetErrorString(status); } +///@} + namespace detail_ { template -::std::string as_hex(I x) +std::string as_hex(I x) { static_assert(::std::is_unsigned::value, "only signed representations are supported"); unsigned num_hex_digits = 2*sizeof(I); @@ -162,7 +182,7 @@ ::std::string as_hex(I x) template inline ::std::string ptr_as_hex(const I* ptr) { - return as_hex((size_t) ptr); + return as_hex(reinterpret_cast(ptr)); } } // namespace detail_ @@ -173,6 +193,8 @@ inline ::std::string ptr_as_hex(const I* ptr) * * A CUDA runtime error can be constructed with either just a CUDA error code * (=status code), or a code plus an additional message. + * + * @todo Consider renaming this to avoid confusion with the CUDA Runtime. */ class runtime_error : public ::std::runtime_error { public: @@ -191,11 +213,11 @@ class runtime_error : public ::std::runtime_error { runtime_error(error_code, what_arg) { } ///@endcond - runtime_error(cuda::status::named_t error_code) : + explicit runtime_error(status::named_t error_code) : runtime_error(static_cast(error_code)) { } - runtime_error(cuda::status::named_t error_code, const ::std::string& what_arg) : + runtime_error(status::named_t error_code, const ::std::string& what_arg) : runtime_error(static_cast(error_code), what_arg) { } - runtime_error(cuda::status::named_t error_code, ::std::string&& what_arg) : + runtime_error(status::named_t error_code, ::std::string&& what_arg) : runtime_error(static_cast(error_code), what_arg) { } /** @@ -207,7 +229,7 @@ class runtime_error : public ::std::runtime_error { status_t code_; }; -// TODO: The following could use ::std::optiomal arguments - which would +// TODO: The following could use ::std::optional arguments - which would // prevent the need for dual versions of the functions - but we're // not writing C++17 here @@ -223,11 +245,21 @@ inline void throw_if_error(status_t status, const ::std::string& message) noexce if (is_failure(status)) { throw runtime_error(status, message); } } +inline void throw_if_error(cudaError_t status, const ::std::string& message) noexcept(false) +{ + throw_if_error(static_cast(status), message); +} + inline void throw_if_error(status_t status, ::std::string&& message) noexcept(false) { if (is_failure(status)) { throw runtime_error(status, message); } } +inline void throw_if_error(cudaError_t status, ::std::string&& message) noexcept(false) +{ + return throw_if_error(static_cast(status), message); +} + /** * Does nothing - unless the status indicates an error, in which case * a @ref cuda::runtime_error exception is thrown @@ -239,44 +271,73 @@ inline void throw_if_error(status_t status) noexcept(false) if (is_failure(status)) { throw runtime_error(status); } } +inline void throw_if_error(cudaError_t status) noexcept(false) +{ + throw_if_error(static_cast(status)); +} + enum : bool { dont_clear_errors = false, do_clear_errors = true }; -namespace outstanding_error { +namespace detail_ { + +namespace outstanding_runtime_error { /** - * Reset the CUDA status to @ref cuda::status::success. + * Clears the current CUDA context's status and return any outstanding error. 
+ * + * @todo Reconsider what this does w.r.t. driver calls */ -inline status_t clear() noexcept { return cudaGetLastError(); } +inline status_t clear() noexcept +{ + return static_cast(cudaGetLastError()); +} /** * Get the code of the last error in a CUDA-related action. + * + * @todo Reconsider what this does w.r.t. driver calls + */ +inline status_t get() noexcept +{ + return static_cast(cudaPeekAtLastError()); +} + +} // namespace outstanding_runtime_error +} // namespace detail_ + +/** + * Unlike the Runtime API, where every error is outstanding + * until cleared, the Driver API, which we use mostly, only + * remembers "sticky" errors - severe errors which corrupt + * contexts. Such errors cannot be recovered from / cleared, + * and require either context destruction or process termination. + */ +namespace outstanding_error { + +/** + * @return the code of a sticky (= context-corrupting) error, + * if the CUDA driver has recently encountered any. */ -inline status_t get() noexcept { return cudaPeekAtLastError(); } +inline status_t get() +{ + constexpr const unsigned dummy_flags{0}; + auto status = cuInit(dummy_flags); + return static_cast(status); +} /** * @brief Does nothing (unless throwing an exception) * - * @note similar to @ref cuda::throw_if_error, but uses the CUDA Runtime API's internal - * state - * - * @throws cuda::runtime_error if the CUDA runtime API has - * encountered previously encountered an (uncleared) error - * - * @param message Additional message to incldue in the exception thrown - * @param clear_any_error When true, clears the CUDA Runtime API's state from - * recalling errors arising from before this moment - * - * + * @note similar to @ref cuda::throw_if_error, but uses the CUDA driver's + * own state regarding whether or not a sticky error has occurred */ -inline void ensure_none( - ::std::string message, - bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none(const ::std::string &message) noexcept(false) { - auto last_status = clear_any_error ? clear() : get(); - throw_if_error(last_status, message); + auto status = get(); + throw_if_error(status, message); } /** @@ -286,11 +347,9 @@ inline void ensure_none( * @note exists so as to avoid incorrect overload resolution of * `ensure_none(my_c_string)` calls. */ -inline void ensure_none( - const char* message, - bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none(const char *message) noexcept(false) { - return ensure_none(::std::string(message), clear_any_error); + return ensure_none(::std::string{message}); } /** @@ -305,56 +364,156 @@ inline void ensure_none( * @param clear_any_error When true, clears the CUDA Runtime API's state from * recalling errors arising from before this oment */ -inline void ensure_none(bool clear_any_error = do_clear_errors) noexcept(false) +inline void ensure_none() noexcept(false) { - auto last_status = clear_any_error ? clear() : get(); - throw_if_error(last_status); + auto status = get(); + throw_if_error(status); } } // namespace outstanding_error +// The following few functions are used in the error messages +// generated for exceptions thrown by various API wrappers. 
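// Illustrative sketch (editorial addition, not from the patched sources): typical use of
// the facilities above - wrapping a raw driver call with throw_if_error, and checking for
// sticky (context-corrupting) errors at a recovery point.
#include <cuda/api.hpp>
#include <cstddef>
#include <string>

void error_handling_example(CUdeviceptr destination, const void* source, ::std::size_t num_bytes)
{
	auto status = cuMemcpyHtoD(destination, source, num_bytes);
	cuda::throw_if_error(status, "Copying " + ::std::to_string(num_bytes) + " bytes to device memory");

	// Elsewhere, e.g. after catching and handling an exception:
	cuda::outstanding_error::ensure_none("Cannot proceed with a corrupted context");
}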
+ namespace device { namespace detail_ { +inline ::std::string identify(device::id_t device_id) +{ + return ::std::string("device ") + ::std::to_string(device_id); +} +} // namespace detail_ +} // namespace device -inline ::std::string identify(id_t id) +namespace context { +namespace detail_ { + +inline ::std::string identify(handle_t handle) { - return ::std::string("device ") + std::to_string(id); + return "context " + cuda::detail_::ptr_as_hex(handle); } -} // namespace detail -} // namespace device +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} -namespace event { +} // namespace detail_ + +namespace current { namespace detail_ { +inline ::std::string identify(context::handle_t handle) +{ + return "current context: " + context::detail_::identify(handle); +} +inline ::std::string identify(context::handle_t handle, device::id_t device_id) +{ + return "current context: " + context::detail_::identify(handle, device_id); +} +} // namespace detail_ +} // namespace current + +} // namespace context -inline ::std::string identify(event::handle_t handle) +namespace device { +namespace primary_context { +namespace detail_ { + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return "context " + context::detail_::identify(handle, device_id); +} +inline ::std::string identify(handle_t handle) { - return ::std::string("event ") + cuda::detail_::ptr_as_hex(handle); + return "context " + context::detail_::identify(handle); } +} // namespace detail_ +} // namespace primary_context +} // namespace device +namespace stream { +namespace detail_ { +inline ::std::string identify(handle_t handle) +{ + return "stream " + cuda::detail_::ptr_as_hex(handle); +} inline ::std::string identify(handle_t handle, device::id_t device_id) { return identify(handle) + " on " + device::detail_::identify(device_id); } +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} +} // namespace detail_ +} // namespace stream +namespace event { +namespace detail_ { +inline ::std::string identify(handle_t handle) +{ + return "event " + cuda::detail_::ptr_as_hex(handle); +} +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} } // namespace detail_ } // namespace event -namespace stream { +namespace kernel { namespace detail_ { -inline ::std::string identify(stream::handle_t handle) +inline ::std::string identify(const void* ptr) +{ + return "kernel " + cuda::detail_::ptr_as_hex(ptr); +} +inline ::std::string identify(const void* ptr, context::handle_t context_handle) +{ + return identify(ptr) + " in " + context::detail_::identify(context_handle); +} +inline ::std::string identify(const void* ptr, context::handle_t
context_handle, device::id_t device_id) +{ + return identify(ptr) + " in " + context::detail_::identify(context_handle, device_id); +} +inline ::std::string identify(handle_t handle) +{ + return "kernel " + cuda::detail_::ptr_as_hex(handle); +} +inline ::std::string identify(handle_t handle, context::handle_t context_handle) { - return ::std::string("stream ") + cuda::detail_::ptr_as_hex(handle); + return identify(handle) + " in " + context::detail_::identify(context_handle); } +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} + +} // namespace detail +} // namespace kernel -inline ::std::string identify(stream::handle_t handle, device::id_t device_id) +namespace memory { +namespace detail_ { + +inline ::std::string identify(region_t region) { - return identify(handle) + " on " + device::detail_::identify(device_id); + return ::std::string("memory region at ") + cuda::detail_::ptr_as_hex(region.data()) + + " of size " + ::std::to_string(region.size()); } } // namespace detail_ -} // namespace stream +} // namespace memory } // namespace cuda diff --git a/src/cuda/api/event.hpp b/src/cuda/api/event.hpp index 1cd04247..8fa51048 100644 --- a/src/cuda/api/event.hpp +++ b/src/cuda/api/event.hpp @@ -9,15 +9,15 @@ #ifndef CUDA_API_WRAPPERS_EVENT_HPP_ #define CUDA_API_WRAPPERS_EVENT_HPP_ -#include -#include -#include -#include -#include +#include #include #include // for duration types +#include +#include +#include +#include namespace cuda { @@ -38,18 +38,18 @@ namespace detail_ { * @param event_handle Event to be made to occur on stream @ref stream_handle */ inline void enqueue(stream::handle_t stream_handle, handle_t event_handle) { - auto status = cudaEventRecord(event_handle, stream_handle); + auto status = cuEventRecord(event_handle, stream_handle); cuda::throw_if_error(status, - "Failed recording event " + event::detail_::identify(event_handle) + "Failed recording " + event::detail_::identify(event_handle) + " on " + stream::detail_::identify(stream_handle)); } constexpr unsigned inline make_flags(bool uses_blocking_sync, bool records_timing, bool interprocess) { return - ( uses_blocking_sync ? cudaEventBlockingSync : 0 ) - | ( records_timing ? 0 : cudaEventDisableTiming ) - | ( interprocess ? cudaEventInterprocess : 0 ); + ( uses_blocking_sync ? CU_EVENT_BLOCKING_SYNC : 0 ) + | ( records_timing ? 0 : CU_EVENT_DISABLE_TIMING ) + | ( interprocess ? CU_EVENT_INTERPROCESS : 0 ); } } // namespace detail_ @@ -63,11 +63,12 @@ class event_t; namespace event { namespace detail_ { + /** * @brief Wrap an existing CUDA event in a @ref event_t instance. * - * @param device_id ID of the device for which the stream is defined - * @param event_handle handle for the pre-existing event + * @param context_handle Handle of the context in which this event was created + * @param event_handle handle of the pre-existing event * @param take_ownership When set to `false`, the CUDA event * will not be destroyed along with proxy; use this setting * when temporarily working with a stream existing irrespective of @@ -77,9 +78,13 @@ namespace detail_ { * @return The constructed `cuda::event_t`. 
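// For orientation (editor's sketch, not part of the patch): the raw driver-API sequence which
// event_t and the wrap()/create_in_current_context() helpers in this file encapsulate. Error
// checking and context handling are elided, and stream_handle is a placeholder for an existing
// CUstream:
CUevent start, stop;
cuEventCreate(&start, CU_EVENT_DEFAULT);
cuEventCreate(&stop, CU_EVENT_DEFAULT);          // timing is enabled unless CU_EVENT_DISABLE_TIMING is passed
cuEventRecord(start, stream_handle);             // fires once previously-enqueued work on the stream completes
// ... enqueue the work to be timed on stream_handle ...
cuEventRecord(stop, stream_handle);
cuEventSynchronize(stop);                        // block the calling thread until `stop` has occurred
float elapsed_milliseconds = 0.0f;
cuEventElapsedTime(&elapsed_milliseconds, start, stop);  // what time_elapsed_between() wraps further down
cuEventDestroy(start);
cuEventDestroy(stop);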
*/ event_t wrap( - device::id_t device_id, - handle_t event_handle, - bool take_ownership = false) noexcept; + device::id_t device_id, + context::handle_t context_handle, + handle_t event_handle, + bool take_ownership = false) noexcept; + +::std::string identify(const event_t& event); + } // namespace detail_ @@ -88,7 +93,7 @@ event_t wrap( inline void synchronize(const event_t& event); /** - * @brief Proxy class for a CUDA event + * @brief Wrapper class for a CUDA event * * Use this class - built around an event handle - to perform almost, if not all, * event-related operations the CUDA Runtime API is capable of. @@ -100,31 +105,31 @@ inline void synchronize(const event_t& event); * * @note this is one of the three main classes in the Runtime API wrapper library, * together with @ref cuda::device_t and @ref cuda::stream_t + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the event is a const-respecting operation on this class. */ class event_t { + public: // data member non-mutator getters - /** - * The CUDA handle this object is wrapping - */ - event::handle_t handle() const noexcept{ return handle_; } + /// The raw CUDA ID for the device w.r.t. which the event is defined + device::id_t device_id() const noexcept { return device_id_; }; - /** - * ID of the device with which this event is associated (and on whose - * streams this event can be enqueued) - */ - device::id_t device_id() const noexcept { return device_id_; } + /// The raw CUDA handle for the context in which the represented stream is defined. + context::handle_t context_handle() const noexcept { return context_handle_; } + + /// The raw CUDA handle for this event + event::handle_t handle() const noexcept { return handle_; } + + /// True if this wrapper is responsible for telling CUDA to destroy the event upon the wrapper's own destruction + bool is_owning() const noexcept { return owning; } + + /// The device w.r.t. which the event is defined + device_t device() const; + + /// The context in which this stream was defined. + context_t context() const; - /** - * The device with which this event is associated (i.e. on whose stream - * this event can be enqueued) - */ - device_t device() const noexcept; - /** - * Is this wrapper responsible for having the CUDA Runtime API destroy - * the event when it destructs? - */ - bool is_owning() const noexcept { return owning; } public: // other non-mutator methods @@ -141,11 +146,11 @@ class event_t { */ bool has_occurred() const { - auto status = cudaEventQuery(handle_); + auto status = cuEventQuery(handle_); if (status == cuda::status::success) return true; if (status == cuda::status::not_ready) return false; throw cuda::runtime_error(status, - "Could not determine whether " + event::detail_::identify(handle_, device_id_) + "Could not determine whether " + event::detail_::identify(handle_) + "has already occurred or not."); } @@ -158,14 +163,13 @@ class event_t { public: // other mutator methods - /** * Schedule a specified event to occur (= to fire) when all activities * already scheduled on the event's device's default stream have concluded. * * @note No protection against repeated calls. */ - void record() + void record() const { event::detail_::enqueue(stream::default_stream_handle, handle_); } @@ -176,7 +180,7 @@ class event_t { * * @note No protection against repeated calls. 
*/ - void record(const stream_t& stream); + void record(const stream_t& stream) const; /** * Records the event and ensures it has occurred before returning @@ -184,39 +188,43 @@ class event_t { * * @note No protection against repeated calls. */ - void fire(const stream_t& stream); + void fire(const stream_t& stream) const; /** * Have the calling thread wait - either busy-waiting or blocking - and * return only after this event has occurred (see @ref has_occurred() ). */ - void synchronize() + void synchronize() const { return cuda::synchronize(*this); } protected: // constructors - event_t(device::id_t device_id, event::handle_t event_handle, bool take_ownership) noexcept - : device_id_(device_id), handle_(event_handle), owning(take_ownership) { } + event_t(device::id_t device_id, context::handle_t context_handle, event::handle_t event_handle, bool take_ownership) noexcept + : device_id_(device_id), context_handle_(context_handle), handle_(event_handle), owning(take_ownership) { } public: // friendship - friend event_t event::detail_::wrap(device::id_t device_id, event::handle_t event_handle, bool take_ownership) noexcept; + friend event_t event::detail_::wrap(device::id_t, context::handle_t context_handle, event::handle_t event_handle, bool take_ownership) noexcept; public: // constructors and destructor - event_t(const event_t& other) noexcept : event_t(other.device_id_, other.handle_, false) { } + event_t(const event_t& other) noexcept : event_t(other.device_id_, other.context_handle_, other.handle_, false) { } event_t(event_t&& other) noexcept : - event_t(other.device_id_, other.handle_, other.owning) + event_t(other.device_id_, other.context_handle_, other.handle_, other.owning) { other.owning = false; }; ~event_t() { - if (owning) { cudaEventDestroy(handle_); } + if (owning) { + cuEventDestroy(handle_); + // Note: "Swallowing" any potential error to avoid std::terminate(); also, + // because the context cannot possibly exist after this call. 
+ } } public: // operators @@ -225,9 +233,10 @@ class event_t { event_t& operator=(event_t&& other) = delete; protected: // data members - const device::id_t device_id_; - const event::handle_t handle_; - bool owning; + const device::id_t device_id_; + const context::handle_t context_handle_; + const event::handle_t handle_; + bool owning; // this field is mutable only for enabling move construction; other // than in that case it must not be altered }; @@ -253,7 +262,7 @@ using duration_t = ::std::chrono::duration; inline duration_t time_elapsed_between(const event_t& start, const event_t& end) { float elapsed_milliseconds; - auto status = cudaEventElapsedTime(&elapsed_milliseconds, start.handle(), end.handle()); + auto status = cuEventElapsedTime(&elapsed_milliseconds, start.handle(), end.handle()); cuda::throw_if_error(status, "determining the time elapsed between events"); return duration_t { elapsed_milliseconds }; } @@ -276,29 +285,37 @@ namespace detail_ { * @return an event wrapper associated with the specified event */ inline event_t wrap( - device::id_t device_id, - handle_t event_handle, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + handle_t event_handle, + bool take_ownership) noexcept +{ + return { device_id, context_handle, event_handle, take_ownership }; +} + +inline ::std::string identify(const event_t& event) { - return event_t(device_id, event_handle, take_ownership); + return identify(event.handle(), event.context_handle(), event.device_id()); } // Note: For now, event_t's need their device's ID - even if it's the current device; // that explains the requirement in this function's interface -inline event_t create_on_current_device( - device::id_t current_device_id, - bool uses_blocking_sync, - bool records_timing, - bool interprocess) +inline event_t create_in_current_context( + device::id_t current_device_id, + context::handle_t current_context_handle, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) { auto flags = make_flags(uses_blocking_sync, records_timing, interprocess); cuda::event::handle_t new_event_handle; - auto status = cudaEventCreateWithFlags(&new_event_handle, flags); + auto status = cuEventCreate(&new_event_handle, flags); cuda::throw_if_error(status, "failed creating a CUDA event associated with the current device"); // Note: We're trusting CUDA to actually have succeeded if it reports success, // so we're not checking the newly-created event handle - which is really just // a pointer - for nullness - return wrap(current_device_id, new_event_handle, do_take_ownership); + bool take_ownership = true; + return wrap(current_device_id, current_context_handle, new_event_handle, take_ownership); } /** @@ -306,14 +323,14 @@ inline event_t create_on_current_device( */ inline event_t create( - device::id_t device_id, - bool uses_blocking_sync, - bool records_timing, - bool interprocess) + device::id_t device_id, + context::handle_t context_handle, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) { - device::current::detail_::scoped_override_t - set_device_for_this_scope(device_id); - return detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return detail_::create_in_current_context(device_id, context_handle, uses_blocking_sync, records_timing, interprocess); } } // namespace detail_ @@ -330,10 +347,10 @@ inline event_t create( * @note 
Creating an event */ inline event_t create( - device_t device, - bool uses_blocking_sync = sync_by_busy_waiting, // Yes, that's the runtime default - bool records_timing = do_record_timings, - bool interprocess = not_interprocess); + device_t& device, + bool uses_blocking_sync = sync_by_busy_waiting, // Yes, that's the runtime default + bool records_timing = do_record_timings, + bool interprocess = not_interprocess); } // namespace event @@ -342,18 +359,18 @@ inline event_t create( * to the calling code. * * @todo Determine how this waiting takes place (as opposed to stream - * synchrnoization). + * synchronization). * * @param event the event for whose occurrence to wait; must be scheduled * to occur on some stream (possibly the different stream) */ inline void synchronize(const event_t& event) { - auto device_id = event.device_id(); + auto context_handle = event.context_handle(); auto event_handle = event.handle(); - device::current::detail_::scoped_override_t device_for_this_scope(device_id); - auto status = cudaEventSynchronize(event_handle); - throw_if_error(status, "Failed synchronizing " + event::detail_::identify(event_handle, device_id)); + context::current::detail_::scoped_override_t context_for_this_scope(context_handle); + auto status = cuEventSynchronize(event_handle); + throw_if_error(status, "Failed synchronizing " + event::detail_::identify(event)); } } // namespace cuda diff --git a/src/cuda/api/ipc.hpp b/src/cuda/api/ipc.hpp index c5d4a60b..6cb9b50a 100644 --- a/src/cuda/api/ipc.hpp +++ b/src/cuda/api/ipc.hpp @@ -2,7 +2,8 @@ * @file ipc.hpp * * @brief wrappers for CUDA's facilities for sharing on-device - * memory addresses and CUDA events between host processes + * memory addresses and CUDA events between host processes (Inter- + * Process Communication) * * CUDA addresses into device memory are not valid across different * host processes - somewhat, but not entirely, similarly to the @@ -23,9 +24,8 @@ #ifndef CUDA_API_WRAPPERS_IPC_HPP_ #define CUDA_API_WRAPPERS_IPC_HPP_ -#include +#include #include -#include #include @@ -43,7 +43,7 @@ namespace ipc { * The concrete value passed between processes, used to tell * the CUDA Runtime API which memory area is desired. */ -using handle_t = cudaIpcMemHandle_t; +using handle_t = CUipcMemHandle; /** * Obtain a handle for a region of on-device memory which can @@ -59,9 +59,9 @@ using handle_t = cudaIpcMemHandle_t; */ inline handle_t export_(void* device_ptr) { handle_t handle; - auto status = cudaIpcGetMemHandle(&handle, device_ptr); - cuda::throw_if_error(status, - "Failed producing an IPC memory handle for device pointer " + cuda::detail_::ptr_as_hex(device_ptr)); + auto status = cuIpcGetMemHandle(&handle, device::address(device_ptr)); + cuda::throw_if_error(status, "Failed producing an IPC memory handle for device pointer " + + cuda::detail_::ptr_as_hex(device_ptr)); return handle; } @@ -69,7 +69,7 @@ inline handle_t export_(void* device_ptr) { * @brief Obtain a CUDA pointer from a handle passed * by inter-process communication * - * @note the couterpart of @ref memory::ipc::unmap. + * @note the counterpart of @ref memory::ipc::unmap. 
* * @param handle the handle which allows us access to the on-device address * @return a pointer to the relevant address (which may not have the same value @@ -78,10 +78,9 @@ inline handle_t export_(void* device_ptr) { template inline T* import(const handle_t& handle) { - void* device_ptr; - auto status = cudaIpcOpenMemHandle(&device_ptr, handle, cudaIpcMemLazyEnablePeerAccess); - cuda::throw_if_error(status, - "Failed obtaining a device pointer from an IPC memory handle"); + CUdeviceptr device_ptr; + auto status = cuIpcOpenMemHandle(&device_ptr, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + cuda::throw_if_error(status, "Failed obtaining a device pointer from an IPC memory handle"); return reinterpret_cast(device_ptr); } @@ -92,10 +91,8 @@ inline T* import(const handle_t& handle) */ inline void unmap(void* ipc_mapped_ptr) { - auto status = cudaIpcCloseMemHandle(ipc_mapped_ptr); - cuda::throw_if_error(status, - "Failed unmapping IPC memory mapped to " + - cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); + auto status = cuIpcCloseMemHandle(device::address(ipc_mapped_ptr)); + cuda::throw_if_error(status, "Failed unmapping IPC memory mapped to " + cuda::detail_::ptr_as_hex(ipc_mapped_ptr)); } /** @@ -109,7 +106,7 @@ inline void unmap(void* ipc_mapped_ptr) template class imported_t { public: // constructors & destructor - imported_t(const handle_t& handle) : ptr_(import(handle)) + explicit imported_t(const handle_t& handle) : ptr_(import(handle)) { if (ptr_ == nullptr) { throw ::std::logic_error("IPC memory handle yielded a null pointer"); @@ -156,25 +153,24 @@ namespace ipc { * The concrete value passed between processes, used to tell * the CUDA Runtime API which event is desired. */ -using handle_t = cudaIpcEventHandle_t; +using handle_t = CUipcEventHandle; namespace detail_ { inline handle_t export_(event::handle_t event_handle) { handle_t ipc_handle; - auto status = cudaIpcGetEventHandle(&ipc_handle, event_handle); - cuda::throw_if_error(status, - "Failed obtaining an IPC event handle for " + event::detail_::identify(event_handle)); + auto status = cuIpcGetEventHandle(&ipc_handle, event_handle); + cuda::throw_if_error(status, "Failed obtaining an IPC event handle for " + + event::detail_::identify(event_handle)); return ipc_handle; } inline event::handle_t import(const handle_t& handle) { event::handle_t event_handle; - auto status = cudaIpcOpenEventHandle(&event_handle, handle); - cuda::throw_if_error(status, - "Failed obtaining an event handle from an IPC event handle"); + auto status = cuIpcOpenEventHandle(&event_handle, handle); + cuda::throw_if_error(status, "Failed obtaining an event handle from an IPC event handle"); return event_handle; } @@ -188,7 +184,7 @@ inline event::handle_t import(const handle_t& handle) * may obtain a proper CUDA event * */ -inline handle_t export_(event_t& event); +inline handle_t export_(const event_t& event); /** * Obtain a proper CUDA event, corresponding to an event created by another @@ -198,10 +194,19 @@ inline handle_t export_(event_t& event); * from an event handle (or otherwise - have a handle provide both an event handle and * a device ID), but that is not currently the case. 
 * - * @param device the device to which the imported event corresponds - * @param handle the handle obtained via inter-process communications + * @param event_ipc_handle the handle obtained via inter-process communications */ -inline event_t import(device_t& device, const handle_t& handle); +///@{ + /** + * @param device the device with which the imported event is associated + */ +inline event_t import(const device_t& device, const handle_t& event_ipc_handle); + +/** + * @param context the device-context with which the imported event is associated + */ +inline event_t import(const context_t& context, const handle_t& event_ipc_handle); +///@} } // namespace ipc } // namespace event diff --git a/src/cuda/api/kernel.hpp b/src/cuda/api/kernel.hpp index c24fea32..9efb79bb 100644 --- a/src/cuda/api/kernel.hpp +++ b/src/cuda/api/kernel.hpp @@ -1,110 +1,148 @@ /** * @file kernel.hpp * - * @brief Functions for querying information and making settings - * regarding CUDA kernels (`__global__` functions). + * @brief Contains a base wrapper class for CUDA kernels - both statically and + * dynamically compiled; and some related functionality. * - * @note This file does _not_ define any kernels itself. + * @note This file does _not_ define any kernels itself. */ #pragma once #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ #define CUDA_API_WRAPPERS_KERNEL_HPP_ -#include -#include +#include #include -#include +#include +// #include #include +#include namespace cuda { ///@cond class device_t; -class stream_t; class kernel_t; -///@endcond +///@endcond namespace kernel { namespace detail_ { -inline kernel_t wrap(device::id_t device_id, const void* ptr); +kernel_t wrap(device::id_t device_id, context::handle_t context_id, kernel::handle_t f); -} // namespace detail - -/** - * @brief a wrapper around `cudaFuncAttributes`, offering - * a few convenience member functions.
- */ -struct attributes_t : cudaFuncAttributes { +#ifndef NDEBUG +static const char* attribute_name(int attribute_index) +{ + // Note: These correspond to the values of enum CUfunction_attribute_enum + static const char* names[] = { + "Maximum number of threads per block", + "Statically-allocated shared memory size in bytes", + "Required constant memory size in bytes", + "Required local memory size in bytes", + "Number of registers used by each thread", + "PTX virtual architecture version into which the kernel code was compiled", + "Binary architecture version for which the function was compiled", + "Indication whether the function was compiled with cache mode CA", + "Maximum allowed size of dynamically-allocated shared memory use size bytes", + "Preferred shared memory carve-out to actual shared memory" + }; + return names[attribute_index]; +} +#endif - cuda::device::compute_capability_t ptx_version() const noexcept { - return device::compute_capability_t::from_combined_number(ptxVersion); - } +inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute) +{ + kernel::attribute_value_t attribute_value; + auto result = cuFuncGetAttribute(&attribute_value, attribute, handle); + throw_if_error(result, + ::std::string("Failed obtaining attribute ") + +#ifdef NDEBUG + ::std::to_string(static_cast<::std::underlying_type::type>(attribute)) +#else + attribute_name(attribute) +#endif + ); + return attribute_value; +} - cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept { - return device::compute_capability_t::from_combined_number(binaryVersion); - } -}; +} // namespace detail_ } // namespace kernel /** - * A non-owning wrapper class for CUDA `__global__` functions + * A non-owning wrapper for CUDA kernels - whether they be `__global__` functions compiled + * apriori, or the result of dynamic NVRTC compilation, or obtained in some other future + * way. + * + * @note The association of a `kernel_t` with an individual device or context is somewhat + * tenuous. That is, the same function could be used with any other compatible device; + * However, many/most of the features, attributes and settings are context-specific + * or device-specific. + * + * @note NVRTC-compiled kernels can only use this class, with apriori-compiled + * kernels can use their own subclass. * - * @note The association of a `kernel_t` with an individual device is somewhat tenuous. - * That is, the same function pointer could be used with any other device (provided the kernel - * was compiled appropriately). However, many/most of the features, attributes and settings - * are device-specific. + * @todo Consider holding a module handle (possibly null/0/invalid), and a boolean + * saying whether this kernel wrapper holds it. This would allow passing kernel_t's + * without accompanying module_t's. 
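// Editor's sketch (not part of the patch): how a kernel::handle_t (a CUfunction) typically comes
// into being with the raw driver API, before being adopted by kernel::detail_::wrap(). The module
// file name "my_kernels.cubin" and the kernel name "scale_by" are placeholders, and a current
// context is assumed to exist:
CUmodule module_handle;
cuModuleLoad(&module_handle, "my_kernels.cubin");     // or cuModuleLoadData() for an in-memory image
CUfunction function_handle;
cuModuleGetFunction(&function_handle, module_handle, "scale_by");
// A non-owning kernel_t can then be built over function_handle, e.g. via
// kernel::detail_::wrap(device_id, context_handle, function_handle).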
*/ class kernel_t { + public: // getters - const void* ptr() const noexcept { return ptr_; } + context_t context() const noexcept; device_t device() const noexcept; -protected: - device::id_t device_id() const noexcept { return device_id_; } - -public: // type_conversions - operator const void*() noexcept { return ptr_; } + device::id_t device_id() const noexcept { return device_id_; } + context::handle_t context_handle() const noexcept { return context_handle_; } + kernel::handle_t handle() const noexcept { return handle_; } public: // non-mutators - inline kernel::attributes_t attributes() const; + kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const + { + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + return kernel::detail_::get_attribute_in_current_context(handle(), attribute); + } -/* - // The following are commented out because there are no CUDA API calls for them! - // You may uncomment them if you'd rather get an exception... + cuda::device::compute_capability_t ptx_version() const noexcept { + auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION); + return device::compute_capability_t::from_combined_number(raw_attribute); + } - multiprocessor_cache_preference_t cache_preference() const; - multiprocessor_shared_memory_bank_size_option_t shared_memory_bank_size() const; -*/ + cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept { + auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION); + return device::compute_capability_t::from_combined_number(raw_attribute); + } /** - * @brief Calculates the number of grid blocks which may be "active" on a given GPU - * multiprocessor simultaneously (i.e. with warps from any of these block - * being schedulable concurrently) + * @return the maximum number of threads per block for which the GPU device can satisfy + * this kernel's hardware requirement - typically, the number of registers in use. * - * @param num_threads_per_block - * @param dynamic_shared_memory_per_block - * @param disable_caching_override On some GPUs, the choice of whether to - * cache memory reads affects occupancy. But what if this caching results in 0 - * potential occupancy for a kernel? There are two options, controlled by this flag. - * When it is set to false - the calculator will assume caching is off for the - * purposes of its work; when set to true, it will return 0 for such device functions. - * See also the "Unified L1/Texture Cache" section of the - * Maxwell - * tuning guide. + * @note the kernel may have other constraints, requiring a different number of threads + * per block; these cannot be determined using this method. 
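// Editor's sketch (not part of the patch): what get_attribute() and maximum_threads_per_block()
// below reduce to at the driver level, assuming function_handle is a valid CUfunction in the
// current context:
int max_threads_per_block = 0;
cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function_handle);
int num_registers_per_thread = 0;
cuFuncGetAttribute(&num_registers_per_thread, CU_FUNC_ATTRIBUTE_NUM_REGS, function_handle);
// max_threads_per_block reflects the limit implied by the kernel's resource usage (typically its
// register count) - not any algorithmic constraint the kernel's author may have in mind.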
*/ - grid::dimension_t maximum_active_blocks_per_multiprocessor( - grid::block_dimension_t num_threads_per_block, - memory::shared::size_t dynamic_shared_memory_per_block, - bool disable_caching_override = false); + grid::block_dimension_t maximum_threads_per_block() const + { + return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); + } -public: // mutators + grid::complete_dimensions_t min_grid_params_for_max_occupancy( + memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) const; - void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value); + using shared_memory_size_determiner_t = size_t (*)(int block_size); + + grid::complete_dimensions_t min_grid_params_for_max_occupancy( + shared_memory_size_determiner_t shared_memory_size_determiner, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) const; + + +public: // methods mutating the kernel-in-context, but not this reference object + + void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const; /** * @brief Change the hardware resource carve-out between L1 cache and shared memory @@ -117,54 +155,16 @@ class kernel_t { * also be set on the individual device-function level, by specifying the amount of shared * memory the kernel may require. */ - void opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel); - - /** - * - * @param dynamic_shared_memory_size The amount of dynamic shared memory each grid block will - * need. - * @param block_size_limit do not return a block size above this value; the default, 0, - * means no limit on the returned block size. - * @param disable_caching_override On platforms where global caching affects occupancy, - * and when enabling caching would result in zero occupancy, the occupancy calculator will - * calculate the occupancy as if caching is disabled. Setting this to true makes the - * occupancy calculator return 0 in such cases. More information can be found about this - * feature in the "Unified L1/Texture Cache" section of the - * Maxwell tuning guide. - * - * @return A pair, with the second element being the maximum achievable block size - * (1-dimensional), and the first element being the minimum number of such blocks necessary - * for keeping the GPU "busy" (again, in a 1-dimensional grid). - */ - grid::complete_dimensions_t min_grid_params_for_max_occupancy( - memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, - grid::block_dimension_t block_size_limit = 0, - bool disable_caching_override = false) const; - - template - grid::complete_dimensions_t min_grid_params_for_max_occupancy( - UnaryFunction block_size_to_dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit = 0, - bool disable_caching_override = false) const; - - /** - * @brief Indicate the desired carve-out between shared memory and L1 cache when launching - * this kernel - with fine granularity. - * - * On several nVIDIA GPU micro-architectures, the L1 cache and the shared memory in each - * symmetric multiprocessor (=physical core) use the same hardware resources. The - * carve-out between the two uses has a device-wide value (which can be changed), but the - * driver can set another value for a specific function. 
This function doesn't make a demand - * from the CUDA runtime (as in @p opt_in_to_extra_dynamic_memory), but rather indicates - * what is the fraction of L1 to shared memory it would like the kernel scheduler to carve - * out. - * - * @param shared_mem_percentage The percentage - from 0 to 100 - of the combined L1/shared - * memory space the user wishes to assign to shared memory. - * - * @note similar to @ref set_cache_preference() - but with finer granularity. - */ - void set_preferred_shared_mem_fraction(unsigned shared_mem_percentage); + void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const + { + auto amount_required_by_kernel_ = (kernel::attribute_value_t) amount_required_by_kernel; + if (amount_required_by_kernel != (cuda::memory::shared::size_t) amount_required_by_kernel_) { + throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the " + "representation range for kernel attribute values"); + } + // TODO: Consider a check in debug mode for the value being within range + set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_); + } /** * @brief Indicate the desired carve-out between shared memory and L1 cache when launching @@ -183,51 +183,241 @@ class kernel_t { * * @note similar to @ref set_preferred_shared_mem_fraction() - but with coarser granularity. */ - void set_cache_preference(multiprocessor_cache_preference_t preference); + void set_cache_preference(multiprocessor_cache_preference_t preference) + { + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetCacheConfig(handle(), (CUfunc_cache) preference); + throw_if_error(result, + "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " + "CUDA device function"); + } /** - * @brief Sets a device function's preference of shared memory bank size preference - * (for the current device probably) + * @brief Sets a device function's preference of shared memory bank size * * @param config bank size setting to make */ - void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config); - - -protected: // ctors & dtor - kernel_t(device::id_t device_id, const void* f) - : device_id_(device_id), ptr_(f) + void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) { - // TODO: Consider checking whether this actually is a device function - // TODO: Consider performing a check for nullptr + // TODO: Need to set a context, not a device + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetSharedMemConfig(handle(), static_cast(config) ); + throw_if_error(result, "Failed setting the shared memory bank size"); } +protected: // ctors & dtor + kernel_t(device::id_t device_id, context::handle_t context_handle, kernel::handle_t handle) + : device_id_(device_id), context_handle_(context_handle), handle_(handle) { } + public: // ctors & dtor - ~kernel_t() = default; + friend kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t); - friend kernel_t kernel::detail_::wrap(device::id_t, const void* ptr); + kernel_t(const kernel_t& other) = default; // Note: be careful with subclasses + kernel_t(kernel_t&& other) = default; // Note: be careful with subclasses + +public: // ctors & dtor + virtual ~kernel_t() = default; protected: // data members - const device::id_t device_id_; - const void* const ptr_; -}; + 
device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can? + context::handle_t context_handle_; + mutable kernel::handle_t handle_; +}; // kernel_t namespace kernel { namespace detail_ { -inline kernel_t wrap(device::id_t device_id, const void* function_ptr) +inline kernel_t wrap( + device::id_t device_id, + context::handle_t context_id, + kernel::handle_t f) { - return { device_id, reinterpret_cast(function_ptr) }; + return kernel_t{ device_id, context_id, f }; } } // namespace detail_ -template -kernel_t wrap(const device_t &device, KernelFunctionPtr function_ptr); +namespace occupancy { + +namespace detail_ { + +/* +// TODO: Can't we make full use of the closure here? If only we had +// some void* parameter to the b2d function... +template +cuda::size_t CUDA_CB block_size_to_dynamic_shared_mem_size_helper(int blockSize) +{ + return UnaryFunction{}(blockSize); +} + + +#if CUDART_VERSION <= 10000 + throw cuda::runtime_error {cuda::status::not_yet_implemented}; +#else + int min_grid_size_in_blocks { 0 }; + int block_size { 0 }; + // Note: only initializing the values her because of a + // spurious (?) compiler warning about potential uninitialized use. + + size_t ignored_fixed_dynamic_shared_mem_size { 0 }; + auto result = cuOccupancyMaxPotentialBlockSizeWithFlags( + &min_grid_size_in_blocks, &block_size, + kernel_handle, + block_size_to_dynamic_shared_mem_size_helper, + ignored_fixed_dynamic_shared_mem_size, + static_cast(block_size_limit), + disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT + ); + +// CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags( +// int *minGridSize, int *blockSize, CUfunction func, +// CUoccupancyB2DSize blockSizeToDynamicSMemSize, +// size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + + throw_if_error(result, + "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle) + + " on " + device::detail_::identify(device_id)); + return { min_grid_size_in_blocks, block_size }; +#endif // CUDART_VERSION <= 10000 +} +*/ + +// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored; +// if block_size_limit is 0, it is ignored. +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + CUfunction kernel_handle, + cuda::device::id_t device_id, + CUoccupancyB2DSize determine_shared_mem_by_block_size, + cuda::memory::shared::size_t fixed_shared_mem_size, + cuda::grid::block_dimension_t block_size_limit, + bool disable_caching_override) +{ +#if CUDART_VERSION <= 10000 + throw cuda::runtime_error {cuda::status::not_yet_implemented}; +#else + int min_grid_size_in_blocks { 0 }; + int block_size { 0 }; + // Note: only initializing the values her because of a + // spurious (?) compiler warning about potential uninitialized use. + + auto result = cuOccupancyMaxPotentialBlockSizeWithFlags( + &min_grid_size_in_blocks, &block_size, + kernel_handle, + determine_shared_mem_by_block_size, + fixed_shared_mem_size, + static_cast(block_size_limit), + disable_caching_override ? 
CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT + ); + + throw_if_error(result, + "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle) + + " on " + device::detail_::identify(device_id) + " with maximum occupancy given dynamic shared " + "memory and block size data"); + return { min_grid_size_in_blocks, block_size }; +#endif // CUDART_VERSION <= 10000 +} + +} // namespace detail_ + + +/** +* @brief See the Driver API documentation for @ref cuOccupancyAvailableDynamicSMemPerBlock +*/ +inline memory::shared::size_t max_dynamic_shared_memory_per_block( + const kernel_t &kernel, + grid::dimension_t blocks_on_multiprocessor, + grid::block_dimension_t block_size_in_threads) +{ + size_t result; + auto status = cuOccupancyAvailableDynamicSMemPerBlock( + &result, kernel.handle(), (int) blocks_on_multiprocessor, (int) block_size_in_threads); + throw_if_error(status, + "Determining the available dynamic memory per block, given the number of blocks on a multiprocessor and their size"); + return (memory::shared::size_t) result; +} + +/** +* @brief See the Driver API documentation for @ref cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags +*/ +inline grid::dimension_t max_blocks_per_multiprocessor( + const kernel_t &kernel, + grid::block_dimension_t block_size_in_threads, + memory::shared::size_t dynamic_shared_memory_per_block, + bool disable_caching_override = false) +{ + int result; + auto flags = (unsigned) disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT; + auto status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &result, kernel.handle(), (int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags); + throw_if_error(status, + "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dyanmic memory per block"); + return result; +} + +/** +* +* @param dynamic_shared_memory_size The amount of dynamic shared memory each grid block will +* need. +* @param block_size_limit do not return a block size above this value; the default, 0, +* means no limit on the returned block size. +* @param disable_caching_override On platforms where global caching affects occupancy, +* and when enabling caching would result in zero occupancy, the occupancy calculator will +* calculate the occupancy as if caching is disabled. Setting this to true makes the +* occupancy calculator return 0 in such cases. More information can be found about this +* feature in the "Unified L1/Texture Cache" section of the +* Maxwell tuning guide. +* +* @return A pair, with the second element being the maximum achievable block size +* (1-dimensional), and the first element being the minimum number of such blocks necessary +* for keeping the GPU "busy" (again, in a 1-dimensional grid). 
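// Editor's sketch (not part of the patch): the flag-less driver call corresponding to the
// occupancy helper above, assuming function_handle is a valid CUfunction, no dynamic shared
// memory, and no block-size limit:
int min_grid_size_in_blocks = 0;
int block_size = 0;
cuOccupancyMaxPotentialBlockSize(
	&min_grid_size_in_blocks, &block_size, function_handle,
	nullptr,   // no block-size-to-dynamic-shared-memory-size callback
	0,         // fixed dynamic shared memory size, in bytes
	0);        // no upper limit on the block size
// A grid of min_grid_size_in_blocks blocks, each of block_size threads, should be enough to
// keep the device's multiprocessors occupied with this kernel.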
+*/ +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + kernel_t kernel, + memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory, + grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) +{ + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device_id(), nullptr, + dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} + +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + kernel_t kernel, + kernel_t::shared_memory_size_determiner_t + shared_memory_size_determiner, + cuda::grid::block_dimension_t block_size_limit = 0, + bool disable_caching_override = false) +{ + size_t ignored_fixed_dynamic_shared_mem_size { 0 }; + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device_id(), shared_memory_size_determiner, + ignored_fixed_dynamic_shared_mem_size, block_size_limit, disable_caching_override); +} + +} // namespace occupancy } // namespace kernel +inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( + memory::shared::size_t dynamic_shared_memory_size, + grid::block_dimension_t block_size_limit, + bool disable_caching_override) const +{ + return kernel::occupancy::min_grid_params_for_max_occupancy( + *this, dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} + +inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( + shared_memory_size_determiner_t shared_memory_size_determiner, + cuda::grid::block_dimension_t block_size_limit, + bool disable_caching_override) const +{ + return kernel::occupancy::min_grid_params_for_max_occupancy( + *this, shared_memory_size_determiner, block_size_limit, disable_caching_override); +} + } // namespace cuda #endif // CUDA_API_WRAPPERS_KERNEL_HPP_ diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index 68eda773..162659e3 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -17,7 +17,7 @@ * and sticking to proper C++; in other words, the wrappers are "ugly" * instead of client code having to be. *
  • Avoiding some of the "parameter soup" of launching a kernel: It's - * rather easy to mix up shared memory sizes with stream IDs; grid and + * not so difficult to mix up shared memory sizes with stream handles; grid and * block dimensions with each other; and even grid/block dimensions with * the scalar parameters - since a `dim3` is constructible from * integral values. Instead, we enforce a launch configuration structure: @@ -42,9 +42,8 @@ #ifndef CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ #define CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ -#include #include -#include +#include #if (__CUDACC_VER_MAJOR__ >= 9) #include @@ -73,6 +72,38 @@ constexpr grid::block_dimensions_t single_thread_per_block() { return 1; } namespace detail_ { +template struct bool_pack; + +template +using all_true = ::std::is_same, bool_pack>; + +/** + * @brief adapt a type to be usable as a kernel parameter. + * + * CUDA kernels don't accept just any parameter type a C++ function may accept. + * Specifically: No references, arrays decay (IIANM) and functions pass by address. + * However - not all "decaying" of `::std::decay` is necessary. Such transformation + * can be effected by this type-trait struct. + */ +template +struct kernel_parameter_decay { +private: + typedef typename ::std::remove_reference

    ::type U; +public: + typedef typename ::std::conditional< + ::std::is_array::value, + typename ::std::remove_extent::type*, + typename ::std::conditional< + ::std::is_function::value, + typename ::std::add_pointer::type, + U + >::type + >::type type; +}; + +template +using kernel_parameter_decay_t = typename kernel_parameter_decay

    ::type; + template struct is_function_ptr: ::std::integral_constant::value and ::std::is_function::type>::value> { }; @@ -86,25 +117,46 @@ inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Ar collect_argument_addresses(collected_addresses + 1, ::std::forward(args)...); } -// Note: Unlike the non-detail_ functions - this one -// cannot handle type-erased kernel_t's. -template -inline void enqueue_launch( - RawKernel kernel_function, - stream::handle_t stream_handle, - launch_configuration_t launch_configuration, - KernelParameters&&... parameters) +// For partial template specialization on WrappedKernel... +template +struct enqueue_launch_helper { + void operator()( + Kernel kernel_function, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters); +}; + +template +void enqueue_launch( + ::std::integral_constant, + Kernel kernel_function, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters); + +template +void enqueue_launch( + ::std::integral_constant, + Kernel kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters); + +template +void enqueue_raw_kernel_launch( + KernelFunction kernel_function, + stream::handle_t stream_handle, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) #ifndef __CUDACC__ // If we're not in CUDA's NVCC, this can't run properly anyway, so either we throw some // compilation error, or we just do nothing. For now it's option 2. - ; +; #else { - static_assert(::std::is_function::value or - (is_function_ptr::value), - "Only a bona fide function can be a CUDA kernel and be launched; " - "you were attempting to enqueue a launch of something other than a function"); - + static_assert(::std::is_function::value or (is_function_ptr::value), + "Only a bona fide function can be launched as a CUDA kernel"); if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) { // regular plain vanilla launch kernel_function <<< @@ -112,11 +164,11 @@ inline void enqueue_launch( launch_configuration.dimensions.block, launch_configuration.dynamic_shared_memory_size, stream_handle - >>>(::std::forward(parameters)...); + >>>(::std::forward(parameters)...); cuda::outstanding_error::ensure_none("Kernel launch failed"); } else { -#if __CUDACC_VER_MAJOR__ >= 9 +#if __CUDACC_VER_MAJOR__ >= 9 // Cooperative launches cannot be made using the triple-chevron syntax, // nor is there a variadic-template of the launch API call, so we need to // a bit of useless work here. We could have done exactly the same thing @@ -133,13 +185,18 @@ inline void enqueue_launch( // of the two terms is confusing here and depends on how you // look at things. 
detail_::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); - auto status = cudaLaunchCooperativeKernel( - (const void*) kernel_function, - launch_configuration.dimensions.grid, - launch_configuration.dimensions.block, - argument_ptrs, + kernel::handle_t kernel_function_handle = kernel::detail_::get_handle( (const void*) kernel_function); + auto status = cuLaunchCooperativeKernel( + kernel_function_handle, + launch_configuration.dimensions.grid.x, + launch_configuration.dimensions.grid.y, + launch_configuration.dimensions.grid.z, + launch_configuration.dimensions.block.x, + launch_configuration.dimensions.block.y, + launch_configuration.dimensions.block.z, launch_configuration.dynamic_shared_memory_size, - stream_handle); + stream_handle, + argument_ptrs); throw_if_error(status, "Cooperative kernel launch failed"); #else @@ -153,6 +210,59 @@ inline void enqueue_launch( } // namespace detail_ + +namespace kernel { + +namespace detail_ { + +// The helper code here is intended for re-imbuing kernel-related classes with the types +// of the kernel parameters. This is necessary since kernel wrappers may be type-erased +// (which makes it much easier to work with them and avoids a bunch of code duplication). +// +// Note: The type-unerased kernel must be a non-const function pointer. Why? Not sure. +// even though function pointers can't get written through, for some reason they are +// expected not to be const. + + +template +struct raw_kernel_typegen { + // You should be careful to only instantiate this class with nice simple types we can pass to CUDA kernels. +// static_assert( +// all_true< +// ::std::is_same< +// KernelParameters, +// ::cuda::detail_::kernel_parameter_decay_t>::value... +// >::value, +// "All kernel parameter types must be decay-invariant" ); + using type = void(*)(cuda::detail_::kernel_parameter_decay_t...); +}; + +} // namespace detail_ + +template +typename detail_::raw_kernel_typegen::type +unwrap(apriori_compiled_kernel_t kernel) +{ + using raw_kernel_t = typename detail_::raw_kernel_typegen::type; + return reinterpret_cast(const_cast(kernel.ptr())); +} + +} // namespace kernel + +namespace detail_ { + +template +struct enqueue_launch_helper { + void operator()( + apriori_compiled_kernel_t wrapped_kernel, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters); +}; + +} // namespace detail_ + + /** * @brief Enqueues a kernel on a stream (=queue) on the current CUDA device. * @@ -171,9 +281,8 @@ inline void enqueue_launch( *

    As kernels do not return values, neither does this function. It also contains no hooks, logging * commands etc. - if you want those, write an additional wrapper (perhaps calling this one in turn). * - * @param kernel_function the kernel to apply. Pass it just as-it-is, as though it were any other function. Note: + * @param kernel the kernel to apply. Pass it just as-it-is, as though it were any other function. Note: * If the kernel is templated, you must pass it fully-instantiated. Alternatively, you can pass a - * @ref kernel_t wrapping the raw pointer to the function. * @param stream the CUDA hardware command queue on which to place the command to launch the kernel (affects * the scheduling of the launch and the execution) * @param launch_configuration not all launches of the same kernel are identical: The launch may be configured @@ -184,13 +293,28 @@ inline void enqueue_launch( */ template void enqueue_launch( - Kernel kernel_function, + Kernel kernel, const stream_t& stream, launch_configuration_t launch_configuration, - KernelParameters&&... parameters); + KernelParameters&&... parameters) +{ + static_assert( + detail_::all_true< + ::std::is_trivially_copyable>::value... + >::value, + "All kernel parameter types must be of a trivially copyable (decayed) type." ); + constexpr const bool wrapped_kernel = ::std::is_base_of::type>::value; + // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to + // use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch + // function for each of them will basically be just a one-liner :-( + detail_::enqueue_launch( + ::std::integral_constant{}, + ::std::forward(kernel), stream, launch_configuration, + ::std::forward(parameters)...); +} /** - * Variant of @ref enqueue_launch for use with the default stream on the current device. + * Variant of @ref enqueue_launch for use with the default stream in the current context. * * @note This isn't called `enqueue` since the default stream is synchronous. */ diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp new file mode 100644 index 00000000..eeca6743 --- /dev/null +++ b/src/cuda/api/link.hpp @@ -0,0 +1,215 @@ +/** + * @file link.hpp + * + * @brief Wrappers for linking modules of compiled CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_LINK_HPP_ +#define CUDA_API_WRAPPERS_LINK_HPP_ + +#include +#include +#include +#include +#include + +#if __cplusplus >= 201703L +#include +#endif + +namespace cuda { + +///@cond +class device_t; +class module_t; +class link_t; +///@endcond + +namespace link { + +using handle_t = CUlinkState; + +namespace detail_ { + +// TODO: Check if the linking has been completed! +inline link_t wrap( + context::handle_t context, + link::handle_t handle, + link::options_t options, + bool take_ownership = false) noexcept; + +} // namespace detail_ + +inline link_t create(const void* image, link::options_t options); + +// TODO: Use a clase-class with C++17 of later, made up of the two classes here +namespace input { + +/** + * A typed, named, image in memory which can be used as an input to a runtime + * CUDA linking process. 
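// Editor's sketch (not part of the patch): the raw driver-API linking sequence which link_t and
// its add()/complete() methods below wrap. ptx_source and ptx_size are placeholders for a PTX
// image in memory and its size; error checking is elided:
CUlinkState link_state;
cuLinkCreate(0, nullptr, nullptr, &link_state);     // no JIT options
cuLinkAddData(link_state, CU_JIT_INPUT_PTX,
	const_cast<char*>(ptx_source), ptx_size, "my_ptx_input", 0, nullptr, nullptr);
void* cubin = nullptr;
size_t cubin_size = 0;
cuLinkComplete(link_state, &cubin, &cubin_size);    // the cubin memory remains owned by link_state
CUmodule module_handle;
cuModuleLoadData(&module_handle, cubin);            // must precede cuLinkDestroy(), which frees the cubin
cuLinkDestroy(link_state);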
+ */ +struct image_t : memory::region_t { + const char* name; + link::input_type_t type; +}; + +struct file_t { + const char* path; // TODO: Use a proper path in C++14 and later + link::input_type_t type; +}; + +} // namespace input + +} // namespace link + +/** + * @brief Wrapper class for a CUDA link (a process of linking compiled code together into an + * executable binary, using CUDA, at run-time) + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the link is a const-respecting operation on this class. + */ +class link_t { + +public: + /** + * Complete a linking process, producing a completely-linked cubin image (for loading into + * modules). + * + * @return The completely-linked cubin image, in a sized memory range. This memory is owned + * by the link object, and must not be freed/deleted. + */ + memory::region_t complete() const { + void* cubin_output_start; + size_t cubin_output_size; + auto status = cuLinkComplete(handle_, &cubin_output_start, &cubin_output_size); + throw_if_error(status, + "Failed completing the link with state at address " + cuda::detail_::ptr_as_hex(handle_)); + return memory::region_t{cubin_output_start, cubin_output_size}; + } + + // TODO: Replace this with methods which take wrapper classes. + void add(link::input::image_t image, const link::options_t ptx_compilation_options = {}) const + { + auto marshalled_options = ptx_compilation_options.marshal(); + auto status = cuLinkAddData( + handle_, + static_cast(image.type), + image.data(), // TODO: Is this really safe? + image.size(), + image.name, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + throw_if_error(status, + "Failed adding input " + ::std::string(image.name) + " of type " + ::std::to_string(image.type) + " to a link."); + } + + void add_file(link::input::file_t file_input, const link::options_t& options) const + { + auto marshalled_options = options.marshal(); + auto status = cuLinkAddFile( + handle_, + static_cast(file_input.type), + file_input.path, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + throw_if_error(status, + "Failed loading an object of type " + ::std::to_string(file_input.type) + " from file " + file_input.path); + } + +#if __cplusplus >= 201703L + void add_file(const ::std::filesystem::path& path, link::input_type_t file_contents_type) const + { + return add_file(path.c_str(), file_contents_type); + } +#endif + +protected: // constructors + + link_t(context::handle_t context, link::handle_t handle, link::options_t options, bool take_ownership) noexcept + : context_handle_(context), handle_(handle), options_(options), owning(take_ownership) { } + +public: // friendship + + friend link_t link::detail_::wrap(context::handle_t context, link::handle_t handle, link::options_t, bool take_ownership) noexcept; + +public: // constructors and destructor + + link_t(const link_t&) = delete; + + link_t(link_t&& other) noexcept : + link_t(other.context_handle_, other.handle_, other.options_, other.owning) + { + other.owning = false; + }; + + ~link_t() + { + if (owning) { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + auto status = cuLinkDestroy(handle_); + throw_if_error(status, + ::std::string("Failed destroying the link ") + detail_::ptr_as_hex(handle_) + + " in " + context::detail_::identify(context_handle_)); + } + } + +public: // operators + + 
link_t& operator=(const link_t& other) = delete; + link_t& operator=(link_t&& other) = delete; + +protected: // data members + const context::handle_t context_handle_; + const link::handle_t handle_; + link::options_t options_; + bool owning; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered +}; + +namespace link { + +inline link_t create(link::options_t options = link::options_t{}) +{ + handle_t new_link_handle; + auto marshalled_options = options.marshal(); + auto status = cuLinkCreate( + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()), + &new_link_handle + ); + throw_if_error(status, "Failed creating a new link "); + auto do_take_ownership = true; + return detail_::wrap( + context::current::detail_::get_handle(), + new_link_handle, + options, + do_take_ownership); +} + +namespace detail_ { + +// TODO: Check if the linking has been completed! +inline link_t wrap( + context::handle_t context, + link::handle_t handle, + link::options_t options, + bool take_ownership) noexcept +{ + return link_t{context, handle, options, take_ownership}; +} + +} // namespace detail_ + +} // namespace link + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_LINK_HPP_ diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp new file mode 100644 index 00000000..24b6b8d1 --- /dev/null +++ b/src/cuda/api/link_options.hpp @@ -0,0 +1,339 @@ +/** + * @file jit.hpp + * + * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ +#define CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ + + +#include + +#if __cplusplus >= 202002L +#include +#endif +#include + +namespace cuda { + +#if __cplusplus >= 202002L +using ::std::span; +#else + template + // Poor man's span. TODO: Replace it with a proper span. + // + // Note: A span is a reference type. That means that changes + // to the pointed-to data are _not_considered changes to the + // span, hence you can get to that data with const methods. + struct span { + T* data_; + size_t size_; + +// T*& data() noexcept { return const_cast(data_); } + T* data() const noexcept { return data_; } + constexpr size_t size() const noexcept { return size_; } + const T* cbegin() const { return data(); } + const T* cend() const { return data() + size_; } + T* begin() const { return data(); } + T* end() const { return data() + size_; } + }; +#endif + + +///@cond +class module_t; +///@endcond + +namespace link { + +enum input_type_t { + cubin, /// Compiled device-class-specific device code + ptx, /// PTX (microarchitecture-inspecific intermediate representation) + fatbin, /// A bundle of multiple cubin and/or PTX inputs; typically + object, /// A host-side binary object with embedded device code; a `.o` file + library, /// An archive of objects files with embedded device code; a `.a` file +} ; + +enum fallback_strategy_t { + prefer_ptx = 0, + prefer_binary = 1, +}; + +enum class caching_mode_t { + + /** + * ca - Cache at all levels, likely to be accessed again. + * + * The default load instruction cache operation is ld.ca, + * which allocates cache lines in all levels (L1 and L2) with + * normal eviction policy. Global data is coherent at the L2 + * level, but multiple L1 caches are not coherent for global + * data. 
+ */ + cash_at_all_levels, + cash_in_l1_and_l2 = cash_at_all_levels, + ca = cash_at_all_levels, + + /** + * Cache at global level (cache in L2 and below, not L1). + * + * Use ld.cg to cache loads only globally, bypassing the L1 + * cache, and cache only in the L2 cache. + */ + cache_at_global_level, + cache_in_l2_only = cache_at_global_level, + cg = cache_at_global_level, + + /** + * Cache streaming, likely to be accessed once. + * + * The ld.cs load cached streaming operation allocates global + * lines with evict-first policy in L1 and L2 to limit cache + * pollution by temporary streaming data that may be accessed + * once or twice. When ld.cs is applied to a Local window + * address, it performs the ld.lu operation. + */ + cache_as_evict_first, + cache_streaming = cache_as_evict_first, + cs = cache_streaming, + + /** + * Last use. + * + * The compiler/programmer may use ld.lu when restoring spilled + * registers and popping function stack frames to avoid needless + * write-backs of lines that will not be used again. The ld.lu + * instruction performs a load cached streaming operation + * (ld.cs) on global addresses. + */ + last_use, + lu = last_use, + + /** + * Don't cache and fetch again (consider cached system memory + * lines stale, fetch again). + * + * The ld.cv load operation applied to a global System Memory + * address invalidates (discards) a matching L2 line and + * re-fetches the line on each new load. + */ + fetch_again_and_dont_cache, + cv = fetch_again_and_dont_cache, +}; + +using register_index_t = unsigned; +using optimization_level_t = unsigned; +using option_t = CUjit_option; +constexpr const optimization_level_t maximum_optimization_level { 4 }; + +struct marshalled_options_t { + using size_type = unsigned; + constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; + +protected: + ::std::array option_buffer; + ::std::array value_buffer; + size_type count_ { 0 }; +public: + size_type count() { return count_; } + + void push_back(option_t option) + { + if (count_ >= max_num_options) { + throw ::std::invalid_argument("Attempt to push back the same option a second time"); + // If each option is pushed back at most once, the count cannot exist the number + // of possible options. In fact, it can't even reach it because some options contradict. + // + // Note: This check will not catch all repeat push-backs, nor the case of conflicting + // options - the cuLink methods will catch those. We just want to avoid overflow. + } + option_buffer[count_] = option; + count_++; + } +protected: + template + void* process_value(typename ::std::enable_if<::std::is_integral::value, I>::type value) + { + return reinterpret_cast(static_cast(value)); + } + + template + void* process_value(T* value) + { + return static_cast(value); + } + + void* process_value(bool value) { return process_value(value ? 1 : 0); } + + void* process_value(caching_mode_t value) + { + return process_value(static_cast::type>(value)); + } + +public: + + template + void push_back(option_t option, T value) + { + push_back(option); + process_value(value); + // Now set value_buffer[count-1]... + value_buffer[count_-1] = process_value(value); + } + const option_t* options() const { return option_buffer.data(); } + const void * const * values() const { return value_buffer.data(); } +}; + +struct options_t { + + static constexpr const register_index_t no_max_registers_limit { 0 }; + + /** + * Limit the number of registers which a kernel thread may use. + * + * @todo Use an optional. 
+ */ + register_index_t max_num_registers_per_thread { no_max_registers_limit }; + + static constexpr const register_index_t no_min_num_threads_per_block { 0 }; + + /** + * The minimum number of threads per block which the compiler should target + * @note can't be combined with a value for the @ref target property. + * + * @todo Use an optional. + */ + grid::block_dimension_t min_num_threads_per_block { no_min_num_threads_per_block }; + + // Note: The sizes are used as parameters too. + span info_log, error_log; + + static constexpr const optimization_level_t dont_set_optimization_level { maximum_optimization_level + 1 }; + /** + * Compilation optimization level (as in -O1, -O2 etc.) + * + * @todo Use an optional. + */ + optimization_level_t optimization_level { dont_set_optimization_level }; + + /** + * + * @todo Use a variant or optional+variant. + */ + struct { + bool obtain_from_cuda_context { true }; + bool use_specific { true }; + device::compute_capability_t specific; + } target; // Can't be combined with CU_JIT_THREADS_PER_BLOCK + + bool specify_fallback_strategy { false }; + /** + * @todo Use an optional. + */ + fallback_strategy_t fallback_strategy { prefer_ptx }; // fallback behavior if a cubin matching (WHAT?) is not found + + /** + * Whether or not to generate indications of which PTX/SASS instructions correspond to which + * lines of the source code, within the compiled output (-lineinfo) + */ + bool generate_debug_information { false }; /// Whether or not to generate debug information within the compiled output (-g) + bool generate_source_line_number_information { false }; + + // It _seems_ that the verbosity is a boolean setting - but this is not clear + bool verbose_log; + + bool specify_default_load_caching_mode { false }; + /** + * Specifies which of the PTX load caching modes use by default, + * when no caching mode is specified in a PTX instruction (-dlcm) + */ + caching_mode_t default_load_caching_mode; + + // Ignoring the "internal purposes only" options; + // + // CU_JIT_NEW_SM3X_OPT + // CU_JIT_FAST_COMPILE + // CU_JIT_GLOBAL_SYMBOL_NAMES + // CU_JIT_GLOBAL_SYMBOL_ADDRESSES + // CU_JIT_GLOBAL_SYMBOL_COUNT + // + +public: + marshalled_options_t marshal() const; +}; + +inline marshalled_options_t options_t::marshal() const +{ + marshalled_options_t marshalled; + + if (max_num_registers_per_thread != no_max_registers_limit) { + marshalled.push_back(CU_JIT_MAX_REGISTERS, max_num_registers_per_thread); + } + + if (min_num_threads_per_block != no_min_num_threads_per_block) { + marshalled.push_back(CU_JIT_THREADS_PER_BLOCK, min_num_threads_per_block); + } + + auto cil = const_cast*>(&info_log); + if (cil->data() != nullptr and cil->size() != 0) { + marshalled.push_back(CU_JIT_INFO_LOG_BUFFER, cil->data()); + marshalled.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, cil->size()); + } + + auto cel = const_cast*>(&error_log); + if (cel->data() != nullptr and cel->size() != 0) { + marshalled.push_back(CU_JIT_ERROR_LOG_BUFFER, cel->data()); + marshalled.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, cel->size()); + } + + if (optimization_level != dont_set_optimization_level) { + marshalled.push_back(CU_JIT_OPTIMIZATION_LEVEL, optimization_level); + } + + if (target.obtain_from_cuda_context) { + marshalled.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); + } + else if (target.use_specific) { + marshalled.push_back(CU_JIT_TARGET, target.specific.as_combined_number()); + } + + if (specify_fallback_strategy) { + marshalled.push_back(CU_JIT_FALLBACK_STRATEGY, fallback_strategy); + } + + if 
(generate_debug_information) { + marshalled.push_back(CU_JIT_GENERATE_DEBUG_INFO); + } + + if (generate_source_line_number_information) { + marshalled.push_back(CU_JIT_GENERATE_LINE_INFO); + } + + if (generate_source_line_number_information) { + marshalled.push_back(CU_JIT_GENERATE_LINE_INFO); + } + + if (verbose_log) { + marshalled.push_back(CU_JIT_LOG_VERBOSE); + } + + if (specify_default_load_caching_mode) { + marshalled.push_back(CU_JIT_CACHE_MODE, default_load_caching_mode); + } + + return marshalled; +} + + +// TODO: Compiler "output options": +// +// threads per block targeted +// compilation wall time +// amount written to info log + +} // namespace assembly_and_link + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 422d4744..71c14337 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -31,7 +31,10 @@ #include #include #include +#include + #include // needed, rather than cuda_runtime_api.h, e.g. for cudaMalloc +#include #include #include // for ::std::memset @@ -41,48 +44,13 @@ namespace cuda { ///@cond class device_t; +class context_t; class stream_t; +class module_t; ///@endcond -/** - * @namespace memory - * Representation, allocation and manipulation of CUDA-related memory, with - * its various namespaces and kinds of memory regions. - */ namespace memory { -namespace detail_ { - -template - class base_region_t { - private: - T* start_ = nullptr; - size_t size_in_bytes_ = 0; - public: - base_region_t() = default; - base_region_t(T* start, size_t size_in_bytes) - : start_(start), size_in_bytes_(size_in_bytes) {} - - T*& start() { return start_; } - size_t& size() { return size_in_bytes_; } - - size_t size() const { return size_in_bytes_; } - T* start() const { return start_; } - T* data() const { return start(); } - T* get() const { return start(); } - }; - -} // namespace detail_ - -struct region_t : public detail_::base_region_t { - using base_region_t::base_region_t; -}; - -struct const_region_t : public detail_::base_region_t { - using base_region_t::base_region_t; - const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} -}; - /** * A memory allocation setting: Can the allocated memory be used in other * CUDA driver contexts (in addition to the implicit default context we @@ -127,8 +95,8 @@ namespace detail_ { inline unsigned make_cuda_host_alloc_flags(allocation_options options) { return - (options.portability == portability_across_contexts::is_portable ? cudaHostAllocPortable : 0) & - (options.write_combining == cpu_write_combining::with_wc ? cudaHostAllocWriteCombined : 0); + (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) & + (options.write_combining == cpu_write_combining::with_wc ? 
CU_MEMHOSTALLOC_WRITECOMBINED : 0); } } // namespace detail_ @@ -161,11 +129,6 @@ struct region_pair { } // namespace mapped -} // namespace memory - - -namespace memory { - /** * @brief CUDA-Device-global memory on a single device (not accessible from the host) */ @@ -176,34 +139,31 @@ namespace detail_ { /** * Allocate memory on current device * - * @param size_in_bytes amount of memory to allocate in bytes + * @param num_bytes amount of memory to allocate in bytes */ -inline region_t allocate(size_t size_in_bytes) +inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes) { - void* allocated = nullptr; + device::address_t allocated = 0; // Note: the typed cudaMalloc also takes its size in bytes, apparently, // not in number of elements - auto status = cudaMalloc(&allocated, size_in_bytes); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAlloc(&allocated, num_bytes); + if (is_success(status) && allocated == 0) { // Can this even happen? hopefully not - status = cudaErrorUnknown; + status = (status_t) status::unknown; } - throw_if_error(status, - "Failed allocating " + ::std::to_string(size_in_bytes) + - " bytes of global memory on CUDA device " + - ::std::to_string(cuda::device::current::detail_::get_id())); - return {allocated, size_in_bytes}; + throw_if_error(status, "Failed allocating " + ::std::to_string(num_bytes) + + " bytes of global memory on the current CUDA device"); + return {as_pointer(allocated), num_bytes}; } -inline region_t allocate(cuda::device::id_t device_id, size_t size_in_bytes) +inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return memory::device::detail_::allocate(size_in_bytes); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return allocate_in_current_context(size_in_bytes); } } // namespace detail_ - namespace async { namespace detail_ { @@ -212,27 +172,27 @@ namespace detail_ { * Allocate memory asynchronously on a specified stream. */ inline region_t allocate( - cuda::device::id_t device_id, - cuda::stream::handle_t stream_handle, - size_t size_in_bytes) + context::handle_t context_handle, + stream::handle_t stream_handle, + size_t num_bytes) { #if CUDART_VERSION >= 11020 - void* allocated = nullptr; + device::address_t allocated = 0; // Note: the typed cudaMalloc also takes its size in bytes, apparently, // not in number of elements - auto status = cudaMallocAsync(&allocated, size_in_bytes, stream_handle); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle); + if (is_success(status) && allocated == 0) { // Can this even happen? 
hopefully not - status = static_cast(cuda::status::unknown); + status = static_cast(status::unknown); } throw_if_error(status, - "Failed scheduling an asynchronous allocation of " + ::std::to_string(size_in_bytes) + - " bytes of global memory on " + stream::detail_::identify(stream_handle, device_id)); - return {allocated, size_in_bytes}; + "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) + + " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) ); + return {as_pointer(allocated), num_bytes}; #else - (void) device_id; + (void) context_handle; (void) stream_handle; - (void) size_in_bytes; + (void) num_bytes; throw cuda::runtime_error(cuda::status::not_yet_implemented, "Asynchronous memory allocation is not supported with CUDA versions below 11.2"); #endif } @@ -243,8 +203,7 @@ inline region_t allocate( * Schedule an allocation of device-side memory on a CUDA stream. * * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" - * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees - * since at least version 5.0 that the minimum allocation is 256 bytes. + * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes. * * @throws cuda::runtime_error if scheduling fails for any reason * @@ -253,8 +212,7 @@ inline region_t allocate( * @return a pointer to the region of memory which will become allocated once the stream * completes all previous tasks and proceeds to also complete the allocation. */ -inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); - +inline region_t allocate(const stream_t& stream, size_t size_in_bytes); } // namespace async @@ -265,36 +223,69 @@ inline region_t allocate(const cuda::stream_t& stream, size_t size_in_bytes); ///@{ inline void free(void* ptr) { - auto result = cudaFree(ptr); + auto result = cuMemFree(address(ptr)); throw_if_error(result, "Freeing device memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) { free(region.start()); } ///@} +/** + * Allocate device-side memory on a CUDA device context. + * + * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" + * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees + * since at least version 5.0 that the minimum allocation is 256 bytes. + * + * @throws cuda::runtime_error if allocation fails for any reason + * + * @param device the context in which to allocate memory + * @param size_in_bytes the amount of global device memory to allocate + * @return a pointer to the allocated stretch of memory (only usable within @p context) + */ +inline region_t allocate(const context_t& context, size_t size_in_bytes); + /** * Allocate device-side memory on a CUDA device. * * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable" - * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes. + * (CUDA 9.0 Runtime API documentation), and the CUDA programming guide guarantees + * since at least version 5.0 that the minimum allocation is 256 bytes. 
* * @throws cuda::runtime_error if allocation fails for any reason * * @param device the device on which to allocate memory - * @param size_in_bytes the amount of memory to allocate - * @return a pointer to the allocated stretch of memory (only usable on the CUDA device) + * @param size_in_bytes the amount of global device memory to allocate + * @return a pointer to the allocated stretch of memory (only usable on @p device) */ -inline region_t allocate(cuda::device_t device, size_t size_in_bytes); +inline region_t allocate(const device_t& device, size_t size_in_bytes); namespace detail_ { + +// Note: Allocates _in the current context_! No current context => failure! struct allocator { - // Allocates on the current device! - void* operator()(size_t size_in_bytes) const { return detail_::allocate(size_in_bytes).start(); } + void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::device::free(ptr); } }; + } // namespace detail_ + +/** + * @brief Sets consecutive elements of a region of memory to a fixed + * value of some width + * + * @note A generalization of `set()`, for different-size units. + * + * @tparam T An unsigned integer type of size 1, 2, 4 or 8 + * @param start The first location to set to @p value ; must be properly aligned. + * @param value A (properly aligned) value to set T-elements to. + * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes). + */ +template +inline void typed_set(T* start, const T& value, size_t num_elements); + /** * @brief Sets all bytes in a region of memory to a fixed value * @@ -304,20 +295,17 @@ struct deleter { */ ///@{ /** - * @param start address at which to start setting memory bytes - * in global CUDA-device-side memory or CUDA-managed memory. - * @param byte_value the value to which to set memory bytes - * @param num_bytes the number of bytes to set to @p byte_value + * @param start starting address of the memory region to set, in a CUDA + * device's global memory + * @param num_bytes size of the memory region in bytes */ inline void set(void* start, int byte_value, size_t num_bytes) { - auto result = cudaMemset(start, byte_value, num_bytes); - throw_if_error(result, "memsetting an on-device buffer"); + return typed_set(static_cast(start), byte_value, num_bytes); } /** - * @param region a stretch of memory whose contents is to be set - * @param byte_value the value to which to set all bytes of @p region + * @param region a region to zero-out, in a CUDA device's global memory */ inline void set(region_t region, int byte_value) { @@ -325,7 +313,6 @@ inline void set(region_t region, int byte_value) } ///@} - /** * @brief Sets all bytes in a region of memory to 0 (zero) */ @@ -339,7 +326,9 @@ inline void zero(void* start, size_t num_bytes) } /** - * @param region the memory region to zero-out + * @param start starting address of the memory region to zero-out, + * in a CUDA device's global memory + * @param num_bytes size of the memory region in bytes */ inline void zero(region_t region) { @@ -367,41 +356,23 @@ inline void zero(T* ptr) * @note Since we assume Compute Capability >= 2.0, all devices support the * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, * where the data is located, and one does not have to specify this. 
- */ -///@{ -/** - * @param destination A pointer to a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source A pointer to a a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory + * + * @note asynchronous version of @ref memory::copy + * + * @param destination A (pointer to) a memory region of size @p num_bytes. + * Must be defined in the same context as @p stream. + * @param source A (pointer to) a memory region of size @p num_bytes. + * Must be defined in the same context as @p stream. * @param num_bytes The number of bytes to copy from @p source to @p destination */ -inline void copy(void *destination, const void *source, size_t num_bytes) -{ - auto result = cudaMemcpy(destination, source, num_bytes, cudaMemcpyDefault); - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error(result, "Synchronously copying data"); -} +///@{ +void copy(void *destination, const void *source, size_t num_bytes); -/** - * @param destination A memory region of the same size as @p source, in - * host memory or on any CUDA device's global memory - * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory - */ inline void copy(void* destination, const_region_t source) { return copy(destination, source.start(), source.size()); } -/** - * @param destination A region of memory to which to copy the data in@source, of - * size at least that of @p source , either in host memory or on any CUDA - * device's global memory. - * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory - */ inline void copy(region_t destination, const_region_t source) { #ifndef NDEBUG @@ -411,6 +382,30 @@ inline void copy(region_t destination, const_region_t source) #endif return copy(destination.start(), source); } + +/** + * @param source A plain array whose contents is to be copied. + */ +template +inline void copy(region_t destination, const T(&source)[N]) +{ +#ifndef NDEBUG + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); + } +#endif + return copy(destination.start(), source, N); +} + +inline void copy(region_t destination, void* source, size_t num_bytes) +{ +#ifndef NDEBUG + if (destination.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds destination size"); + } +#endif + return copy(destination.start(), source, num_bytes); +} ///@} /** @@ -426,13 +421,13 @@ inline void copy(region_t destination, const_region_t source) */ inline void set(void* ptr, int byte_value, size_t num_bytes) { - pointer_t pointer { ptr }; - switch ( pointer.attributes(). 
memory_type() ) { - case device_memory: - case managed_memory: + switch ( type_of(ptr) ) { + case device_: +// case managed_: + case unified_: memory::device::set(ptr, byte_value, num_bytes); break; - case unregistered_memory: - case host_memory: +// case unregistered_: + case host_: ::std::memset(ptr, byte_value, num_bytes); break; default: throw runtime_error( @@ -471,9 +466,8 @@ inline void zero(region_t region) /** * @brief Sets a number of bytes starting in at a given address of memory to 0 (zero) * - * @param start address at which to start setting memory bytes to 0, in + * @param region the memory region to zero-out; may be in host-side memory, * global CUDA-device-side memory or CUDA-managed memory. - * @param num_bytes the number of bytes to set to zero */ inline void zero(void* ptr, size_t num_bytes) { @@ -495,104 +489,210 @@ inline void zero(T* ptr) namespace detail_ { -/** - * @note When constructing this class - destination first, source second - * (otherwise you're implying the opposite direction of transfer). - */ -struct copy_params_t : cudaMemcpy3DParms { - struct tag { }; -protected: - template - copy_params_t(tag, const void *ptr, const array_t& array) : - cudaMemcpy3DParms { 0 }, - pitch(sizeof(T) * array.dimensions().width), - pitched_ptr(make_cudaPitchedPtr( - const_cast(ptr), - pitch, - array.dimensions().width, - array.dimensions().height)) - { - kind = cudaMemcpyDefault; - extent = array.dimensions(); +template +struct base_copy_params; + +template<> +struct base_copy_params<2> { + using intra_context_type = CUDA_MEMCPY2D; + using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? +}; + +template<> +struct base_copy_params<3> { + using type = CUDA_MEMCPY3D_PEER; + using intra_context_type = CUDA_MEMCPY3D; +}; + +// Note these, by default, support inter-context +template +using base_copy_params_t = typename base_copy_params::type; + + +enum class endpoint_t { + source, destination +}; + +template +struct copy_parameters_t : base_copy_params_t { + // TODO: Perhaps use proxies? + + using intra_context_type = typename base_copy_params::intra_context_type; + + using dimensions_type = array::dimensions_t; + + template + void set_endpoint(endpoint_t endpoint, const cuda::array_t &array); + + template + void set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t dimensions); + + template + void set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t dimensions); + + // TODO: Perhaps we should have an dimensioned offset type? + template + void set_offset(endpoint_t endpoint, dimensions_type offset); + + template + void clear_offset(endpoint_t endpoint) + { set_offset(endpoint, dimensions_type::zero()); } + + template + void set_extent(dimensions_type extent); + // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger + + void clear_rest(); + // Clear any dummy fields which are required to be set to 0. Note that important fields, + // which you have not set explicitly, will _not_ be cleared by this method. + +}; + +template<> +template +void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + // Can't set the endpoint context - the basic data structure doesn't support that! 
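+	// (A consequence: 2D copy descriptors built this way can only describe intra-context
+	// copies; the pointer-endpoint overload below which does accept a context handle
+	// throws for any handle other than context::detail_::none.)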
+} + +template<> +template +void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcContext : dstContext) = array.context_handle(); +} + +template<> +template +inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t<2> dimensions) +{ + if (context_handle != context::detail_::none) { + throw cuda::runtime_error( + cuda::status::named_t::not_supported, + "Inter-context copying of 2D arrays is not supported by the CUDA driver"); } + set_endpoint<2>(endpoint, ptr, dimensions); +} -public: - template - copy_params_t(const array_t& destination, const void *source) : - copy_params_t(tag{}, source, destination) - { - srcPtr = pitched_ptr; - dstArray = destination.get(); +template<> +template +inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<2> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } else { + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } } + (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; + // Can't set the endpoint context - the basic data structure doesn't support that! +} - template - copy_params_t(const T* destination, const array_t& source) : - copy_params_t(tag{}, destination, source) - { - srcArray = source.get(); - dstPtr = pitched_ptr; +template<> +template +inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, + array::dimensions_t<3> dimensions) +{ + cuda::memory::pointer_t wrapped{ptr}; + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } else { + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } } + (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); + (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; + (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; +} - size_t pitch; - cudaPitchedPtr pitched_ptr; -}; +template<> +template +inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<3> dimensions) +{ + set_endpoint(endpoint, context::detail_::none, ptr, dimensions); +} + +template<> +inline void copy_parameters_t<2>::clear_rest() +{} +template<> +inline void copy_parameters_t<3>::clear_rest() +{ + srcLOD = 0; + dstLOD = 0; +} + +template<> template -inline void copy(array_t& destination, const T *source) -{ - const auto dimensions = destination.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto source_pitch = width_in_bytes; // i.e. 
no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DToArray( - destination.get(), - offsets.width, - offsets.height, - source, - source_pitch, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault); - throw_if_error(result, "Synchronously copying into a 2D CUDA array"); +inline void copy_parameters_t<2>::set_extent(dimensions_type extent) +{ + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; } -template -inline void copy(array_t& destination, const T *source) +template<> +template +void copy_parameters_t<3>::set_extent(dimensions_type extent) { - const auto copy_params = detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3D(©_params); - throw_if_error(result, "Synchronously copying into a 3-dimensional CUDA array"); + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; + Depth = extent.depth; } -template -inline void copy(T *destination, const array_t& source) -{ - const auto dimensions = source.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto destination_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DFromArray( - destination, - destination_pitch, - source.get(), - offsets.width, - offsets.height, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault); - throw_if_error(result, "Synchronously copying out of a 2D CUDA array"); +template<> +template +void copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; } -template -inline void copy(T* destination, const array_t& source) +template<> +template +void copy_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) { - const auto copy_params = detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3D(©_params); - throw_if_error(result, "Synchronously copying from a 3-dimensional CUDA array"); + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; } -} // namespace detail_ +void set_endpoint(endpoint_t endpoint, void *src); + +inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) +{ + // Note this _must_ be an intra-context copy, as inter-context is not supported + // and there's no indication of context in the relevant data structures + return cuMemcpy2D(¶ms); +} + +inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) +{ + if (params.srcContext == params.dstContext) { + auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); + return cuMemcpy3D(intra_context_params); + } + return cuMemcpy3DPeer(¶ms); +} + +template +status_t multidim_copy(context::handle_t context_handle, copy_parameters_t params) +{ + context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{context_handle}; + return multidim_copy(::std::integral_constant{}, params); +} +} // namespace detail /** * Synchronously copies data from a CUDA array into non-array memory. @@ -604,12 +704,20 @@ inline void copy(T* destination, const array_t& source) * @param source A pointer to a region of contiguous memory holding `destination.size()` values * of type @tparam T. 
The memory may be located either on a CUDA device or in host memory. */ -template -inline void copy(array_t& destination, const T* source) +template +void copy(const array_t& destination, const T *source) { - detail_::copy(destination, source); + detail_::copy_parameters_t params{}; + auto dims = destination.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(dims); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, const_cast(source), dims); + params.set_endpoint(detail_::endpoint_t::destination, destination); + auto status = detail_::multidim_copy(destination.context_handle(), params); + throw_if_error(status, "Copying from a regular memory region into a CUDA array"); } - /** * Synchronously copies data into a CUDA array from non-array memory. * @@ -621,13 +729,41 @@ inline void copy(array_t& destination, const T* source) * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -inline void copy(T* destination, const array_t& source) +void copy(T *destination, const array_t& source) { - detail_::copy(destination, source); + detail_::copy_parameters_t params{}; + auto dims = source.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(source.dimensions()); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, source); + params.template set_endpoint(detail_::endpoint_t::destination, destination, dims); + params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto status = detail_::multidim_copy(source.context_handle(), params); + throw_if_error(status, "Copying from a CUDA array into a regular memory region"); } template -inline void copy(region_t destination, const array_t& source) +void copy(array_t destination, array_t source) +{ + detail_::copy_parameters_t params{}; + auto dims = source.dimensions(); + params.template clear_offset(detail_::endpoint_t::source); + params.template clear_offset(detail_::endpoint_t::destination); + params.template set_extent(source.dimensions()); + params.clear_rest(); + params.set_endpoint(detail_::endpoint_t::source, source); + params.set_endpoint(detail_::endpoint_t::destination, destination); + params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto status = //(source.context() == destination.context()) ? + detail_::multidim_copy(source.context_handle(), params); + throw_if_error(status, "Copying from a CUDA array into a regular memory region"); +} + + +template +void copy(region_t destination, const array_t& source) { if (source.size_bytes() < destination.size()) { throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy"); @@ -644,7 +780,7 @@ inline void copy(region_t destination, const array_t& source) * device's global memory */ template -inline void copy_single(T* destination, const T* source) +void copy_single(T* destination, const T* source) { copy(destination, source, sizeof(T)); } @@ -654,7 +790,8 @@ namespace async { namespace detail_ { /** - * Asynchronously copies data between memory spaces or within a memory space. + * Asynchronously copies data between memory spaces or within a memory space, but + * within a single CUDA context. 
* * @note Since we assume Compute Capability >= 2.0, all devices support the * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, @@ -667,16 +804,16 @@ namespace detail_ { ///@{ /** -* @param destination A pointer to a memory region of size @p num_bytes, either in -* host memory or on any CUDA device's global memory -* @param source A pointer to a memory region of size @p num_bytes, either in -* host memory or on any CUDA device's global memory -* @param num_bytes number of bytes to copy from @p source + * @param destination A pointer to a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source A pointer to a memory region of size at least @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param num_bytes number of bytes to copy from @p source * @param stream_handle The handle of a stream on which to schedule the copy operation */ inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) { - auto result = cudaMemcpyAsync(destination, source, num_bytes, cudaMemcpyDefault, stream_handle); + auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); // TODO: Determine whether it was from host to device, device to host etc and // add this information to the error string @@ -684,77 +821,103 @@ inline void copy(void* destination, const void* source, size_t num_bytes, stream } /** - * @param destination a memory region of size at least that of @p source, either - * in host memory or on any CUDA device's global memory - * @param source a memory region, either in host memory or on any CUDA device's - * global memory. + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory * @param stream_handle The handle of a stream on which to schedule the copy operation */ inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { - throw std::logic_error("Can't copy a large region into a smaller one"); + throw ::std::logic_error("Source size exceeds destination size"); } #endif copy(destination.start(), source.start(), source.size(), stream_handle); } ///@} -template -void copy(array_t& destination, const T* source, stream::handle_t stream_handle) +using memory::detail_::copy_parameters_t; + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<2> params, + stream::handle_t stream_handle) { - const auto copy_params = memory::detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3DAsync(©_params, stream_handle); - throw_if_error(result, "Scheduling a memory copy into a 3D CUDA array on " + stream::detail_::identify(stream_handle)); + // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters + // structure holds no information about contexts. 
+ return cuMemcpy2DAsync(¶ms, stream_handle); } -template -void copy(T* destination, const array_t& source, stream::handle_t stream_handle) +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<3> params, + stream::handle_t stream_handle) { - const auto copy_params = memory::detail_::copy_params_t(destination, source); - auto result = cudaMemcpy3DAsync(©_params, stream_handle); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on " + stream::detail_::identify(stream_handle)); + if (params.srcContext == params.dstContext) { + using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; + auto* intra_context_params = reinterpret_cast(¶ms); + return cuMemcpy3DAsync(intra_context_params, stream_handle); + } + return cuMemcpy3DPeerAsync(¶ms, stream_handle); + } -template -void copy(array_t& destination, const T* source, stream::handle_t stream_handle) -{ - const auto dimensions = destination.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto source_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DToArrayAsync( - destination.get(), - offsets.width, - offsets.height, - source, - source_pitch, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault, - stream_handle); - throw_if_error(result, "Scheduling a memory copy into a 2D CUDA array on " + stream::detail_::identify(stream_handle)); +template +status_t multidim_copy_in_current_context(copy_parameters_t params, stream::handle_t stream_handle) { + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); } -template -void copy(T* destination, const array_t& source, cuda::stream::handle_t stream_handle) -{ - const auto dimensions = source.dimensions(); - const auto width_in_bytes = sizeof(T) * dimensions.width; - const auto destination_pitch = width_in_bytes; // i.e. no padding - const array::dimensions_t<2> offsets { 0, 0 }; - auto result = cudaMemcpy2DFromArrayAsync( - destination, - destination_pitch, - source.get(), - offsets.width, - offsets.height, - width_in_bytes, - dimensions.height, - cudaMemcpyDefault, - stream_handle); - throw_if_error(result, "Scheduling a memory copy out of a 3D CUDA array on " + stream::detail_::identify(stream_handle)); +// Note: Assumes the stream handle is for a stream in the current context +template +status_t multidim_copy( + context::handle_t context_handle, + copy_parameters_t params, + stream::handle_t stream_handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + + +// Assumes the array and the stream share the same context, and that the destination is +// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
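+// For orientation - the detail_ functions here back the public, stream_t-taking overloads
+// defined further below; an illustrative sketch of a typical call (assuming `arr` is a
+// cuda::array_t<float, 2> and `stream` a cuda::stream_t in the same context):
+//
+//     std::vector<float> host_buf(arr.size());
+//     cuda::memory::region_t dst { host_buf.data(), host_buf.size() * sizeof(float) };
+//     cuda::memory::async::copy(dst, arr, stream);  // enqueued; completes with the stream
+//     stream.synchronize();
+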
+template +void copy(T *destination, const array_t& source, stream::handle_t stream_handle) +{ + using memory::detail_::endpoint_t; + auto dims = source.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + detail_::copy_parameters_t params{}; + params.template clear_offset(endpoint_t::source); + params.template clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.clear_rest(); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); + params.dstPitch = dims.width * sizeof(T); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); +} + + +template +void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) +{ + using memory::detail_::endpoint_t; + auto dims = destination.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + detail_::copy_parameters_t params{}; + params.template clear_offset(endpoint_t::source); + params.template clear_offset(endpoint_t::destination); + params.template set_extent(destination.dimensions()); + params.srcPitch = dims.width * sizeof(T); + params.clear_rest(); + params.set_endpoint(endpoint_t::source, const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } /** @@ -762,6 +925,9 @@ void copy(T* destination, const array_t& source, cuda::stream::handle_t st * * @note asynchronous version of @ref memory::copy_single * + * @note assumes the source and destination are all valid in the same context as that of the + * context handle + * * @param destination a value residing either in host memory or on any CUDA * device's global memory * @param source a value residing either in host memory or on any CUDA @@ -769,7 +935,7 @@ void copy(T* destination, const array_t& source, cuda::stream::handle_t st * @param stream_handle A stream on which to enqueue the copy operation */ template -inline void copy_single(T& destination, const T& source, stream::handle_t stream_handle) +void copy_single(T& destination, const T& source, stream::handle_t stream_handle) { copy(&destination, &source, sizeof(T), stream_handle); } @@ -785,13 +951,16 @@ inline void copy_single(T& destination, const T& source, stream::handle_t stream * * @note asynchronous version of @ref memory::copy * - * @param destination A pointer to a memory region of size @p num_bytes, - * either in host memory or on any CUDA device's global memory. - * @param source A pointer to a a memory region of size at least @p num_bytes, - * either in host memory or on any CUDA device's global memory + * @param destination A (pointer to) a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory. Must be defined in the same context + * as the stream. + * @param source A (pointer to) a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory. 
Must be defined in the same context + * as the stream * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ +///@{ void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream); inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream) @@ -824,15 +993,42 @@ inline void copy(region_t destination, const_region_t source, const stream_t& st copy(destination, source, source.size(), stream); } +/** + * @param source A plain array whose contents is to be copied. + */ +template +inline void copy(region_t destination, const T(&source)[N], const stream_t& stream) +{ +#ifndef NDEBUG + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); + } +#endif + return copy(destination.start(), source, N, stream); +} + +inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream) +{ +#ifndef NDEBUG + if (destination.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds destination size"); + } +#endif + return copy(destination.start(), source, num_bytes, stream); +} +///@} + /** * Asynchronously copies data from memory spaces into CUDA arrays. * - * @param destination A CUDA array (see @ref cuda::array_t ) + * @note asynchronous version of @ref memory::copy + * + * @param destination A CUDA array @ref cuda::array_t * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` * @param stream schedule the copy operation into this CUDA stream */ template -inline void copy(array_t& destination, const T* source, const stream_t& stream); +void copy(array_t& destination, const T* source, const stream_t& stream); template void copy(array_t& destination, const_region_t source, const stream_t& stream) @@ -864,7 +1060,7 @@ template void copy(region_t destination, const array_t& source, const stream_t& stream) { #ifndef NDEBUG - size_t required_size = destination.size() * sizeof(T); + size_t required_size = source.size() * sizeof(T); if (destination.size() < required_size) { throw ::std::invalid_argument( "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a " @@ -899,7 +1095,7 @@ namespace detail_ { inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle) { // TODO: Double-check that this call doesn't require setting the current device - auto result = cudaMemsetAsync(start, byte_value, num_bytes, stream_handle); + auto result = cuMemsetD8Async(address(start), byte_value, num_bytes, stream_handle); throw_if_error(result, "asynchronously memsetting an on-device buffer"); } @@ -908,7 +1104,6 @@ inline void set(region_t region, int byte_value, stream::handle_t stream_handle) set(region.start(), byte_value, region.size(), stream_handle); } - inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle) { set(start, 0, num_bytes, stream_handle); @@ -919,8 +1114,43 @@ inline void zero(region_t region, stream::handle_t stream_handle) zero(region.start(), region.size(), stream_handle); } +// TODO: Drop this in favor of -like functions under `cuda::~. 
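+// Illustrative sketch of the public API these detail_ functions back (the stream_t-taking
+// typed_set/set/zero declared right after this namespace); `dev_buf` is assumed to be a
+// region_t of device global memory and `stream` a cuda::stream_t usable with it:
+//
+//     auto words = static_cast<std::uint32_t*>(dev_buf.start());
+//     cuda::memory::device::async::typed_set(words, 0x01020304u, dev_buf.size() / 4, stream);
+//     cuda::memory::device::async::zero(dev_buf.start(), dev_buf.size(), stream);
+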
+template +inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle) +{ + static_assert(::std::is_trivially_copyable::value, "Non-trivially-copyable types cannot be used for setting memory"); + static_assert( + sizeof(T) == 1 or sizeof(T) == 2 or + sizeof(T) == 4 or sizeof(T) == 8, + "Unsupported type size - only sizes 1, 2 and 4 are supported"); + // TODO: Consider checking for alignment when compiling without NDEBUG + auto result {CUDA_SUCCESS}; + switch(sizeof(T)) { + case(1): result = cuMemsetD8Async (address(start), reinterpret_cast(value), num_elements, stream_handle); break; + case(2): result = cuMemsetD16Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; + case(4): result = cuMemsetD32Async(address(start), reinterpret_cast(value), num_elements, stream_handle); break; + } + throw_if_error(result, "Setting global device memory bytes"); +} + } // namespace detail_ + +/** + * @brief Sets consecutive elements of a region of memory to a fixed + * value of some width + * + * @note A generalization of `async::set()`, for different-size units. + * + * @tparam T An unsigned integer type of size 1, 2, 4 or 8 + * @param start The first location to set to @p value ; must be properly aligned. + * @param value A (properly aligned) value to set T-elements to. + * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes). + * @param stream The stream on which to enqueue the operation. + */ +template +void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream); + /** * Asynchronously sets all bytes in a stretch of memory to a single value * @@ -932,7 +1162,10 @@ inline void zero(region_t region, stream::handle_t stream_handle) * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream); +inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +{ + return typed_set(static_cast(start), byte_value, num_bytes, stream); +} /** * Similar to @ref set(), but sets the memory to zero rather than an arbitrary value @@ -945,7 +1178,8 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream); * * @note asynchronous version of @ref memory::zero * - * @param ptr a pointer to the value to be to zero + * @param ptr a pointer to the value to be to zero; must be valid in the + * CUDA context of @p stream * @param stream stream on which to schedule this action */ template @@ -956,8 +1190,166 @@ inline void zero(T* ptr, const stream_t& stream) } // namespace async + } // namespace device +namespace inter_context { + +namespace detail_ { + +inline void copy( + void * destination_address, + context::handle_t destination_context, + const void * source_address, + context::handle_t source_context, + size_t num_bytes) +{ + auto status = cuMemcpyPeer( + reinterpret_cast(destination_address), + destination_context, + reinterpret_cast(source_address), + source_context, num_bytes); + throw_if_error(status, + ::std::string("Failed copying data between devices: From address ") + + cuda::detail_::ptr_as_hex(source_address) + " in " + + context::detail_::identify(source_context) + " to address " + + cuda::detail_::ptr_as_hex(destination_address) + " in " + + context::detail_::identify(destination_context) ); +} + +} // namespace detail_ + +void copy( + void * destination, + const context_t& destination_context, 
+ const void * source_address, + const context_t& source_context, + size_t num_bytes); + +inline void copy( + void * destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context) +{ + copy(destination, destination_context, source.start(), source_context, source.size()); +} + +inline void copy( + region_t destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context) +{ +#ifndef NDEBUG + if (destination.size() < destination.size()) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); + } +#endif + copy(destination.start(), destination_context, source, source_context); +} + +template +inline void copy( + array_t destination, + array_t source) +{ + // for arrays, a single mechanism handles both intra- and inter-context copying + return memory::copy(destination, source); +} + +namespace async { + +namespace detail_ { + +inline void copy( + void *destination, + context::handle_t destination_context_handle, + const void *source, + context::handle_t source_context_handle, + size_t num_bytes, + stream::handle_t stream_handle) +{ + auto result = cuMemcpyPeerAsync( + device::address(destination), + destination_context_handle, + device::address(source), + source_context_handle, + num_bytes, stream_handle); + + // TODO: Determine whether it was from host to device, device to host etc and + // add this information to the error string + throw_if_error(result, "Scheduling an inter-context memory copy from " + + context::detail_::identify(source_context_handle) + " to " + + context::detail_::identify(destination_context_handle) + " on " + + stream::detail_::identify(stream_handle)); +} + +/** + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param stream_handle The handle of a stream on which to schedule the copy operation + */ +inline void copy( + region_t destination, + context::handle_t destination_context_handle, + const_region_t source, + context::handle_t source_context_handle, + stream::handle_t stream_handle) +{ +#ifndef NDEBUG + if (destination.size() < source.size()) { + throw ::std::logic_error("Can't copy a large region into a smaller one"); + } +#endif + copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), + stream_handle); +} +///@} + +} // namespace detail_ + +void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes, + stream_t stream); + +void copy( + void * destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream); + +inline void copy( + region_t destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream); + +template +inline void copy( + array_t destination, + array_t source, + const stream_t& stream) +{ + // for arrays, a single mechanism handles both intra- and inter-context copying + return memory::async::copy(destination, source, stream); +} + + +} // namespace async + +} // namespace inter_context + /** * @namespace host * Host-side (= system) memory which is "pinned", i.e. 
resides in @@ -968,12 +1360,13 @@ namespace host { /** * Allocate pinned host memory * - * @note "Pinned" memory is excepted from virtual memory swapping-out, - * and is allocated in contiguous physical RAM addresses, making it - * possible to copy to and from it to the the GPU using DMA without - * assistance from the GPU. Typically for PCIe 3.0, the effective - * bandwidth is twice as fast as copying from or to naively-allocated - * host memory. + * @note This function will fail if + * + * @note "Pinned" memory is allocated in contiguous physical RAM + * addresses, making it possible to copy to and from it to the the + * GPU using DMA without assistance from the GPU. This improves + * the copying bandwidth significantly over naively-allocated + * host memory, and reduces overhead for the CPU. * * @throws cuda::runtime_error if allocation fails for any reason * @@ -985,20 +1378,10 @@ namespace host { * * @return a pointer to the allocated stretch of memory */ -inline void* allocate( +void* allocate( size_t size_in_bytes, - allocation_options options) -{ - void* allocated = nullptr; - auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options); - auto result = cudaHostAlloc(&allocated, size_in_bytes, flags); - if (is_success(result) && allocated == nullptr) { - // Can this even happen? hopefully not - result = cudaErrorUnknown; - } - throw_if_error(result, "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of host memory"); - return allocated; -} + allocation_options options); + inline void* allocate( size_t size_in_bytes, @@ -1018,14 +1401,14 @@ inline void* allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) */ inline void free(void* host_ptr) { - auto result = cudaFreeHost(host_ptr); - throw_if_error(result, "Freeing pinned host memory at 0x" + cuda::detail_::ptr_as_hex(host_ptr)); + auto result = cuMemFreeHost(host_ptr); + throw_if_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } namespace detail_ { struct allocator { - void* operator()(size_t size_in_bytes) const { return cuda::memory::host::allocate(size_in_bytes); } + void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes); } }; struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } @@ -1036,8 +1419,8 @@ struct deleter { * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by - * flags. This memory range also is added to the same tracking mechanism as cudaHostAlloc() to - * automatically accelerate calls to functions such as cudaMemcpy(). + * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to + * automatically accelerate calls to functions such as cuMemcpy(). * * @param ptr A pre-allocated stretch of host memory * @param size the size in bytes the memory region to register/pin @@ -1045,7 +1428,7 @@ struct deleter { */ inline void register_(const void *ptr, size_t size, unsigned flags) { - auto result = cudaHostRegister(const_cast(ptr), size, flags); + auto result = cuMemHostRegister(const_cast(ptr), size, flags); throw_if_error(result, "Could not register and page-lock the region of " + ::std::to_string(size) + " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr)); @@ -1097,9 +1480,9 @@ inline void register_(const void *ptr, size_t size, { detail_::register_( ptr, size, - (register_mapped_io_space ? 
cudaHostRegisterIoMemory : 0) - | (map_into_device_space ? cudaHostRegisterMapped : 0) - | (make_device_side_accesible_to_all ? cudaHostRegisterPortable : 0) + (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0) + | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0) + | (make_device_side_accesible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0) ); } @@ -1120,7 +1503,8 @@ inline void register_( inline void register_(void const *ptr, size_t size) { - detail_::register_(ptr, size, cudaHostRegisterDefault); + unsigned no_flags_set { 0 }; + detail_::register_(ptr, size, no_flags_set); } inline void register_(const_region_t region) @@ -1133,7 +1517,7 @@ inline void register_(const_region_t region) // just ended inline void deregister(const void *ptr) { - auto result = cudaHostUnregister(const_cast(ptr)); + auto result = cuMemHostUnregister(const_cast(ptr)); throw_if_error(result, "Could not unregister the memory segment starting at address *a"); } @@ -1149,9 +1533,9 @@ inline void deregister(const_region_t region) * @note a wrapper for @ref ::std::memset * * @param start starting address of the memory region to set, - * in host memory; can be either CUDA-allocated or otherwise. + * in host memory; can be either CUDA-allocated or otherwise. * @param byte_value value to set the memory region to - * @param num_bytes number of bytes at @p address to be set + * @param num_bytes size of the memory region in bytes */ inline void set(void* start, int byte_value, size_t num_bytes) { @@ -1195,11 +1579,13 @@ struct const_region_t; namespace detail_ { +using advice_t = CUmem_advise; + template -inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute); +inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute); -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise advice, cuda::device::id_t device_id); -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute); +inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id); +// inline void advise(managed::const_region_t region, advice_t attribute); template struct base_region_t : public memory::detail_::base_region_t { @@ -1208,17 +1594,17 @@ struct base_region_t : public memory::detail_::base_region_t { bool is_read_mostly() const { - return get_scalar_range_attribute(*this, cudaMemRangeAttributeReadMostly); + return get_scalar_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } void designate_read_mostly() const { - set_scalar_range_attribute(*this, cudaMemAdviseSetReadMostly); + set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } void undesignate_read_mostly() const { - detail_::set_scalar_range_attribute(*this, cudaMemAdviseUnsetReadMostly); + unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY); } device_t preferred_location() const; @@ -1244,69 +1630,89 @@ void advise_expected_access_by(managed::const_region_t region, device_t& device) void advise_no_access_expected_by(managed::const_region_t region, device_t& device); template > -typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); + typename ::std::vector accessors(managed::const_region_t region, const Allocator& allocator = Allocator() ); namespace detail_ { template -inline T get_scalar_range_attribute(managed::const_region_t region, cudaMemRangeAttribute attribute) +inline T 
get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute) { uint32_t attribute_value { 0 }; - auto result = cudaMemRangeGetAttribute( - &attribute_value, sizeof(attribute_value), attribute, region.start(), region.size()); + auto result = cuMemRangeGetAttribute( + &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size()); throw_if_error(result, "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); return static_cast(attribute_value); } -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise advice, cuda::device::id_t device_id) +// CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's +// not called cuMemRangeSetAttribute, and uses a different enum. +inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id) { - auto result = cudaMemAdvise(region.start(), region.size(), advice, device_id); - throw_if_error(result, - "Setting an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); + auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id); + throw_if_error(result, "Setting an attribute for a managed memory range at " + + cuda::detail_::ptr_as_hex(region.start())); } -inline void set_scalar_range_attribute(managed::const_region_t region, cudaMemoryAdvise attribute) +// inline void set_range_attribute(managed::const_region_t region, range_attribute_t attribute, cuda::device::handle_t device_id) + +inline advice_t as_advice(range_attribute_t attribute, bool set) { - cuda::device::id_t ignored_device_index{}; - set_scalar_range_attribute(region, attribute, ignored_device_index); + switch (attribute) { + case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: + return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY; + case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: + return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION; + case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: + return set ? 
CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY; + default: + throw std::invalid_argument( + "CUDA memory range attribute does not correspond to any range advice value"); + } } -} // namespace detail_ +inline void set_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute, cuda::device::id_t device_id) +{ + constexpr const bool set { true }; + advise(region, as_advice(settable_attribute, set), device_id); +} +inline void unset_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute) +{ + constexpr const bool unset { false }; + constexpr const cuda::device::id_t dummy_device_id { 0 }; + advise(region, as_advice(settable_attribute, unset), dummy_device_id); +} -enum class initial_visibility_t { - to_all_devices, - to_supporters_of_concurrent_managed_access, -}; +} // namespace detail_ -enum class attachment_t { - global = cudaMemAttachGlobal, - host = cudaMemAttachHost, - single_stream = cudaMemAttachSingle, -}; +enum class attachment_t : unsigned { + global = CU_MEM_ATTACH_GLOBAL, + host = CU_MEM_ATTACH_HOST, + single_stream = CU_MEM_ATTACH_SINGLE, + }; namespace detail_ { -inline region_t allocate( - size_t size_in_bytes, +inline region_t allocate_in_current_context( + size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) { - void* allocated = nullptr; + device::address_t allocated = 0; auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ? - cudaMemAttachGlobal : cudaMemAttachHost; + attachment_t::global : attachment_t::host; // Note: Despite the templating by T, the size is still in bytes, // not in number of T's - auto status = cudaMallocManaged(&allocated, size_in_bytes, flags); - if (is_success(status) && allocated == nullptr) { + auto status = cuMemAllocManaged(&allocated, num_bytes, (unsigned) flags); + if (is_success(status) && allocated == 0) { // Can this even happen? hopefully not status = (status_t) status::unknown; } - throw_if_error(status, - "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of managed CUDA memory"); - return {allocated, size_in_bytes}; + throw_if_error(status, "Failed allocating " + + ::std::to_string(num_bytes) + " bytes of managed CUDA memory"); + return {as_pointer(allocated), num_bytes}; } /** @@ -1315,7 +1721,7 @@ inline region_t allocate( ///@{ inline void free(void* ptr) { - auto result = cudaFree(ptr); + auto result = cuMemFree(device::address(ptr)); throw_if_error(result, "Freeing managed memory at 0x" + cuda::detail_::ptr_as_hex(ptr)); } inline void free(region_t region) @@ -1326,44 +1732,74 @@ inline void free(region_t region) template struct allocator { - // Allocates on the current device! - void* operator()(size_t size_in_bytes) const + // Allocates in the current context! 
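// A short usage sketch (not part of the header itself) for the managed-memory
// allocation API declared in this namespace: allocate a region for a device,
// touch it from the host through the common address, and free it. The umbrella
// include <cuda/api.hpp> and the buffer size are assumptions.
#include <cuda/api.hpp>

void managed_allocation_sketch()
{
	auto device = cuda::device::current::get();
	// initial visibility defaults to initial_visibility_t::to_all_devices
	auto region = cuda::memory::managed::allocate(device, 1024 * sizeof(float));
	auto* data = static_cast<float*>(region.start());
	data[0] = 1.0f; // host-side access through the same address a kernel would use
	cuda::memory::managed::free(region);
}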
+ void* operator()(size_t num_bytes) const { - return detail_::allocate(size_in_bytes, InitialVisibility).start(); + return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start(); } }; + struct deleter { - void operator()(void* ptr) const { cuda::memory::device::free(ptr); } + void operator()(void* ptr) const { memory::device::free(ptr); } }; inline region_t allocate( - cuda::device::id_t device_id, - size_t size_in_bytes, + context::handle_t context_handle, + size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail_::allocate(size_in_bytes, initial_visibility); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return allocate_in_current_context(num_bytes, initial_visibility); } } // namespace detail_ +/** + * @brief Allocate a a region of managed memory, accessible with the same + * address on the host and on CUDA devices. + * + * @param context the initial context which is likely to access the managed + * memory region (and which will certainly have the region actually allocated + * for it) + * @param num_bytes size of each of the regions of memory to allocate + * @param initial_visibility will the allocated region be visible, using the + * common address, to all CUDA device (= more overhead, more work for the CUDA + * runtime) or just to those devices with some hardware features to assist in + * this task (= less overhead)? + */ +inline region_t allocate( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + /** * @brief Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed - * memory region (and which will certainly have actually allocated for it) - * @param size_in_bytes size of each of the regions of memory to allocate + * memory region (and which will certainly have the region actually allocated + * for it) + * @param num_bytes size of each of the regions of memory to allocate * @param initial_visibility will the allocated region be visible, using the * common address, to all CUDA device (= more overhead, more work for the CUDA * runtime) or just to those devices with some hardware features to assist in * this task (= less overhead)? */ -region_t allocate( - cuda::device_t device, - size_t size_in_bytes, - initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices -); +inline region_t allocate( + device_t device, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a a region of managed memory, accessible with the same + * address on the host and on all CUDA devices. + * + * @note While the allocated memory should be available universally, the + * allocation itself does require some GPU context. This will be the current + * context, if one exists, or the primary context on the runtime-defined current + * device. 
+ */ +region_t allocate(size_t num_bytes); /** * Free a managed memory region (host-side and device-side regions on all devices @@ -1372,7 +1808,7 @@ region_t allocate( */ inline void free(void* managed_ptr) { - auto result = cudaFree(managed_ptr); + auto result = cuMemFree(device::address(managed_ptr)); throw_if_error(result, "Freeing managed memory (host and device regions) at address 0x" + cuda::detail_::ptr_as_hex(managed_ptr)); @@ -1385,23 +1821,26 @@ inline void free(region_t region) namespace advice { -enum device_inspecific_kind_t { - read_mostly = cudaMemAdviseSetReadMostly, +enum kind_t { + read_mostly = CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + preferred_location = CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + accessor = CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, + // Note: CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION is never set }; -enum device_specific_kind_t { - preferred_location, - accessor, -}; +namespace detail_ { -inline void set(const_region_t region, device_inspecific_kind_t advice) +inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id) { - cuda::device::id_t ignored_device_index{}; - auto result = cudaMemAdvise(region.start(), region.size(), (cudaMemoryAdvise) advice, ignored_device_index); - throw_if_error(result, - "Setting advice on a (managed) memory region at" + cuda::detail_::ptr_as_hex(region.start())); + auto result = cuMemAdvise(device::address(region.start()), region.size(), (managed::detail_::advice_t) advice, device_id); + throw_if_error(result, "Setting advice on a (managed) memory region at" + + cuda::detail_::ptr_as_hex(region.start()) + " w.r.t. " + cuda::device::detail_::identify(device_id)); } +} // namespace detail_ + +void set(const_region_t region, kind_t advice, const device_t& device); + } // namespace advice namespace async { @@ -1411,12 +1850,13 @@ namespace detail_ { inline void prefetch( const_region_t region, cuda::device::id_t destination, - stream::handle_t stream_handle) + stream::handle_t source_stream_handle) { - auto result = cudaMemPrefetchAsync(region.start(), region.size(), destination, stream_handle); + auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle); throw_if_error(result, "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address " - + cuda::detail_::ptr_as_hex(region.start()) + " to device " + ::std::to_string(destination)); + + cuda::detail_::ptr_as_hex(region.start()) + " to " + ( + (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) ); } } // namespace detail_ @@ -1427,27 +1867,17 @@ inline void prefetch( * devices. */ void prefetch( - const_region_t region, - cuda::device_t destination, - const stream_t& stream); + const_region_t region, + const cuda::device_t& destination, + const stream_t& stream); /** * @brief Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ -inline void prefetch_to_host(const_region_t managed_region) -{ - auto result = cudaMemPrefetchAsync( - managed_region.start(), - managed_region.size(), - cudaCpuDeviceId, - stream::default_stream_handle); - // The stream handle will be ignored by the CUDA runtime API when this pseudo - // device indicator is used. 
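// A hedged sketch of using the advice and prefetch wrappers declared above:
// mark a managed region as read-mostly, then prefetch it to a device over that
// device's default stream. Assumes region_t converts to const_region_t and that
// <cuda/api.hpp> pulls in the relevant headers.
#include <cuda/api.hpp>

void managed_advice_and_prefetch_sketch()
{
	auto device = cuda::device::current::get();
	auto region = cuda::memory::managed::allocate(device, 4096);
	cuda::memory::managed::advice::set(region, cuda::memory::managed::advice::read_mostly, device);
	cuda::memory::managed::async::prefetch(region, device, device.default_stream());
	cuda::synchronize(device); // ensure the prefetch has completed before freeing
	cuda::memory::managed::free(region);
}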
- throw_if_error(result, - "Prefetching " + ::std::to_string(managed_region.size()) + " bytes of managed memory at address " - + cuda::detail_::ptr_as_hex(managed_region.start()) + " into host memory"); -} +void prefetch_to_host( + const_region_t region, + const stream_t& stream); } // namespace async @@ -1462,107 +1892,95 @@ namespace mapped { template inline T* device_side_pointer_for(T* host_memory_ptr) { - T* device_side_ptr; + device::address_t device_side_ptr; auto get_device_pointer_flags = 0u; // see the CUDA runtime documentation - auto status = cudaHostGetDevicePointer( + auto status = cuMemHostGetDevicePointer( &device_side_ptr, host_memory_ptr, get_device_pointer_flags); throw_if_error(status, "Failed obtaining the device-side pointer for host-memory pointer " + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory"); - return device_side_ptr; + return as_pointer(device_side_ptr); } namespace detail_ { /** - * Allocates a mapped pair of memory regions - on the current device - * and in host memory. + * Allocates a mapped pair of memory regions - in the current + * context and in host and device memory. * * @param size_in_bytes size of each of the two regions, in bytes. * @param options indication of how the CUDA driver will manage * the region pair * @return the allocated pair (with both regions being non-null) */ -inline region_pair allocate( +inline region_pair allocate_in_current_context( + context::handle_t current_context_handle, size_t size_in_bytes, allocation_options options) { - region_pair allocated; + region_pair allocated {}; + // The default initialization is unnecessary, but let's play it safe allocated.size_in_bytes = size_in_bytes; - auto flags = cudaHostAllocMapped & + auto flags = CU_MEMHOSTALLOC_DEVICEMAP & cuda::memory::detail_::make_cuda_host_alloc_flags(options); - // Note: the typed cudaHostAlloc also takes its size in bytes, apparently, - // not in number of elements - auto status = cudaHostAlloc(&allocated.host_side, size_in_bytes, flags); + auto status = cuMemHostAlloc(&allocated.host_side, size_in_bytes, flags); if (is_success(status) && (allocated.host_side == nullptr)) { // Can this even happen? hopefully not - status = cudaErrorUnknown; + status = (status_t) status::named_t::unknown; } throw_if_error(status, "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes) - + " bytes of global memory on device " + ::std::to_string(cuda::device::current::detail_::get_id())); + + " bytes of global memory in " + context::detail_::identify(current_context_handle)); allocated.device_side = device_side_pointer_for(allocated.host_side); return allocated; } -/** - * Allocates a mapped pair of memory regions - on a CUDA device - * and in host memory. - * - * @param device_id The device on which to allocate the device-side region - * @param size_in_bytes size of each of the two regions, in bytes. 
- * @param options indication of how the CUDA driver will manage - * the region pair - * @return the allocated pair (with both regions being non-null) - */ inline region_pair allocate( - cuda::device::id_t device_id, + context::handle_t context_handle, size_t size_in_bytes, allocation_options options) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); - return detail_::allocate(size_in_bytes, options); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + return detail_::allocate_in_current_context(context_handle, size_in_bytes, options); +} + +inline void free(void* host_side_pair) +{ + auto result = cuMemFreeHost(host_side_pair); + throw_if_error(result, "Freeing a mapped memory region pair with host-side address " + + cuda::detail_::ptr_as_hex(host_side_pair)); } } // namespace detail_ /** - * Allocate a pair of memory regions, on the host and on the device, mapped to each other so - * that changes to one will be reflected in the other. + * Allocate a memory region on the host, which is also mapped to a memory region in + * a context of some CUDA device - so that changes to one will be reflected in the other. * - * @param device The device on which the device-side region in the pair will be allocated + * @param context The device context in which the device-side region in the pair will be + * allocated. * @param size_in_bytes amount of memory to allocate (in each of the regions) * @param options see @ref allocation_options */ region_pair allocate( - cuda::device_t& device, + cuda::context_t& context, size_t size_in_bytes, allocation_options options); /** - * @brief A variant of @ref allocate facilitating only specifying some of the allocation options - */ -inline region_pair allocate( - cuda::device_t& device, - size_t size_in_bytes, - portability_across_contexts portability = portability_across_contexts(false), - cpu_write_combining cpu_wc = cpu_write_combining(false)) -{ - return allocate(device, size_in_bytes, allocation_options{ portability, cpu_wc } ); -} - -/** - * @brief A variant of @ref allocate facilitating only specifying some of the allocation options + * Allocate a memory region on the host, which is also mapped to a memory region in + * the global memory of a CUDA device - so that changes to one will be reflected in the other. + * + * @param device The device on which the device-side region in the pair will be allocated + * @param size_in_bytes amount of memory to allocate (in each of the regions) + * @param options see @ref allocation_options */ -inline region_pair allocate( +region_pair allocate( cuda::device_t& device, size_t size_in_bytes, - cpu_write_combining cpu_wc) -{ - return allocate(device, size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); -} + allocation_options options = allocation_options{}); /** @@ -1573,8 +1991,7 @@ inline region_pair allocate( */ inline void free(region_pair pair) { - auto result = cudaFreeHost(pair.host_side); - throw_if_error(result, "Could not free mapped memory region pair."); + detail_::free(pair.host_side); } /** @@ -1585,15 +2002,21 @@ inline void free(region_pair pair) */ inline void free_region_pair_of(void* ptr) { - auto wrapped_ptr = pointer_t { ptr }; - auto result = cudaFreeHost(wrapped_ptr.get_for_host()); - throw_if_error(result, "Could not free mapped memory region pair."); + // TODO: What if the pointer is not part of a mapped region pair? + // We could check this... 
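// A brief usage sketch for the mapped region-pair API declared above: allocate
// a pinned host buffer mapped into device address space, write through the
// host-side pointer, and free the pair. The include path is an assumption.
#include <cuda/api.hpp>

void mapped_region_pair_sketch()
{
	auto device = cuda::device::current::get();
	auto pair = cuda::memory::mapped::allocate(device, 4096);
	static_cast<unsigned char*>(pair.host_side)[0] = 0xFF;
	// ... a kernel may access the same bytes through pair.device_side ...
	cuda::memory::mapped::free(pair);
}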
+ void* host_side_ptr; + auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr)); + throw_if_error(status, "Failed obtaining the host-side address of supposedly-device-side pointer " + + cuda::detail_::ptr_as_hex(ptr)); + detail_::free(host_side_ptr); } /** * Determine whether a given stretch of memory was allocated as part of * a mapped pair of host and device memory regions * + * @todo What if it's a managed pointer? + * * @param ptr the beginning of a memory region - in either host or device * memory - to check * @return `true` iff the region was allocated as one side of a mapped @@ -1609,24 +2032,25 @@ inline bool is_part_of_a_region_pair(const void* ptr) } // namespace memory - namespace symbol { /** * Locates a CUDA symbol in global or constant device memory * + * @note `symbol_t` symbols are associated with the primary context + * * @return The region of memory CUDA associates with the symbol */ -inline memory::region_t locate(symbol_t symbol) +template +inline memory::region_t locate(T&& symbol) { void *start; size_t symbol_size; - auto api_call_result = cudaGetSymbolAddress(&start, symbol.handle); - throw_if_error(api_call_result, - "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); - api_call_result = cudaGetSymbolSize(&symbol_size, symbol.handle); - throw_if_error(api_call_result, - "Could not locate the device memory address for symbol " + cuda::detail_::ptr_as_hex(symbol.handle)); - return {start, symbol_size}; + auto api_call_result = cudaGetSymbolAddress(&start, std::forward(symbol)); + throw_if_error(api_call_result, "Could not locate the device memory address for a symbol"); + api_call_result = cudaGetSymbolSize(&symbol_size, std::forward(symbol)); + throw_if_error(api_call_result, "Could not locate the device memory address for the symbol at address" + + cuda::detail_::ptr_as_hex(start)); + return { start, symbol_size }; } } // namespace symbol diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index dfaea734..05d4cd0c 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -8,28 +8,37 @@ #ifndef CUDA_API_WRAPPERS_MISCELLANY_HPP_ #define CUDA_API_WRAPPERS_MISCELLANY_HPP_ -#include -#include +#include #include +#include + +#include +#include +#include namespace cuda { /** - * @brief Ensures the CUDA runtime has fully initialized + * Obtains the CUDA Runtime version * - * @note The CUDA runtime uses lazy initialization, so that until you perform - * certain actions, the CUDA driver is not used to create a context, nothing - * is done on the device etc. This function forces this initialization to - * happen immediately, while not having any other effect. + * @note unlike {@ref maximum_supported_by_driver()}, 0 cannot be returned, + * as we are actually using the runtime to obtain the version, so it does + * have _some_ version. 
*/ -inline -void force_runtime_initialization() +inline void initialize_driver() { + constexpr const unsigned dummy_flags { 0 }; // this is the only allowed value for flags + auto status = cuInit(dummy_flags); + throw_if_error(status, "Failed initializing the CUDA driver"); +} + +inline void ensure_driver_is_initialized() { - // nVIDIA's Robin Thoni (https://www.rthoni.com/) guarantees - // the following code "does the trick" - auto status = cudaFree(nullptr); - throw_if_error(status, "Forcing CUDA runtime initialization"); + thread_local bool driver_known_to_be_initialized { false }; + if (not driver_known_to_be_initialized) { + initialize_driver(); + driver_known_to_be_initialized = true; + } } namespace device { @@ -46,15 +55,17 @@ namespace device { * @return the number of CUDA devices on this system * @throws cuda::error if the device count could not be obtained */ -inline device::id_t count() +inline device::id_t count() { + initialize_driver(); + // This function is often called before any device is obtained (which is where we + // expect the driver to be initialized) int device_count = 0; // Initializing, just to be on the safe side - status_t result = cudaGetDeviceCount(&device_count); - if (result == status::no_device) { - return 0; - } - else { - throw_if_error(result, "Failed obtaining the number of CUDA devices on the system"); + status_t result = cuDeviceGetCount(&device_count); + switch(result) { + case status::no_device: return 0; + case status::success: break; + default: throw runtime_error(result, "Failed obtaining the number of CUDA devices on the system"); } if (device_count < 0) { throw ::std::logic_error("cudaGetDeviceCount() reports an invalid number of CUDA devices"); diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp new file mode 100644 index 00000000..69b9c070 --- /dev/null +++ b/src/cuda/api/module.hpp @@ -0,0 +1,355 @@ +/** + * @file module.hpp + * + * @brief Wrappers for working with modules of compiled CUDA code. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_MODULE_HPP_ +#define CUDA_API_WRAPPERS_MODULE_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if __cplusplus >= 201703L +#include +#endif + +namespace cuda { + +///@cond +class device_t; +class context_t; +class module_t; +class kernel_t; +///@endcond + +namespace module { + +namespace detail_ { + +inline module_t construct( + device::id_t device_id, + context::handle_t context_handle, + handle_t handle, + link::options_t options, + bool take_ownership = false, + bool hold_primary_context_reference = false) noexcept; + +inline ::std::string identify(const module::handle_t &handle) +{ + return std::string("module ") + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(const module::handle_t &handle, context::handle_t context_handle) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(const module::handle_t &handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " in " + context::detail_::identify(context_handle, device_id); +} + +::std::string identify(const module_t &module); + +} // namespace detail_ + +/** + * Load a module from an appropriate compiled or semi-compiled file, allocating all + * relevant resources for it. + * + * @param path of a cubin, PTX, or fatbin file constituting the module to be loaded. 
+ * @return the loaded module + * + * @note this covers cuModuleLoadFatBinary() even though that's not directly used + */ +module_t load_from_file(const char *path, link::options_t link_options = {}); + +module_t load_from_file(const ::std::string &path, link::options_t link_options = {}); + +#if __cplusplus >= 201703L +module_t load_from_file(const ::std::filesystem::path& path, link::options_t options = {}); +#endif + +/** + * Create a CUDA driver module from raw module image data. + * + * @param[inout] context The CUDA context into which the module data will be loaded (and + * in which the module contents may be used) + * @parem[in + * The pointer may be obtained by mapping a cubin or PTX or fatbin file, passing a cubin or PTX or fatbin file as a NULL-terminated text string, or incorporating a cubin or fatbin object into the executable resources and using operating system calls such as Windows FindResource() to obtain the pointer. + */ +///@{ +module_t create(context_t context, const void* module_data, link::options_t link_options); +module_t create(context_t context, const void* module_data); +module_t create(device_t device, const void* module_data, link::options_t link_options); +module_t create(device_t device, const void* module_data); +template +module_t create(context_t context, ContiguousContainer module_data); +///@} + +} // namespace module + +/** + * Wrapper class for a CUDA code module + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the module is a const-respecting operation on this class. + */ +class module_t { + +public: // getters + + module::handle_t handle() const { return handle_; } + context::handle_t context_handle() const { return context_handle_; } + device::id_t device_id() const { return device_id_; } + context_t context() const; + device_t device() const; + + // These API calls are not really the way you want to work. + cuda::kernel_t get_kernel(const char* name) const + { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + kernel::handle_t kernel_function_handle; + auto result = cuModuleGetFunction(&kernel_function_handle, handle_, name); + throw_if_error(result, ::std::string("Failed obtaining function ") + name + + " from " + module::detail_::identify(*this)); + return kernel::detail_::wrap( + context::detail_::get_device_id(context_handle_), context_handle_, kernel_function_handle); + } + + cuda::memory::region_t get_global_object(const char* name) const; + + // TODO: Implement a surface reference and texture reference class rather than these raw pointers. 
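// A hedged usage sketch for the module-loading functions declared above: load a
// compiled module from a file and obtain one of its kernels by name. The file
// name and kernel name are hypothetical; a C++ kernel would need its mangled
// name (or to be declared extern "C"), and a current context is assumed to exist.
#include <cuda/api.hpp>

void module_from_file_sketch()
{
	auto module = cuda::module::load_from_file("my_kernels.cubin");
	auto kernel = module.get_kernel("my_kernel");
	// ... the kernel may now be launched on a stream within the same context ...
}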
+ + CUsurfref* get_surface(const char* name) const; + CUtexref* get_texture_reference(const char* name) const; + +protected: // constructors + + module_t( + device::id_t device_id, + context::handle_t context, + module::handle_t handle, + link::options_t options, + bool owning, + bool hold_primary_context_reference) +#ifdef NDEBUG + noexcept +#endif + : device_id_(device_id), context_handle_(context), handle_(handle), options_(options), owning_(owning), + holds_primary_context_refcount_unit_(hold_primary_context_reference) + { +#ifndef NDEBUG + if (not owning and hold_primary_context_reference) { + throw std::invalid_argument("A non-owning module proxy should not try to hold its own primary context refcount unit"); + } + if (hold_primary_context_reference and not context::detail_::is_primary(context_handle_)) + { + throw std::invalid_argument("A module in a non-primary context should not presume to hold a primary context refcount unit"); + } +#endif + if (owning and hold_primary_context_reference) { + device::primary_context::detail_::increase_refcount(device_id); + } + } + + module_t(device::id_t device_id, context::handle_t context, module::handle_t handle, link::options_t options, bool owning) noexcept + : module_t(device_id, context, handle, options, owning, false) + { } + +public: // friendship + + friend module_t module::detail_::construct(device::id_t, context::handle_t, module::handle_t, link::options_t, bool, bool) noexcept; + + +public: // constructors and destructor + + module_t(const module_t&) = delete; + + module_t(module_t&& other) noexcept : + module_t(other.device_id_, other.context_handle_, other.handle_, other.options_, other.owning_) + { + other.owning_ = false; + }; + + ~module_t() + { + if (owning_) { + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_); + auto status = cuModuleUnload(handle_); + throw_if_error(status, "Failed unloading " + module::detail_::identify(*this)); + + if (holds_primary_context_refcount_unit_) { + device::primary_context::detail_::decrease_refcount(device_id_); + } + } + } + +public: // operators + + module_t& operator=(const module_t& other) = delete; + module_t& operator=(module_t&& other) = delete; + +protected: // data members + device::id_t device_id_; + context::handle_t context_handle_; + module::handle_t handle_; + link::options_t options_; + bool owning_; + // this field is mutable only for enabling move construction; other + // than in that case it must not be altered + bool holds_primary_context_refcount_unit_; +}; + +namespace module { + +using handle_t = CUmodule; + +/** +* Loads a populated module from a file on disk +* +* @param path Filesystem path of a fatbin, cubin or PTX file +* +* @todo: Do we really need link options here? 
+ * @todo: Make this take a context_t; and consider adding load_module methods to context_t +*/ +inline module_t load_from_file(const char* path, link::options_t link_options) +{ + handle_t new_module_handle; + auto status = cuModuleLoad(&new_module_handle, path); + throw_if_error(status, ::std::string("Failed loading a module from file ") + path); + bool do_take_ownership { true }; + auto current_context_handle = context::current::detail_::get_handle(); + auto current_device_id = context::detail_::get_device_id(current_context_handle); + return detail_::construct(current_device_id, current_context_handle, new_module_handle, link_options, + do_take_ownership); +} + +inline module_t load_from_file(const ::std::string& path, link::options_t link_options) +{ + return load_from_file(path.c_str(), link_options); +} + +#if __cplusplus >= 201703L +inline module_t load_from_file(const ::std::filesystem::path& path) +{ + return load_from_file(path.c_str()); +} +#endif + +namespace detail_ { + +// This might have been called "wrap", if we had not needed to take care +// of primary context reference counting +inline module_t construct( + device::id_t device_id, + context::handle_t context_handle, + handle_t module_handle, + link::options_t options, + bool take_ownership, + bool hold_primary_context_reference) noexcept +{ + return module_t{device_id, context_handle, module_handle, options, take_ownership, hold_primary_context_reference}; +} + +template +inline module_t create(const context_t& context, const void* module_data, Creator creator_function, bool hold_pc_reference) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + handle_t new_module_handle; + auto status = creator_function(new_module_handle, module_data); + throw_if_error(status, ::std::string( + "Failed loading a module from memory location ") + cuda::detail_::ptr_as_hex(module_data) + + "within " + context::detail_::identify(context)); + bool do_take_ownership { true }; + // TODO: Make sure the default-constructed options correspond to what cuModuleLoadData uses as defaults + return detail_::construct(context.device_id(), context.handle(), new_module_handle, + link::options_t{}, do_take_ownership, hold_pc_reference); +} + +// TODO: Consider adding create_module() methods to context_t +inline module_t create(const context_t& context, const void* module_data, const link::options_t& link_options, bool hold_pc_reference) +{ + auto creator_function = + [&link_options](handle_t& new_module_handle, const void* module_data) { + auto marshalled_options = link_options.marshal(); + return cuModuleLoadDataEx( + &new_module_handle, + module_data, + marshalled_options.count(), + const_cast(marshalled_options.options()), + const_cast(marshalled_options.values()) + ); + }; + return detail_::create(context, module_data, creator_function, hold_pc_reference); +} + +inline module_t create(const context_t& context, const void* module_data, bool hold_pc_reference) +{ + auto creator_function = + [](handle_t& new_module_handle, const void* module_data) { + return cuModuleLoadData(&new_module_handle, module_data); + }; + return detail_::create(context, module_data, creator_function, hold_pc_reference); +} + +} // namespace detail_ + +// TODO: Use an optional to reduce the number of functions here... when the +// library starts requiring C++14. 
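// A hedged sketch of creating a module from an in-memory image (e.g. PTX text
// produced at run time) using the create() overloads declared in this namespace.
// The ptx_image argument and kernel name are hypothetical placeholders.
#include <cuda/api.hpp>

void module_from_image_sketch(const void* ptx_image)
{
	auto device = cuda::device::current::get();
	auto module = cuda::module::create(device, ptx_image);
	auto kernel = module.get_kernel("my_kernel"); // assumes an extern "C" kernel of this name
}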
+ +inline module_t create(context_t context, const void* module_data) +{ + return detail_::create(context, module_data, false); +} + +inline module_t create(context_t context, const void* module_data, link::options_t link_options) +{ + return detail_::create(context, module_data, link_options, false); +} + +inline module_t create(device::primary_context_t primary_context, const void* module_data) +{ +#ifndef NDEBUG + if (module_data == nullptr) { + throw std::invalid_argument("Attempt to create a module with a null pointer for its data"); + } +#endif + constexpr const bool do_hold_primary_context_reference { true }; + const context_t& context = primary_context; + return detail_::create(context, module_data, do_hold_primary_context_reference); +} + +inline module_t create(device::primary_context_t primary_context, const void* module_data, link::options_t link_options) +{ +#ifndef NDEBUG + if (module_data == nullptr) { + throw std::invalid_argument("Attempt to create a module with a null pointer for its data"); + } +#endif + constexpr const bool do_hold_primary_context_reference {true }; + const context_t& context = primary_context; + return detail_::create(context, module_data, link_options, do_hold_primary_context_reference); +} + +namespace detail_ { + +inline ::std::string identify(const module_t& module) +{ + return identify(module.handle(), module.context_handle(), module.device().id()); +} + +} // namespace detail_ + +} // namespace module + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_MODULE_HPP_ diff --git a/src/cuda/api/multi_wrapper_impls.hpp b/src/cuda/api/multi_wrapper_impls.hpp index a461dc8f..b8b86faf 100644 --- a/src/cuda/api/multi_wrapper_impls.hpp +++ b/src/cuda/api/multi_wrapper_impls.hpp @@ -2,7 +2,7 @@ * @file multi_wrapper_impls.hpp * * @brief Implementations of methods or functions requiring the definitions of - * multiple CUDA entity proxy classes. In some cases these are declared in the + * multiple CUDA entity proxy classes. In most cases these are declared in the * individual proxy class files, with the other classes forward-declared. 
*/ #pragma once @@ -13,12 +13,21 @@ #include #include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include + #include +#include #include #include @@ -26,51 +35,46 @@ namespace cuda { -template -device_t array_t::device() const noexcept -{ - return device::get(device_id_); -} +namespace detail_ { -template -texture_view::texture_view( - const cuda::array_t& arr, - texture::descriptor_t descriptor) - : device_id_(arr.device().id()), owning(true) -{ - cudaResourceDesc resource_descriptor; - memset(&resource_descriptor, 0, sizeof(resource_descriptor)); - resource_descriptor.resType = cudaResourceTypeArray; - resource_descriptor.res.array.array = arr.get(); +template +using void_t = void; - auto status = cudaCreateTextureObject(&raw_handle_, &resource_descriptor, &descriptor, nullptr); - throw_if_error(status, "failed creating a CUDA texture object"); -} +template class, typename = void> +struct is_detected : ::std::false_type {}; -inline device_t texture_view::associated_device() const noexcept -{ - return cuda::device::get(device_id_); -} +template class Op> +struct is_detected>> : ::std::true_type {}; -namespace array { +template< class, class = void > +struct has_data : ::std::false_type { }; -namespace detail_ { +template< class T> +struct has_data().data())>> +: std::is_same().data()), void*>::type { }; + +} // namespace detail_ + +namespace array { template -handle_t create(const device_t& device, dimensions_t dimensions) +array_t create( + const context_t& context, + dimensions_t dimensions) { - return create(device.id(), dimensions); + handle_t handle = detail_::create(context.handle(), dimensions); + return wrap(context.device_id(), context.handle(), handle, dimensions); } -} // namespace detail_ - template -array_t create( - const device_t& device, - dimensions_t dimensions) +array_t create( + device_t device, + dimensions_t dimensions) { - handle_t handle { detail_::create(device, dimensions) }; - return wrap(device.id(), handle, dimensions); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + auto context_handle = set_context_for_this_scope.primary_context_handle; + handle_t handle = detail_::create(context_handle, dimensions); + return wrap(device.id(), context_handle, handle, dimensions); } } // namespace array @@ -78,27 +82,48 @@ array_t create( namespace event { inline event_t create( - device_t device, + const context_t& context, + bool uses_blocking_sync, + bool records_timing, + bool interprocess) +{ + // Yes, we need the ID explicitly even on the current device, + // because event_t's don't have an implicit device ID. + return event::detail_::create(context.device_id(), context.handle(), uses_blocking_sync, records_timing, interprocess); +} + +inline event_t create( + device_t& device, bool uses_blocking_sync, bool records_timing, bool interprocess) { - auto device_id = device.id(); - // Yes, we need the ID explicitly even on the current device, - // because event_t's don't have an implicit device ID. 
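// A short usage sketch for the event-creation overloads defined here: create an
// event for a device, record it on the device's default stream, and wait for it
// by synchronizing that stream. The include path is an assumption.
#include <cuda/api.hpp>

void event_usage_sketch()
{
	auto device = cuda::device::current::get();
	auto stream = device.default_stream();
	auto event = cuda::event::create(device,
		/* uses_blocking_sync */ true,
		/* records_timing     */ false,
		/* interprocess       */ false);
	// ... enqueue asynchronous work on the stream here ...
	event.record(stream);
	stream.synchronize(); // returns once all work up to and including the event is done
}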
- return event::detail_::create(device_id , uses_blocking_sync, records_timing, interprocess); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return event::detail_::create_in_current_context( + device.id(), + context::current::detail_::get_handle(), + uses_blocking_sync, records_timing, interprocess); } namespace ipc { -inline handle_t export_(event_t& event) +inline handle_t export_(const event_t& event) { return detail_::export_(event.handle()); } -inline event_t import(device_t& device, const handle_t& handle) +inline event_t import(const context_t& context, const handle_t& event_ipc_handle) +{ + bool do_not_take_ownership { false }; + return event::detail_::wrap(context.device_id(), context.handle(), detail_::import(event_ipc_handle), do_not_take_ownership); +} + + +inline event_t import(const device_t& device, const handle_t& event_ipc_handle) { - return event::detail_::wrap(device.id(), detail_::import(handle), do_not_take_ownership); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + auto handle = detail_::import(event_ipc_handle); + return event::detail_::wrap(device.id(), context::current::detail_::get_handle(), handle, do_not_take_ownership); } } // namespace ipc @@ -108,30 +133,182 @@ inline event_t import(device_t& device, const handle_t& handle) // device_t methods -inline stream_t device_t::default_stream() const noexcept +inline device::primary_context_t device_t::primary_context(bool scoped) const +{ + auto pc_handle = primary_context_handle(); + auto decrease_refcount_on_destruct = not scoped; + if (not scoped) { + device::primary_context::detail_::increase_refcount(id_); + // Q: Why increase the refcount here, when `primary_context_handle()` + // ensured this has already happened for this object? + // A: Because an unscoped primary_context_t needs its own refcount + // unit (e.g. in case this object gets destructed but the + // primary_context_t is still alive. 
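// A hedged sketch combining several wrappers defined in this file: allocate
// device-global memory, copy into it asynchronously on the device's default
// stream, then synchronize the device. Sizes and the include are assumptions.
#include <cuda/api.hpp>

void async_copy_sketch()
{
	auto device = cuda::device::current::get();
	auto stream = device.default_stream();
	auto device_buffer = cuda::memory::device::allocate(device, 256 * sizeof(float));
	float host_buffer[256] = { };
	cuda::memory::async::copy(device_buffer.start(), host_buffer, sizeof(host_buffer), stream);
	cuda::synchronize(device); // block until the copy (and any preceding work) completes
	cuda::memory::device::free(device_buffer.start());
}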
+ } + return device::primary_context::detail_::wrap(id_, pc_handle, decrease_refcount_on_destruct); +} + +inline stream_t device_t::default_stream() const { - return stream::detail_::wrap(id(), stream::default_stream_handle); + return stream::detail_::wrap(id(), primary_context_handle(), stream::default_stream_handle); } -inline stream_t -device_t::create_stream( +inline stream_t device_t::create_stream( bool will_synchronize_with_default_stream, stream::priority_t priority) const { - device::current::detail_::scoped_override_t set_device_for_this_scope(id_); - return stream::detail_::wrap(id(), stream::detail_::create_on_current_device( - will_synchronize_with_default_stream, priority), do_take_ownership); + device::current::detail_::scoped_context_override_t set_context_for_this_scope(id_); + return stream::detail_::create(id_, primary_context_handle(), will_synchronize_with_default_stream, priority); +} + +inline bool context_t::is_primary() const +{ + auto pc_handle = device::primary_context::detail_::obtain_and_increase_refcount(device_id_); + device::primary_context::detail_::decrease_refcount(device_id_); + return handle_ == pc_handle; +} + +inline module_t context_t::create_module(const void* module_data, link::options_t link_options) const +{ + return module::create(*this, module_data, link_options); +} + +inline module_t context_t::create_module(const void* module_data) const +{ + return module::create(*this, module_data); +} + +template +module_t context_t::create_module(ContiguousContainer module_data) const +{ + return module::create(*this, module_data); +} + +inline void context_t::enable_access_to(const context_t& peer) const +{ + context::peer_to_peer::enable_access(*this, peer); +} + +inline void context_t::disable_access_to(const context_t& peer) const +{ + context::peer_to_peer::disable_access(*this, peer); +} + +inline device_t context_t::device() const +{ + return device::detail_::wrap(device_id_); +} + +inline stream_t context_t::create_stream( + bool will_synchronize_with_default_stream, + stream::priority_t priority) +{ + return stream::detail_::create(device_id_, handle_, will_synchronize_with_default_stream, priority); } namespace device { + +namespace primary_context { + +inline bool is_active(const device_t& device) +{ + return detail_::is_active(device.id()); +} + +inline void destroy(const device_t& device) +{ + auto status = cuDevicePrimaryCtxReset(device.id()); + throw_if_error(status, "Failed destroying/resetting the primary context of device " + ::std::to_string(device.id())); +} + +inline primary_context_t get(const device_t& device) +{ + auto pc_handle = detail_::get_handle(device.id(), true); + return detail_::wrap( device.id(), pc_handle, true); +} + + +} // namespace primary_context + +namespace peer_to_peer { + +inline bool can_access(device_t accessor, device_t peer) +{ + return detail_::can_access(accessor.id(), peer.id()); +} + +inline void enable_access(device_t accessor, device_t peer) +{ + return context::peer_to_peer::enable_access(accessor.primary_context(), peer.primary_context()); +} + +inline void disable_access(device_t accessor, device_t peer) +{ +#ifndef NDEBUG + if (accessor == peer) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::disable_access(accessor.primary_context(), peer.primary_context()); +} + +inline bool can_access_each_other(device_t first, device_t second) +{ + return can_access(first, second) and can_access(second, first); +} + +inline void 
enable_bidirectional_access(device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::enable_bidirectional_access(first.primary_context(), second.primary_context()); +} + +inline void disable_bidirectional_access(device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + context::peer_to_peer::disable_bidirectional_access(first.primary_context(), second.primary_context()); +} + +inline attribute_value_t get_attribute(attribute_t attribute, device_t first, device_t second) +{ +#ifndef NDEBUG + if (first == second) { + throw std::invalid_argument("A device cannot be used as its own peer"); + } +#endif + return detail_::get_attribute(attribute, first.id(), second.id()); +} + +} // namespace peer_to_peer + +inline stream_t primary_context_t::default_stream() const noexcept +{ + return stream::detail_::wrap(device_id_, handle_, stream::default_stream_handle); +} + namespace current { -inline scoped_override_t::scoped_override_t(device_t& device) : parent(device.id()) { } + +inline scoped_override_t::scoped_override_t(const device_t& device) : parent(device.id()) { } inline scoped_override_t::scoped_override_t(device_t&& device) : parent(device.id()) { } } // namespace current } // namespace device +inline void synchronize(const device_t& device) +{ + auto pc = device.primary_context(); + context::current::detail_::scoped_override_t set_context_for_this_scope(pc.handle()); + context::current::detail_::synchronize(device.id(), pc.handle()); +} namespace detail_ { @@ -146,10 +323,17 @@ void device_t::launch( kernel_function, launch_configuration, parameters...); } +inline context_t device_t::create_context( + context::host_thread_synch_scheduling_policy_t synch_scheduling_policy, + bool keep_larger_local_mem_after_resize) const +{ + return context::create(*this, synch_scheduling_policy, keep_larger_local_mem_after_resize); +} + inline event_t device_t::create_event( bool uses_blocking_sync, bool records_timing, - bool interprocess) const + bool interprocess) { // The current implementation of event::create is not super-smart, // but it's probably not worth it trying to improve just this function @@ -158,20 +342,27 @@ inline event_t device_t::create_event( // event_t methods -inline device_t event_t::device() const noexcept +inline device_t event_t::device() const +{ + return cuda::device::get(device_id()); +} + +inline context_t event_t::context() const { - return cuda::device::get(device_id_); + constexpr const bool dont_take_ownership { false }; + return context::detail_::wrap(device_id(), context_handle_, dont_take_ownership); } -inline void event_t::record(const stream_t& stream) + + +inline void event_t::record(const stream_t& stream) const { // Note: - // TODO: Perhaps check the device ID here, rather than - // have the Runtime API call fail? + // TODO: Perhaps check the context match here, rather than have the Runtime API call fail? 
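// A short usage sketch for the device-level peer-to-peer functions defined
// above: check whether two devices can access each other's memory and, if so,
// enable access in both directions. Device indices 0 and 1 are arbitrary.
#include <cuda/api.hpp>

void peer_to_peer_sketch()
{
	if (cuda::device::count() < 2) { return; }
	auto first  = cuda::device::get(0);
	auto second = cuda::device::get(1);
	if (cuda::device::peer_to_peer::can_access_each_other(first, second)) {
		cuda::device::peer_to_peer::enable_bidirectional_access(first, second);
	}
}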
event::detail_::enqueue(stream.handle(), handle_); } -inline void event_t::fire(const stream_t& stream) +inline void event_t::fire(const stream_t& stream) const { record(stream); stream.synchronize(); @@ -181,34 +372,42 @@ inline void event_t::fire(const stream_t& stream) inline device_t stream_t::device() const noexcept { - return cuda::device::get(device_id_); + return cuda::device::detail_::wrap(device_id_); } -inline void stream_t::enqueue_t::wait(const event_t& event) +inline context_t stream_t::context() const noexcept +{ + constexpr const bool dont_take_ownership { false }; + return context::detail_::wrap(device_id_, context_handle_, dont_take_ownership); +} + +inline void stream_t::enqueue_t::wait(const event_t& event_) { auto device_id = associated_stream.device_id_; - device::current::detail_::scoped_override_t set_device_for_this_context(device_id); + device::current::detail_::scoped_context_override_t set_device_for_this_context(device_id); // Required by the CUDA runtime API; the flags value is currently unused constexpr const unsigned int flags = 0; - auto status = cudaStreamWaitEvent(associated_stream.handle_, event.handle(), flags); - throw_if_error(status, - ::std::string("Failed scheduling a wait for " + event::detail_::identify(event.handle()) - + " on stream " + stream::detail_::identify(associated_stream.handle_, associated_stream.device_id_))); + auto status = cuStreamWaitEvent(associated_stream.handle_, event_.handle(), flags); + throw_if_error(status, "Failed scheduling a wait for " + event::detail_::identify(event_.handle()) + + " on " + stream::detail_::identify(associated_stream)); } inline event_t& stream_t::enqueue_t::event(event_t& existing_event) { auto device_id = associated_stream.device_id_; - if (existing_event.device_id() != device_id) { - throw ::std::invalid_argument("Attempt to enqueue a CUDA event associated with " - + device::detail_::identify(existing_event.device_id()) + " to be triggered by a stream on " - + device::detail_::identify(device_id)); + auto context_handle = associated_stream.context_handle_; + auto stream_context_handle_ = associated_stream.context_handle_; + if (existing_event.context_handle() != stream_context_handle_) { + throw ::std::invalid_argument("Attempt to enqueue " + + event::detail_::identify(existing_event) + + ", to be triggered by " + stream::detail_::identify(associated_stream)); } - device::current::detail_::scoped_override_t set_device_for_this_context(device_id); - stream::detail_::record_event_on_current_device(device_id, associated_stream.handle_, existing_event.handle()); + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + stream::detail_::record_event_in_current_context(device_id, context_handle, + associated_stream.handle_,existing_event.handle()); return existing_event; } @@ -217,21 +416,58 @@ inline event_t stream_t::enqueue_t::event( bool records_timing, bool interprocess) { - auto device_id = associated_stream.device_id_; - device::current::detail_::scoped_override_t set_device_for_this_scope(device_id); + auto context_handle = associated_stream.context_handle_; + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); - event_t ev { event::detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) }; + event_t ev { event::detail_::create_in_current_context( + associated_stream.device_id_, context_handle, + uses_blocking_sync, records_timing, interprocess) }; // Note that, at this point, the event 
is not associated with this enqueue object's stream. - stream::detail_::record_event_on_current_device(device_id, associated_stream.handle_, ev.handle()); + this->event(ev); return ev; } namespace memory { template -inline device_t pointer_t::device() const noexcept +inline device_t pointer_t::device() const +{ + cuda::device::id_t device_id = get_attribute(); + return cuda::device::get(device_id); +} +template +inline pointer_t pointer_t::other_side_of_region_pair() const +{ + pointer::attribute_t attributes[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_HOST_POINTER, + CU_POINTER_ATTRIBUTE_DEVICE_POINTER + }; + type_t memory_type; + T* host_ptr; + T* device_ptr; + void* value_ptrs[] = { &memory_type, &host_ptr, &device_ptr }; + pointer::detail_::get_attributes(3, attributes, value_ptrs, ptr_); + +#ifndef NDEBUG + assert(host_ptr == ptr_ or device_ptr == ptr_); +#endif + return { ptr_ == host_ptr ? device_ptr : host_ptr }; +} + + +template +inline context_t pointer_t::context() const { - return cuda::device::get(attributes().device); + pointer::attribute_t attributes[] = { + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + CU_POINTER_ATTRIBUTE_CONTEXT + }; + cuda::device::id_t device_id; + context::handle_t context_handle; + void* value_ptrs[] = {&device_id, &context_handle}; + pointer::detail_::get_attributes(2, attributes, value_ptrs, ptr_); + return context::detail_::wrap(device_id, context_handle); } namespace async { @@ -241,16 +477,23 @@ inline void copy(void *destination, const void *source, size_t num_bytes, const detail_::copy(destination, source, num_bytes, stream.handle()); } +// Note: Assumes the source pointer is valid in the stream's context template inline void copy(array_t& destination, const T* source, const stream_t& stream) { - detail_::copy(destination, source, stream.handle()); + detail_::copy(destination, source, stream.handle()); } +// Note: Assumes the destination, source and stream are all usable on the same content template inline void copy(T* destination, const array_t& source, const stream_t& stream) { - detail_::copy(destination, source, stream.handle()); + if (stream.context_handle() != source.context_handle()) { + throw std::invalid_argument("Attempt to copy an array in" + + context::detail_::identify(source.context_handle()) + " via " + + stream::detail_::identify(stream)); + } + detail_::copy(destination, source, stream.handle()); } template @@ -263,21 +506,29 @@ inline void copy_single(T& destination, const T& source, const stream_t& stream) namespace device { -inline region_t allocate(cuda::device_t device, size_t size_in_bytes) +inline region_t allocate(const context_t& context, size_t size_in_bytes) +{ + return detail_::allocate(context.handle(), size_in_bytes); +} + + +inline region_t allocate(const device_t& device, size_t size_in_bytes) { - return detail_::allocate(device.id(), size_in_bytes); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + return detail_::allocate_in_current_context(size_in_bytes); } namespace async { inline region_t allocate(const stream_t& stream, size_t size_in_bytes) { - return detail_::allocate(stream.device().id(), stream.handle(), size_in_bytes); + return detail_::allocate(stream.context().handle(), stream.handle(), size_in_bytes); } -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +template +inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream) { - detail_::set(start, byte_value, 
num_bytes, stream.handle()); + detail_::set(start, value, num_elements, stream.handle()); } inline void zero(void* start, size_t num_bytes, const stream_t& stream) @@ -287,20 +538,28 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream) } // namespace async + /** * @brief Create a variant of ::std::unique_pointer for an array in - * the current device's global memory + * device-global memory. + * + * @note CUDA's runtime API always has a current device; but - + * there is not necessarily a current context; so a primary context + * for a device may be created through this call. * * @tparam T an array type; _not_ the type of individual elements * + * @param context The CUDA device context in which to make the + * allocation. * @param num_elements the number of elements to allocate + * * @return an ::std::unique_ptr pointing to the constructed T array - */ -template -inline unique_ptr make_unique(size_t num_elements) +*/ +template +inline unique_ptr make_unique(const context_t& context, size_t num_elements) { - static_assert(::std::is_array::value, "make_unique(device, num_elements) can only be invoked for T being an array type, T = U[]"); - return cuda::memory::detail_::make_unique(num_elements); + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + return memory::detail_::make_unique(context.handle(), num_elements); } /** @@ -312,30 +571,54 @@ inline unique_ptr make_unique(size_t num_elements) * @param device on which to construct the array of elements * @param num_elements the number of elements to allocate * @return an ::std::unique_ptr pointing to the constructed T array - */template + */ +template inline unique_ptr make_unique(device_t device, size_t num_elements) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); - return make_unique(num_elements); + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return memory::detail_::make_unique(num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory on the current device. + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array + */ +template +inline unique_ptr make_unique(size_t num_elements) +{ + static_assert(::std::is_array::value, "make_unique() can only be invoked for T being an array type, T = U[]"); + auto device = cuda::device::current::get(); + return make_unique(device, num_elements); } /** * @brief Create a variant of ::std::unique_pointer for a single value - * in the current device's global memory + * in device-global memory. 
 * * @tparam T the type of value to construct in device memory * + * @param context the context in which to construct the T element * @return an ::std::unique_ptr pointing to the allocated memory */ template -inline unique_ptr make_unique() +inline unique_ptr make_unique(const context_t& context) { - return cuda::memory::detail_::make_unique(); + return cuda::memory::detail_::make_unique(context.handle()); } /** * @brief Create a variant of ::std::unique_pointer for a single value - * in device-global memory + * in device-global memory. * * @tparam T the type of value to construct in device memory * * @@ -345,12 +628,94 @@ inline unique_ptr make_unique() template inline unique_ptr make_unique(device_t device) { - cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id()); - return make_unique(); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return memory::detail_::make_unique(); +} + +/** + * @brief Create a variant of ::std::unique_pointer for a single value + * in device-global memory, on the current device + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T the type of value to construct in device memory + * + * @return an ::std::unique_ptr pointing to the allocated memory + */ +template +inline unique_ptr make_unique() +{ + auto device = cuda::device::current::get(); + return make_unique(device); } } // namespace device +namespace inter_context { + +inline void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes) +{ + return detail_::copy( + destination_address, destination_context.handle(), + source_address, source_context.handle(), num_bytes); +} + +namespace async { + +inline void copy( + void * destination_address, + context_t destination_context, + const void * source_address, + context_t source_context, + size_t num_bytes, + stream_t stream) +{ + return detail_::copy( + destination_address, destination_context.handle(), source_address, + source_context.handle(), num_bytes, stream.handle()); +} + +inline void copy( + region_t destination, + context_t destination_context, + const_region_t source, + context_t source_context, + stream_t stream) +{ +#ifndef NDEBUG + if (destination.size() < source.size()) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); + } +#endif + copy(destination.start(), destination_context, source, source_context, stream); +} + + +inline void copy( + void * destination, + context_t destination_context, + const_region_t source, + context_t source_context, + const stream_t& stream) +{ + copy(destination, destination_context, source.start(), source_context, source.size(), stream); +} + +} // namespace async + +} // namespace inter_context + + namespace managed { namespace detail_ { @@ -358,49 +723,102 @@ namespace detail_ { template inline device_t base_region_t::preferred_location() const { - auto device_id = detail_::get_scalar_range_attribute(*this, cudaMemRangeAttributePreferredLocation); + auto device_id = detail_::get_scalar_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION); return cuda::device::get(device_id); } template inline void base_region_t::set_preferred_location(device_t& device) 
const { - detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id()); + detail_::set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, device.id()); } template inline void base_region_t::clear_preferred_location() const { - detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation); + detail_::unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION); } } // namespace detail_ +template +inline unique_ptr make_unique( + const context_t& context, + size_t n, + initial_visibility_t initial_visibility) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + return detail_::make_unique_in_current_context(n, initial_visibility); +} + +template +inline unique_ptr make_unique( + const device_t& device, + size_t n, + initial_visibility_t initial_visibility) +{ + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return detail_::make_unique_in_current_context(n, initial_visibility); +} + +template +inline unique_ptr make_unique( + size_t n, + initial_visibility_t initial_visibility) +{ + auto device = cuda::device::current::get(); + return make_unique(device, n, initial_visibility); +} + +template +inline unique_ptr make_unique( + const context_t& context, + initial_visibility_t initial_visibility) +{ + context::current::scoped_override_t set_context_for_this_scope(context); + return detail_::make_unique_in_current_context(initial_visibility); +} + +template +inline unique_ptr make_unique( + device_t device, + initial_visibility_t initial_visibility) +{ + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope(device.id()); + return detail_::make_unique_in_current_context(initial_visibility); +} + +template +inline unique_ptr make_unique( + initial_visibility_t initial_visibility) +{ + auto device = cuda::device::current::get(); + return make_unique(device, initial_visibility); +} + inline void advise_expected_access_by(const_region_t region, device_t& device) { - detail_::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id()); + detail_::advise(region, CU_MEM_ADVISE_SET_ACCESSED_BY, device.id()); } inline void advise_no_access_expected_by(const_region_t region, device_t& device) { - detail_::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id()); + detail_::advise(region, CU_MEM_ADVISE_UNSET_ACCESSED_BY, device.id()); } template ::std::vector accessors(const_region_t region, const Allocator& allocator) { - static_assert(sizeof(cuda::device::id_t) == sizeof(device_t), "Unexpected size difference between device IDs and their wrapper class, device_t"); - auto num_devices = cuda::device::count(); ::std::vector devices(num_devices, allocator); auto device_ids = reinterpret_cast(devices.data()); - - auto status = cudaMemRangeGetAttribute( + auto status = cuMemRangeGetAttribute( device_ids, sizeof(device_t) * devices.size(), - cudaMemRangeAttributeAccessedBy, region.start(), region.size()); - throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + cuda::detail_::ptr_as_hex(region.start())); + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, device::address(region.start()), region.size()); + throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at " + + cuda::detail_::ptr_as_hex(region.start())); auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, 
cudaInvalidDeviceId); // We may have gotten less results that the set of all devices, so let's whittle that down @@ -414,41 +832,43 @@ ::std::vector accessors(const_region_t region, const Alloca namespace async { inline void prefetch( + const_region_t region, + const cuda::device_t& destination, + const stream_t& stream) +{ + detail_::prefetch(region, destination.id(), stream.handle()); +} + +inline void prefetch_to_host( const_region_t region, - cuda::device_t destination, const stream_t& stream) { - detail_::prefetch(region, destination.id(), stream.handle()); + detail_::prefetch(region, CU_DEVICE_CPU, stream.handle()); } } // namespace async - inline region_t allocate( - cuda::device_t device, - size_t size_in_bytes, + const context_t& context, + size_t num_bytes, initial_visibility_t initial_visibility) { - return detail_::allocate(device.id(), size_in_bytes, initial_visibility); + return detail_::allocate(context.handle(), num_bytes, initial_visibility); } -template -inline unique_ptr make_unique( - device_t device, - size_t num_elements, - initial_visibility_t initial_visibility) +inline region_t allocate( + device_t device, + size_t num_bytes, + initial_visibility_t initial_visibility) { - cuda::device::current::detail_::scoped_override_t(device.id()); - return make_unique(num_elements, initial_visibility); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + return detail_::allocate_in_current_context(num_bytes, initial_visibility); } -template -inline unique_ptr make_unique( - device_t device, - initial_visibility_t initial_visibility) +inline region_t allocate(size_t num_bytes) { - cuda::device::current::detail_::scoped_override_t(device.id()); - return make_unique(initial_visibility); + auto context_handle = context::current::detail_::get_with_fallback_push(); + return detail_::allocate(context_handle, num_bytes, initial_visibility_t::to_all_devices); } } // namespace managed @@ -460,7 +880,17 @@ inline region_pair allocate( size_t size_in_bytes, allocation_options options) { - return cuda::memory::mapped::detail_::allocate(device.id(), size_in_bytes, options); + auto pc = device.primary_context(); + return cuda::memory::mapped::detail_::allocate(pc.handle(), size_in_bytes, options); +} + + +inline region_pair allocate( + cuda::context_t& context, + size_t size_in_bytes, + allocation_options options) +{ + return cuda::memory::mapped::detail_::allocate(context.handle(), size_in_bytes, options); } } // namespace mapped @@ -469,312 +899,591 @@ inline region_pair allocate( // kernel_t methods -inline device_t kernel_t::device() const noexcept { return device::get(device_id_); } +inline context_t kernel_t::context() const noexcept +{ + constexpr bool dont_take_ownership { false }; + return context::detail_::from_handle(context_handle_, dont_take_ownership); +} -inline void kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) +inline device_t kernel_t::device() const noexcept { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetAttribute(ptr_, attribute, value); - throw_if_error(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value)); + return device::get(device_id_); } -inline void kernel_t::opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel) +inline void kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const 
{ - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); #if CUDART_VERSION >= 9000 - auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributeMaxDynamicSharedMemorySize, amount_required_by_kernel); + context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_); + auto result = cuFuncSetAttribute(handle_, static_cast(attribute), value); throw_if_error(result, - "Trying to opt-in to " + ::std::to_string(amount_required_by_kernel) + " bytes of dynamic shared memory, " - "exceeding the maximum available on device " + ::std::to_string(device_id_) + " (consider the amount of static shared memory" - "in use by the function)."); + "Setting CUDA device function attribute " + +#ifndef NDEBUG + ::std::string(kernel::detail_::attribute_name(attribute)) + #else + ::std::to_string(static_cast::type>(attribute)) + +#endif + " to value " + ::std::to_string(value) ); throw(cuda::runtime_error {cuda::status::not_yet_implemented}); #endif } -#if defined(__CUDACC__) +/* +namespace kernel { -// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy -// from code compiled with a host-side-only compiler! See cuda_runtime.h for details +namespace occupancy { -namespace detail_ { +inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( + const kernel_t& kernel, + memory::shared::size_t dynamic_shared_memory_size, + grid::block_dimension_t block_size_limit, + bool disable_caching_override) +{ + return detail_::min_grid_params_for_max_occupancy( + kernel.handle(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override); +} template -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const void * ptr, - device::id_t device_id, +grid::complete_dimensions_t +apriori_compiled_kernel_t::min_grid_params_for_max_occupancy( UnaryFunction block_size_to_dynamic_shared_mem_size, grid::block_dimension_t block_size_limit, - bool disable_caching_override) + bool disable_caching_override) const { -#if CUDART_VERSION <= 10000 - throw(cuda::runtime_error {cuda::status::not_yet_implemented}); -#else - int min_grid_size_in_blocks { 0 }; - int block_size { 0 }; - // Note: only initializing the values her because of a - // spurious (?) compiler warning about potential uninitialized use. - auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( - &min_grid_size_in_blocks, &block_size, - ptr, - block_size_to_dynamic_shared_mem_size, - static_cast(block_size_limit), - disable_caching_override ? 
cudaOccupancyDisableCachingOverride : cudaOccupancyDefault - ); - throw_if_error(result, - "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) + - " on device " + ::std::to_string(device_id) + "."); - return { min_grid_size_in_blocks, block_size }; -#endif // CUDART_VERSION <= 10000 + return detail_::min_grid_params_for_max_occupancy( + ptr_, device_id_, block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override); } -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const void * ptr, - device::id_t device_id, - memory::shared::size_t dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) +} // namespace occupancy + +} // namespace kernel +*/ + +namespace kernel { + +template +apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr) { - auto always_need_same_shared_mem_size = - [dynamic_shared_mem_size](::size_t) { return dynamic_shared_mem_size; }; - return min_grid_params_for_max_occupancy( - ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override); + static_assert( + ::std::is_pointer::value + and ::std::is_function::type>::value, + "function_ptr must be a bona fide pointer to a kernel (__global__) function"); + + auto ptr_ = reinterpret_cast(function_ptr); + auto handle = detail_::get_handle(ptr_); + return detail_::wrap(context.device_id(), context.handle(), handle, ptr_); } -} // namespace detail_ +template +apriori_compiled_kernel_t get(device_t device, KernelFunctionPtr function_ptr) +{ + return get(device.primary_context(), function_ptr); +} +} // namespace kernel -inline grid::complete_dimensions_t min_grid_params_for_max_occupancy( - const kernel_t& kernel, - memory::shared::size_t dynamic_shared_memory_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) + +namespace stream { + +namespace detail_ { + +inline device::id_t device_id_of(stream::handle_t stream_handle) { - return detail_::min_grid_params_for_max_occupancy( - kernel.ptr(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override); + return context::detail_::get_device_id(context_handle_of(stream_handle)); } +inline void record_event_in_current_context( + device::id_t current_device_id, + context::handle_t current_context_handle_, + stream::handle_t stream_handle, + event::handle_t event_handle) +{ + auto status = cuEventRecord(event_handle, stream_handle); + throw_if_error(status, "Failed scheduling " + event::detail_::identify(event_handle) + + " on " + stream::detail_::identify(stream_handle, current_context_handle_, current_device_id)); +} -inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( - memory::shared::size_t dynamic_shared_memory_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) const +} // namespace detail_ + +inline stream_t create( + const device_t& device, + bool synchronizes_with_default_stream, + priority_t priority) { - return detail_::min_grid_params_for_max_occupancy( - ptr_, device_id_, dynamic_shared_memory_size, block_size_limit, disable_caching_override); + cuda::device::current::detail_::scoped_context_override_t set_context_for_this_scope{device.id()}; + auto stream_handle = detail_::create_in_current_context(synchronizes_with_default_stream, priority); + return stream::detail_::wrap(device.id(), context::current::detail_::get_handle(), stream_handle); } -template 
-grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy( - UnaryFunction block_size_to_dynamic_shared_mem_size, - grid::block_dimension_t block_size_limit, - bool disable_caching_override) const +inline stream_t create( + const context_t& context, + bool synchronizes_with_default_stream, + priority_t priority) { - return detail_::min_grid_params_for_max_occupancy( - ptr_, device_id_, block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override); + return detail_::create(context.device_id(), context.handle(), synchronizes_with_default_stream, priority); } -#endif // defined __CUDACC__ +} // namespace stream + +namespace detail_ { -inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_percentage) +template +void enqueue_launch_helper::operator()( + apriori_compiled_kernel_t wrapped_kernel, + const stream_t & stream, + launch_configuration_t launch_configuration, + KernelParameters &&... parameters) +{ + using raw_kernel_t = typename kernel::detail_::raw_kernel_typegen::type; + auto unwrapped_kernel_function = reinterpret_cast(const_cast(wrapped_kernel.ptr())); + // Notes: + // 1. The inner cast here is because we store the pointer as const void* - as an extra + // precaution against anybody trying to write through it. Now, function pointers + // can't get written through, but are still for some reason not considered const. + // 2. We rely on the caller providing us with more-or-less the correct parameters - + // corresponding to the compiled kernel function's. I say "more or less" because the + // `KernelParameter` pack may contain some references, arrays and so on - which CUDA + // kernels cannot accept; so we massage those a bit. + + detail_::enqueue_raw_kernel_launch( + unwrapped_kernel_function, + stream.handle(), + launch_configuration, + ::std::forward(parameters)...); +} + +template +std::array +marshal_dynamic_kernel_arguments(KernelParameters&&... parameters) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - if (shared_mem_percentage > 100) { - throw ::std::invalid_argument("Percentage value can't exceed 100"); + return ::std::array { ¶meters... }; +} + +template +struct enqueue_launch_helper { + + void operator()( + const kernel_t& wrapped_kernel, + const stream_t & stream, + launch_configuration_t lc, + KernelParameters &&... parameters) + { + auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward(parameters)...) }; + auto function_handle = wrapped_kernel.handle(); + status_t status; + if (lc.block_cooperation) + status = cuLaunchCooperativeKernel( + function_handle, + lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z, + lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z, + lc.dynamic_shared_memory_size, + stream.handle(), + marshalled_arguments.data() + ); + else { + constexpr const auto no_arguments_in_alternative_format = nullptr; + // TODO: Consider passing arguments in the alternative format + status = cuLaunchKernel( + function_handle, + lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z, + lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z, + lc.dynamic_shared_memory_size, + stream.handle(), + marshalled_arguments.data(), + no_arguments_in_alternative_format + ); + } + throw_if_error(status, + (lc.block_cooperation ? 
"Cooperative " : "") + + ::std::string(" kernel launch failed for ") + kernel::detail_::identify(function_handle) + + " on " + stream::detail_::identify(stream)); } -#if CUDART_VERSION >= 9000 - auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributePreferredSharedMemoryCarveout, shared_mem_percentage); - throw_if_error(result, "Trying to set the carve-out of shared memory/L1 cache memory"); -#else - throw(cuda::runtime_error {cuda::status::not_yet_implemented}); -#endif // CUDART_VERSION <= 9000 + +}; + +template +void enqueue_launch( + ::std::integral_constant, // Got a raw kernel function + RawKernelFunction kernel_function, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) +{ + detail_::enqueue_raw_kernel_launch( + ::std::forward(kernel_function), stream.handle(), launch_configuration, + ::std::forward(parameters)...); } -inline kernel::attributes_t kernel_t::attributes() const +template +void enqueue_launch( + ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + Kernel kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... parameters) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - kernel::attributes_t function_attributes; - auto status = cudaFuncGetAttributes(&function_attributes, ptr_); - throw_if_error(status, "Failed obtaining attributes for a CUDA device function"); - return function_attributes; + enqueue_launch_helper{}( + ::std::forward(kernel), stream, launch_configuration, + ::std::forward(parameters)...); } -inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference) +} // namespace detail_ + +#if CUDA_VERSION >= 10020 +namespace memory { +namespace virtual_ { +namespace physical_allocation { + +inline device_t properties_t::device() const { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference); - throw_if_error(result, - "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " - "CUDA device function"); + return cuda::device::detail_::wrap(raw.location.id); } +template +properties_t create_properties_for(cuda::device_t device) +{ + return detail_::create_properties(device.id()); +} -inline void kernel_t::set_shared_memory_bank_size( - multiprocessor_shared_memory_bank_size_option_t config) +template +inline physical_allocation_t create(size_t size, device_t device) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config); - throw_if_error(result); + auto properties = create_properties_for(device); + return create(size, properties); } -inline grid::dimension_t kernel_t::maximum_active_blocks_per_multiprocessor( - grid::block_dimension_t num_threads_per_block, - memory::shared::size_t dynamic_shared_memory_per_block, - bool disable_caching_override) +} // namespace physical_allocation + +inline void set_access_mode( + region_t fully_mapped_region, + device_t device, + access_mode_t access_mode) { - device::current::detail_::scoped_override_t set_device_for_this_context(device_id_); - int result; - unsigned int flags = disable_caching_override ? 
- cudaOccupancyDisableCachingOverride : cudaOccupancyDefault; - auto status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - &result, ptr_, (int) num_threads_per_block, - dynamic_shared_memory_per_block, flags); - throw_if_error(status, "Failed calculating the maximum occupancy " - "of device function blocks per multiprocessor"); - return result; + CUmemAccessDesc desc { { CU_MEM_LOCATION_TYPE_DEVICE, device.id() }, CUmemAccess_flags(access_mode) }; + constexpr const size_t count { 1 }; + auto result = cuMemSetAccess(fully_mapped_region.device_address(), fully_mapped_region.size(), &desc, count); + throw_if_error(result, "Failed setting the access mode to the virtual memory mapping to the range of size " + + ::std::to_string(fully_mapped_region.size()) + " bytes at " + cuda::detail_::ptr_as_hex(fully_mapped_region.data())); } +inline void set_access_mode(mapping_t mapping, device_t device, access_mode_t access_mode) +{ + set_access_mode(mapping.address_range(), device, access_mode); +} -namespace kernel { +template