Skip to content

Commit

Permalink
Fixes #9: A near-complete revamp of the APIs, now taking the driver A…
Browse files Browse the repository at this point in the history
…PI into account and exposing most of its functionality.
  • Loading branch information
Eyal Rozenberg authored and eyalroz committed Jan 14, 2022
1 parent a07dff9 commit bbfd6ea
Show file tree
Hide file tree
Showing 64 changed files with 8,434 additions and 2,340 deletions.
30 changes: 15 additions & 15 deletions .github/workflows/cmake-build-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,21 @@ jobs:
gcc: 9 # may fail with gcc-10 due to an internal compiler error
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.2"
gcc: 8
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.1"
gcc: 8
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.0"
gcc: 7 # fails with GCC 8 - not supported in CUDA 10.0
shell: "bash"
cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.2"
# gcc: 8
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.1"
# gcc: 8
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.0"
# gcc: 7 # fails with GCC 8 - not supported in CUDA 10.0
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# GitHub has removed its ubuntu-16.04 runners,
# so we're not testing builds with older CUDA versions
# - os: ubuntu-16.04
Expand Down
24 changes: 12 additions & 12 deletions .github/workflows/cmake-build-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,18 @@ jobs:
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
- os: windows-2019
cuda: "10.2.89"
visual-studio: "Visual Studio 16 2019"
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
- os: windows-2019
cuda: "10.1.243"
visual-studio: "Visual Studio 16 2019"
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
# - os: windows-2019
# cuda: "10.2.89"
# visual-studio: "Visual Studio 16 2019"
# shell: "powershell"
# os-type: "windows"
# cmake-platform-flag: "-A x64"
# - os: windows-2019
# cuda: "10.1.243"
# visual-studio: "Visual Studio 16 2019"
# shell: "powershell"
# os-type: "windows"
# cmake-platform-flag: "-A x64"

# Windows2016 & VS 2017 supports 10.0+
# - os: windows-2016
Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ if(WIN32 AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()

PROJECT(cuda-api-wrappers
VERSION 0.4.4
VERSION 0.5.0
DESCRIPTION "Thin C++-flavored wrappers for the CUDA Runtime API"
HOMEPAGE_URL https://github.com/eyalroz/cuda-api-wrappers
LANGUAGES CUDA CXX)

include(GNUInstallDirs)

find_package(CUDAToolkit REQUIRED)
find_package(CUDAToolkit 11.0 REQUIRED)
find_package(Threads REQUIRED)
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)

Expand All @@ -35,9 +35,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "lib/")
# Our library targets
# -------------------

add_library(runtime-api INTERFACE) # A header-only library!
add_library(runtime-and-driver INTERFACE) # A header-only library!
add_library(nvtx)
set(wrapper-libraries runtime-api nvtx)
set(wrapper-libraries runtime-and-driver nvtx)

foreach(WRAPPER_LIB ${wrapper-libraries})
target_compile_features(${WRAPPER_LIB} INTERFACE cxx_std_11) # This means _at least_ C++11
Expand All @@ -47,11 +47,11 @@ foreach(WRAPPER_LIB ${wrapper-libraries})
"$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart) # CUDA::cuda_driver)
target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver)
endforeach()

set_target_properties(nvtx PROPERTIES OUTPUT_NAME "cuda-nvtx-wrappers")
target_link_libraries(nvtx PUBLIC runtime-api)
target_link_libraries(nvtx PUBLIC runtime-and-driver)
target_link_libraries(nvtx PRIVATE Threads::Threads CUDA::nvToolsExt)
set_property(TARGET nvtx PROPERTY CXX_STANDARD 11)
set_property(TARGET nvtx PROPERTY CXX_STANDARD_REQUIRED ON)
Expand Down
5 changes: 4 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_EXTENSIONS OFF)

link_libraries(runtime-api)
link_libraries(runtime-and-driver)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin")
add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu)
add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu)
add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu)
add_executable(simpleDrvRuntimePTX modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp)
add_executable(inlinePTX modified_cuda_samples/inlinePTX/inlinePTX.cu)
add_executable(simpleStreams modified_cuda_samples/simpleStreams/simpleStreams.cu)
add_executable(simpleIPC modified_cuda_samples/simpleIPC/simpleIPC.cu)
Expand All @@ -50,12 +51,14 @@ add_dependencies(modified_cuda_samples vectorAdd inlinePTX simpleStreams simpleI
add_executable(version_management by_runtime_api_module/version_management.cpp)
add_executable(error_handling by_runtime_api_module/error_handling.cu)
add_executable(device_management by_runtime_api_module/device_management.cpp)
add_executable(context_management by_driver_api_module/context_management.cpp)
add_executable(execution_control by_runtime_api_module/execution_control.cu)

add_executable(stream_management by_runtime_api_module/stream_management.cu)
add_executable(event_management by_runtime_api_module/event_management.cu)
add_executable(unified_addressing by_runtime_api_module/unified_addressing.cpp)
add_executable(io_compute_overlap_with_streams other/io_compute_overlap_with_streams.cu)
add_executable(manipulate_current_device other/manipulate_current_device.cu)
add_executable(inclusion_in_two_translation_units other/inclusion_in_two_translation_units/main.cpp other/inclusion_in_two_translation_units/second_tu.cpp )

if(NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
Expand Down
191 changes: 191 additions & 0 deletions examples/by_driver_api_module/context_management.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/**
* An example program utilizing most/all calls from the CUDA
* Driver API module:
*
* Device Management
*/
#include "../common.hpp"

void current_context_manipulation(const cuda::device_t &device, const cuda::device::primary_context_t &pc,
const cuda::context_t &created_context);

/**
 * Exercises most of the per-context attribute and property calls exposed by
 * the context wrapper class, on a single context.
 *
 * Verifies that the context agrees with the caller about which device it is
 * associated with and whether it is that device's primary context, then
 * round-trips several settable properties (cache preference, shared memory
 * bank size, the printf FIFO resource limit) and prints the scheduling-policy
 * flags.
 *
 * @param context    the context to test (primary or non-primary)
 * @param is_primary whether @p context is expected to be its device's primary context
 * @param device_id  the ID of the device @p context should be associated with
 */
void test_context(
	const cuda::context_t& context,
	bool is_primary,
	cuda::device::id_t device_id)
{
	std::cout << "Testing " << (is_primary ? "" : "non-") << "primary context " << context << '\n';
	if (context.device_id() != device_id) {
		die_("The device's primary context's reported ID and the device wrapper's ID differ: "
			+ std::to_string(context.device_id()) + " !=" + std::to_string(device_id));
	}

	if (context.device().id() != device_id) {
		die_("The context's associated device's ID is not the same as that of the device for which we obtained the context: "
			+ std::to_string(context.device().id()) + " !=" + std::to_string(device_id) );
	}

	if (context.is_primary() != is_primary) {
		die_(std::string("The ") + (is_primary ? "" : "non-") + "primary context " + std::to_string(context)
			+ " \"believes\" it is " + (is_primary ? "not " : "") + "primary.");
	}

	// Specific attributes and properties with their own API calls:
	// L1/shared mem (CacheConfig), shared memory bank size (SharedMemConfig)
	// and stream priority range
	// ----------------------------------------------------------------

	auto cache_preference = context.cache_preference();
	std::cout << "The cache preference for context " << context << " is: " << cache_preference << ".\n";

	// Flip the preference, set it, and confirm the setting round-trips.
	auto new_cache_preference =
		cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ?
			cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 :
			cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory;
	context.set_cache_preference(new_cache_preference);
	cache_preference = context.cache_preference();
	assert_(cache_preference == new_cache_preference);
	std::cout << "The cache preference for context " << context << " has now been set to: " << new_cache_preference << ".\n";

	// Toggle the shared memory bank size between the two non-default settings.
	auto shared_mem_bank_size = context.shared_memory_bank_size();
	shared_mem_bank_size =
		(shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ?
			CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
	context.set_shared_memory_bank_size(shared_mem_bank_size);
	auto stream_priority_range = context.stream_priority_range();
	if (stream_priority_range.is_trivial()) {
		std::cout << "Context " << context << " does not support stream priorities. "
			"All streams will have the same (default) priority.\n";
	}
	else {
		// Note: Numerically-lower priority values denote higher priority.
		std::cout << "Streams in context " << context << " have priorities between "
			<< stream_priority_range.least << " (highest numeric value, least prioritized) and "
			<< stream_priority_range.greatest << " (lowest numeric values, most prioritized).\n";
		assert_(stream_priority_range.least > stream_priority_range.greatest);
	}

	// Resource limits
	// --------------------

	auto printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
	std::cout << "The printf FIFO size for context " << context << " is " << printf_fifo_size << ".\n";
	// Pick a new value distinct from the current one, then confirm it round-trips.
	decltype(printf_fifo_size) new_printf_fifo_size =
		(printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512;
	context.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size);
	printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
	assert_(printf_fifo_size == new_printf_fifo_size);

	// Flags - yes, yet another kind of attribute/property
	// ----------------------------------------------------

	std::cout << "Context " << context << " uses a"
		<< (context.synch_scheduling_policy() ? " synchronous" : "n asynchronous")
		<< " scheduling policy.\n";
	std::cout << "Context " << context << " is set to "
		<< (context.keeping_larger_local_mem_after_resize() ? "keep" : "discard")
		<< " shared memory allocation after launch.\n";
	// TODO: Change the settings as well obtaining them

}

/**
 * Exercises setting, pushing, popping and scoped-overriding of the
 * "current context" for the calling thread.
 *
 * The statement order here is significant: the assertions at the end
 * depend on exactly which contexts have been pushed onto, and popped
 * off, the thread's current-context stack.
 *
 * @param device          the device on which additional contexts are created
 * @param pc              the device's primary context
 * @param created_context a non-primary context previously created on @p device
 */
void current_context_manipulation(
	cuda::device_t &device,
	cuda::device::primary_context_t &pc,
	cuda::context_t &created_context)
{
	// A primary-context wrapper converts to a plain context wrapper
	cuda::context_t context_0 = pc;
	cuda::context_t context_1 = created_context;
	// set() followed by get() should observe the same context, both through
	// the wrapper comparison and through the raw handle
	cuda::context::current::set(context_0);
	assert_(cuda::context::current::get() == context_0);
	assert_(cuda::context::current::detail_::get_handle() == context_0.handle());
	cuda::context::current::set(context_1);
	assert_(cuda::context::current::get() == context_1);
	assert_(cuda::context::current::detail_::get_handle() == context_1.handle());


	// A scoped override should make its context current only within its block...
	auto context_2 = cuda::context::create(device);
	{
		cuda::context::current::scoped_override_t context_for_this_block { context_2 };
		assert_(context_2.handle() == cuda::context::current::get().handle());
		assert_(context_2 == cuda::context::current::get());
	}
	// ...and on exiting the block, the previously-current context (context_1) is restored
	auto gotten = cuda::context::current::get();
	assert_(gotten == context_1);

	// create_and_push leaves the new context current (on top of the stack)
	auto context_3 = cuda::context::create_and_push(device);

//	std::cout << "Contexts:\n";
//	std::cout << "context_0: " << context_0 << '\n';
//	std::cout << "context_1: " << context_1 << '\n';
//	std::cout << "context_2: " << context_2 << '\n';
//	std::cout << "context_3: " << context_3 << '\n';

	// A scoped override of the context which is already current should be a no-op
	{
		cuda::context::current::scoped_override_t context_for_this_block { context_3 };
		assert_(context_3.handle() == cuda::context::current::get().handle());
		assert_(context_3 == cuda::context::current::get());
	}

	// Popping should yield the context pushed by create_and_push...
	{
		auto popped = cuda::context::current::pop();
		assert_(popped == context_3);
	}
	// ...uncovering context_1, which was current before the push
	gotten = cuda::context::current::get();
	assert_(gotten == context_1);
}


// Entry point: picks a CUDA device (optionally by a command-line device ID
// argument), then exercises primary-context access, context creation, and
// current-context stack manipulation on it. Dies with a diagnostic on any
// inconsistency.
int main(int argc, char **argv)
{
	if (cuda::device::count() == 0) {
		die_("No CUDA devices on this system");
	}

	// Being very cavalier about our command-line arguments here...
	// (std::stoi will throw on a non-numeric argument)
	cuda::device::id_t device_id = (argc > 1) ?
		std::stoi(argv[1]) : cuda::device::default_device_id;

	if (cuda::device::count() <= device_id) {
		die_("No CUDA device with ID " + std::to_string(device_id));
	}

	auto device = cuda::device::get(device_id);

	std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n";

//	report_context_stack("Before anything is done");
	auto pc = device.primary_context();
//	report_context_stack("After getting the primary context");


	// Make the primary context current, test it, then verify that popping
	// returns exactly the context we pushed
	cuda::context::current::push(pc);
	constexpr const bool is_primary = true;
	constexpr const bool isnt_primary = false;
	test_context(pc, is_primary, device_id);

	{
		auto popped = cuda::context::current::pop();
		if (popped != pc) {
			die_("After pushing context " + std::to_string(pc) + " and popping it - the pop result is a different context, " + std::to_string(popped));
		}
	}

	// Repeat the tests with a freshly-created (non-primary) context
	auto created_context = cuda::context::create(device);
	test_context(created_context, isnt_primary, device_id);
	current_context_manipulation(device, pc, created_context);

	std::cout << std::endl;
//	report_context_stack("After current_context_manipulation");
	cuda::context::current::push(created_context);
	cuda::context::current::push(created_context);
	// We should have 3 copies of created_context on the stack at this point, and nothing else
	// Deliberately calling the Runtime API directly, to observe how it
	// interacts with the driver's context stack
	cudaSetDevice(device_id);
//	report_context_stack("After cudaSetDevice " + std::to_string(device_id));
	// We should have the primary context of the device


	device.synchronize();
	device.reset();

	std::cout << "\nSUCCESS\n";
}
Loading

0 comments on commit bbfd6ea

Please sign in to comment.