Skip to content

Commit

Permalink
Fixes #9: A near-complete revamp of the APIs, now taking the driver A…
Browse files Browse the repository at this point in the history
…PI into account and exposing most of its functionality.
  • Loading branch information
Eyal Rozenberg authored and eyalroz committed Jan 14, 2022
1 parent a07dff9 commit bbfd6ea
Show file tree
Hide file tree
Showing 64 changed files with 8,434 additions and 2,340 deletions.
30 changes: 15 additions & 15 deletions .github/workflows/cmake-build-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,21 @@ jobs:
gcc: 9 # may fail with gcc-10 due to an internal compiler error
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.2"
gcc: 8
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.1"
gcc: 8
shell: "bash"
cmake-generator: "Unix Makefiles"
- os: ubuntu-18.04
cuda: "10.0"
gcc: 7 # fails with GCC 8 - not supported in CUDA 10.0
shell: "bash"
cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.2"
# gcc: 8
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.1"
# gcc: 8
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# - os: ubuntu-18.04
# cuda: "10.0"
# gcc: 7 # fails with GCC 8 - not supported in CUDA 10.0
# shell: "bash"
# cmake-generator: "Unix Makefiles"
# GitHub has removed its ubuntu-16.04 runners,
# so we're not testing builds with older CUDA versions
# - os: ubuntu-16.04
Expand Down
24 changes: 12 additions & 12 deletions .github/workflows/cmake-build-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,18 @@ jobs:
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
- os: windows-2019
cuda: "10.2.89"
visual-studio: "Visual Studio 16 2019"
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
- os: windows-2019
cuda: "10.1.243"
visual-studio: "Visual Studio 16 2019"
shell: "powershell"
os-type: "windows"
cmake-platform-flag: "-A x64"
# - os: windows-2019
# cuda: "10.2.89"
# visual-studio: "Visual Studio 16 2019"
# shell: "powershell"
# os-type: "windows"
# cmake-platform-flag: "-A x64"
# - os: windows-2019
# cuda: "10.1.243"
# visual-studio: "Visual Studio 16 2019"
# shell: "powershell"
# os-type: "windows"
# cmake-platform-flag: "-A x64"

# Windows2016 & VS 2017 supports 10.0+
# - os: windows-2016
Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ if(WIN32 AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()

PROJECT(cuda-api-wrappers
VERSION 0.4.4
VERSION 0.5.0
DESCRIPTION "Thin C++-flavored wrappers for the CUDA Runtime API"
HOMEPAGE_URL https://github.com/eyalroz/cuda-api-wrappers
LANGUAGES CUDA CXX)

include(GNUInstallDirs)

find_package(CUDAToolkit REQUIRED)
find_package(CUDAToolkit 11.0 REQUIRED)
find_package(Threads REQUIRED)
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)

Expand All @@ -35,9 +35,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "lib/")
# Our library targets
# -------------------

add_library(runtime-api INTERFACE) # A header-only library!
add_library(runtime-and-driver INTERFACE) # A header-only library!
add_library(nvtx)
set(wrapper-libraries runtime-api nvtx)
set(wrapper-libraries runtime-and-driver nvtx)

foreach(WRAPPER_LIB ${wrapper-libraries})
target_compile_features(${WRAPPER_LIB} INTERFACE cxx_std_11) # This means _at least_ C++11
Expand All @@ -47,11 +47,11 @@ foreach(WRAPPER_LIB ${wrapper-libraries})
"$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart) # CUDA::cuda_driver)
target_link_libraries(${WRAPPER_LIB} INTERFACE CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver)
endforeach()

set_target_properties(nvtx PROPERTIES OUTPUT_NAME "cuda-nvtx-wrappers")
target_link_libraries(nvtx PUBLIC runtime-api)
target_link_libraries(nvtx PUBLIC runtime-and-driver)
target_link_libraries(nvtx PRIVATE Threads::Threads CUDA::nvToolsExt)
set_property(TARGET nvtx PROPERTY CXX_STANDARD 11)
set_property(TARGET nvtx PROPERTY CXX_STANDARD_REQUIRED ON)
Expand Down
5 changes: 4 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_EXTENSIONS OFF)

link_libraries(runtime-api)
link_libraries(runtime-and-driver)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin")
add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu)
add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu)
add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu)
add_executable(simpleDrvRuntimePTX modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp)
add_executable(inlinePTX modified_cuda_samples/inlinePTX/inlinePTX.cu)
add_executable(simpleStreams modified_cuda_samples/simpleStreams/simpleStreams.cu)
add_executable(simpleIPC modified_cuda_samples/simpleIPC/simpleIPC.cu)
Expand All @@ -50,12 +51,14 @@ add_dependencies(modified_cuda_samples vectorAdd inlinePTX simpleStreams simpleI
add_executable(version_management by_runtime_api_module/version_management.cpp)
add_executable(error_handling by_runtime_api_module/error_handling.cu)
add_executable(device_management by_runtime_api_module/device_management.cpp)
add_executable(context_management by_driver_api_module/context_management.cpp)
add_executable(execution_control by_runtime_api_module/execution_control.cu)

add_executable(stream_management by_runtime_api_module/stream_management.cu)
add_executable(event_management by_runtime_api_module/event_management.cu)
add_executable(unified_addressing by_runtime_api_module/unified_addressing.cpp)
add_executable(io_compute_overlap_with_streams other/io_compute_overlap_with_streams.cu)
add_executable(manipulate_current_device other/manipulate_current_device.cu)
add_executable(inclusion_in_two_translation_units other/inclusion_in_two_translation_units/main.cpp other/inclusion_in_two_translation_units/second_tu.cpp )

if(NOT "${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
Expand Down
191 changes: 191 additions & 0 deletions examples/by_driver_api_module/context_management.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/**
* An example program utilizing most/all calls from the CUDA
* Driver API module:
*
* Device Management
*/
#include "../common.hpp"

void current_context_manipulation(const cuda::device_t &device, const cuda::device::primary_context_t &pc,
const cuda::context_t &created_context);

/**
 * Exercises most of the per-context attribute and property calls exposed by
 * the context wrapper class, on a single context.
 *
 * Verifies that the context agrees with the caller about which device it is
 * associated with and whether it is that device's primary context, then
 * round-trips several settable properties (cache preference, shared memory
 * bank size, the printf FIFO resource limit) and prints the scheduling-policy
 * flags.
 *
 * @param context    the context to test (primary or non-primary)
 * @param is_primary whether @p context is expected to be its device's primary context
 * @param device_id  the ID of the device @p context should be associated with
 */
void test_context(
	const cuda::context_t& context,
	bool is_primary,
	cuda::device::id_t device_id)
{
	std::cout << "Testing " << (is_primary ? "" : "non-") << "primary context " << context << '\n';
	if (context.device_id() != device_id) {
		die_("The device's primary context's reported ID and the device wrapper's ID differ: "
			+ std::to_string(context.device_id()) + " !=" + std::to_string(device_id));
	}

	if (context.device().id() != device_id) {
		die_("The context's associated device's ID is not the same as that of the device for which we obtained the context: "
			+ std::to_string(context.device().id()) + " !=" + std::to_string(device_id) );
	}

	if (context.is_primary() != is_primary) {
		die_(std::string("The ") + (is_primary ? "" : "non-") + "primary context " + std::to_string(context)
			+ " \"believes\" it is " + (is_primary ? "not " : "") + "primary.");
	}

	// Specific attributes and properties with their own API calls:
	// L1/shared mem (CacheConfig), shared memory bank size (SharedMemConfig)
	// and stream priority range
	// ----------------------------------------------------------------

	auto cache_preference = context.cache_preference();
	std::cout << "The cache preference for context " << context << " is: " << cache_preference << ".\n";

	// Flip the preference, set it, and confirm the setting round-trips.
	auto new_cache_preference =
		cache_preference == cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory ?
			cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1 :
			cuda::multiprocessor_cache_preference_t::prefer_l1_over_shared_memory;
	context.set_cache_preference(new_cache_preference);
	cache_preference = context.cache_preference();
	assert_(cache_preference == new_cache_preference);
	std::cout << "The cache preference for context " << context << " has now been set to: " << new_cache_preference << ".\n";

	// Toggle the shared memory bank size between the two non-default settings.
	auto shared_mem_bank_size = context.shared_memory_bank_size();
	shared_mem_bank_size =
		(shared_mem_bank_size == CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) ?
			CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE;
	context.set_shared_memory_bank_size(shared_mem_bank_size);
	auto stream_priority_range = context.stream_priority_range();
	if (stream_priority_range.is_trivial()) {
		std::cout << "Context " << context << " does not support stream priorities. "
			"All streams will have the same (default) priority.\n";
	}
	else {
		// Note: Numerically-lower priority values denote higher priority.
		std::cout << "Streams in context " << context << " have priorities between "
			<< stream_priority_range.least << " (highest numeric value, least prioritized) and "
			<< stream_priority_range.greatest << " (lowest numeric values, most prioritized).\n";
		assert_(stream_priority_range.least > stream_priority_range.greatest);
	}

	// Resource limits
	// --------------------

	auto printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
	std::cout << "The printf FIFO size for context " << context << " is " << printf_fifo_size << ".\n";
	// Pick a new value distinct from the current one, then confirm it round-trips.
	decltype(printf_fifo_size) new_printf_fifo_size =
		(printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512;
	context.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size);
	printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
	assert_(printf_fifo_size == new_printf_fifo_size);

	// Flags - yes, yet another kind of attribute/property
	// ----------------------------------------------------

	std::cout << "Context " << context << " uses a"
		<< (context.synch_scheduling_policy() ? " synchronous" : "n asynchronous")
		<< " scheduling policy.\n";
	std::cout << "Context " << context << " is set to "
		<< (context.keeping_larger_local_mem_after_resize() ? "keep" : "discard")
		<< " shared memory allocation after launch.\n";
	// TODO: Change the settings as well obtaining them

}

/**
 * Exercises setting, pushing, popping and scoped-overriding of the
 * "current context" for the calling thread.
 *
 * The statement order here is significant: the assertions at the end
 * depend on exactly which contexts have been pushed onto, and popped
 * off, the thread's current-context stack.
 *
 * @param device          the device on which additional contexts are created
 * @param pc              the device's primary context
 * @param created_context a non-primary context previously created on @p device
 */
void current_context_manipulation(
	cuda::device_t &device,
	cuda::device::primary_context_t &pc,
	cuda::context_t &created_context)
{
	// A primary-context wrapper converts to a plain context wrapper
	cuda::context_t context_0 = pc;
	cuda::context_t context_1 = created_context;
	// set() followed by get() should observe the same context, both through
	// the wrapper comparison and through the raw handle
	cuda::context::current::set(context_0);
	assert_(cuda::context::current::get() == context_0);
	assert_(cuda::context::current::detail_::get_handle() == context_0.handle());
	cuda::context::current::set(context_1);
	assert_(cuda::context::current::get() == context_1);
	assert_(cuda::context::current::detail_::get_handle() == context_1.handle());


	// A scoped override should make its context current only within its block...
	auto context_2 = cuda::context::create(device);
	{
		cuda::context::current::scoped_override_t context_for_this_block { context_2 };
		assert_(context_2.handle() == cuda::context::current::get().handle());
		assert_(context_2 == cuda::context::current::get());
	}
	// ...and on exiting the block, the previously-current context (context_1) is restored
	auto gotten = cuda::context::current::get();
	assert_(gotten == context_1);

	// create_and_push leaves the new context current (on top of the stack)
	auto context_3 = cuda::context::create_and_push(device);

//	std::cout << "Contexts:\n";
//	std::cout << "context_0: " << context_0 << '\n';
//	std::cout << "context_1: " << context_1 << '\n';
//	std::cout << "context_2: " << context_2 << '\n';
//	std::cout << "context_3: " << context_3 << '\n';

	// A scoped override of the context which is already current should be a no-op
	{
		cuda::context::current::scoped_override_t context_for_this_block { context_3 };
		assert_(context_3.handle() == cuda::context::current::get().handle());
		assert_(context_3 == cuda::context::current::get());
	}

	// Popping should yield the context pushed by create_and_push...
	{
		auto popped = cuda::context::current::pop();
		assert_(popped == context_3);
	}
	// ...uncovering context_1, which was current before the push
	gotten = cuda::context::current::get();
	assert_(gotten == context_1);
}


// Entry point: picks a CUDA device (optionally by a command-line device ID
// argument), then exercises primary-context access, context creation, and
// current-context stack manipulation on it. Dies with a diagnostic on any
// inconsistency.
int main(int argc, char **argv)
{
	if (cuda::device::count() == 0) {
		die_("No CUDA devices on this system");
	}

	// Being very cavalier about our command-line arguments here...
	// (std::stoi will throw on a non-numeric argument)
	cuda::device::id_t device_id = (argc > 1) ?
		std::stoi(argv[1]) : cuda::device::default_device_id;

	if (cuda::device::count() <= device_id) {
		die_("No CUDA device with ID " + std::to_string(device_id));
	}

	auto device = cuda::device::get(device_id);

	std::cout << "Using CUDA device " << device.name() << " (having device ID " << device.id() << ")\n";

//	report_context_stack("Before anything is done");
	auto pc = device.primary_context();
//	report_context_stack("After getting the primary context");


	// Make the primary context current, test it, then verify that popping
	// returns exactly the context we pushed
	cuda::context::current::push(pc);
	constexpr const bool is_primary = true;
	constexpr const bool isnt_primary = false;
	test_context(pc, is_primary, device_id);

	{
		auto popped = cuda::context::current::pop();
		if (popped != pc) {
			die_("After pushing context " + std::to_string(pc) + " and popping it - the pop result is a different context, " + std::to_string(popped));
		}
	}

	// Repeat the tests with a freshly-created (non-primary) context
	auto created_context = cuda::context::create(device);
	test_context(created_context, isnt_primary, device_id);
	current_context_manipulation(device, pc, created_context);

	std::cout << std::endl;
//	report_context_stack("After current_context_manipulation");
	cuda::context::current::push(created_context);
	cuda::context::current::push(created_context);
	// We should have 3 copies of created_context on the stack at this point, and nothing else
	// Deliberately calling the Runtime API directly, to observe how it
	// interacts with the driver's context stack
	cudaSetDevice(device_id);
//	report_context_stack("After cudaSetDevice " + std::to_string(device_id));
	// We should have the primary context of the device


	device.synchronize();
	device.reset();

	std::cout << "\nSUCCESS\n";
}
Loading

0 comments on commit bbfd6ea

Please sign in to comment.