Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Revamp samples and benchmarks #31

Merged
merged 29 commits into from
Oct 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cc3e866
Move and rename benchmark to it separate benchmark directory.
jrhemstad Sep 30, 2020
25d155c
Use CUDA as project language.
jrhemstad Sep 30, 2020
e8cc0bb
Revampe benchmark cmake script.
jrhemstad Sep 30, 2020
3c32ceb
typo
jrhemstad Sep 30, 2020
60e7fb3
tense.
jrhemstad Sep 30, 2020
325513e
Changed path to text files.
jrhemstad Oct 1, 2020
50ad605
Add error checking to opening files.
jrhemstad Oct 1, 2020
afd58a7
Add initial cmake for building trie_st.
jrhemstad Oct 1, 2020
cd8ef16
Update txt file paths and add error checking to trie_mt.
jrhemstad Oct 1, 2020
e642ce1
Add cmake config for trie_mt.
jrhemstad Oct 1, 2020
e504db4
Update paths and error checking for trie.cu.
jrhemstad Oct 1, 2020
acf3db3
Only download books if they don't already exists.
jrhemstad Oct 1, 2020
f2e0eaa
Add config for trie.
jrhemstad Oct 1, 2020
9ca2303
Add cmake config for trie.cu.
jrhemstad Oct 1, 2020
32ae0d7
Delete old scripts.
jrhemstad Oct 1, 2020
bbde1ee
Update include paths.
jrhemstad Oct 1, 2020
39a0645
Remove hardcoded paths in nvrtc example.
jrhemstad Oct 1, 2020
f2ecd55
Add cmake config for nvrtc example.
jrhemstad Oct 1, 2020
e6688a4
Remove unused cmake stuff.
jrhemstad Oct 1, 2020
218a033
Doc.
jrhemstad Oct 1, 2020
ec2a4b6
Format.
jrhemstad Oct 1, 2020
20a702c
Replace simt namespace with cuda.
jrhemstad Oct 2, 2020
dff74e8
Point nvrtc include path to relative location of libcu++.
jrhemstad Oct 2, 2020
e291950
Reorganize cmake file.
jrhemstad Oct 2, 2020
e53370b
Add concurrent hash table example.
brycelelbach Oct 16, 2020
69e3dc3
Merge remote-tracking branch 'origin/main' into reorganize_samples
jrhemstad Oct 20, 2020
ba3538e
rename samples -> examples.
jrhemstad Oct 20, 2020
818d2b1
Add hash map to cmake file.
jrhemstad Oct 20, 2020
1b716a1
Docs: Update link in README.md to point to the new examples directory.
brycelelbach Oct 29, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-benchmarks LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

set(GPU_ARCHS "70;72;75;80" CACHE STRING "List of GPU architectures to compile for.")

message(STATUS "Enabled CUDA architectures:${GPU_ARCHS}")

find_package(Threads REQUIRED)
find_package(OpenMP)

function(ConfigureHostBench BENCH_NAME BENCH_SRC)
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
add_executable("${BENCH_NAME}" "${BENCH_SRC}")
target_link_libraries("${BENCH_NAME}" PRIVATE Threads::Threads)

# TODO: Link against libcudaxx interface target instead
target_include_directories("${BENCH_NAME}" PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../include")
if(OpenMP_CXX_FOUND)
target_link_libraries("${BENCH_NAME}" PRIVATE OpenMP::OpenMP_CXX)
endif()
endfunction(ConfigureHostBench)

function(ConfigureDeviceBench BENCH_NAME BENCH_SRC)
add_executable("${BENCH_NAME}" "${BENCH_SRC}")
set_property(TARGET "${BENCH_NAME}" PROPERTY CUDA_ARCHITECTURES "${GPU_ARCHS}")
# TODO: Link against libcudaxx interface target instead
target_include_directories("${BENCH_NAME}" PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../include")
target_compile_options("${BENCH_NAME}" PRIVATE --expt-extended-lambda --expt-relaxed-constexpr)
endfunction(ConfigureDeviceBench)

ConfigureHostBench(concurrency_host concurrency.cpp)

ConfigureDeviceBench(concurrency_device concurrency.cu)

File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion docs/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# libcu++: The C++ Standard Library for Your Entire System

<table><tr>
<th><b><a href="https://github.com/nvidia/libcudacxx/tree/main/samples">Examples</a></b></th>
<th><b><a href="https://github.com/nvidia/libcudacxx/tree/main/examples">Examples</a></b></th>
<th><b><a href="https://godbolt.org/z/M11W7j">Godbolt</a></b></th>
<th><b><a href="https://nvidia.github.io/libcudacxx">Documentation</a></b></th>
</tr></table>
Expand Down
53 changes: 53 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-examples LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED)
find_package(OpenMP)

# Download input files for the trie examples.
if(NOT (EXISTS books))
execute_process(COMMAND mkdir books)
file(DOWNLOAD https://www.gutenberg.org/files/2600/2600-0.txt books/2600-0.txt SHOW_PROGRESS)
file(DOWNLOAD http://www.gutenberg.org/cache/epub/996/pg996.txt books/pg996.txt SHOW_PROGRESS)
file(DOWNLOAD http://www.gutenberg.org/cache/epub/55/pg55.txt books/pg55.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.gutenberg.org/files/8800/8800.txt books/8800.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.gutenberg.org/files/84/84-0.txt books/84-0.txt SHOW_PROGRESS)
file(DOWNLOAD http://www.gutenberg.org/cache/epub/6130/pg6130.txt books/pg6130.txt SHOW_PROGRESS)
file(DOWNLOAD http://www.gutenberg.org/cache/epub/1727/pg1727.txt books/pg1727.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.gutenberg.org/files/2701/2701-0.txt books/2701-0.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.gutenberg.org/files/35/35-0.txt books/35-0.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.gutenberg.org/files/1342/1342-0.txt books/1342-0.txt SHOW_PROGRESS)
endif()

add_executable(trie_st trie_st.cpp)
target_compile_features(trie_st PRIVATE cxx_std_11)

add_executable(trie_mt trie_mt.cpp)
target_compile_features(trie_mt PRIVATE cxx_std_11)
target_link_libraries(trie_mt Threads::Threads)

if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1)
add_executable(trie_cuda trie.cu)
target_compile_features(trie_cuda PRIVATE cxx_std_11 cuda_std_11)
target_compile_options(trie_cuda PRIVATE --expt-relaxed-constexpr)
set_property(TARGET trie_cuda PROPERTY CUDA_ARCHITECTURES 70)
else()
message(STATUS "Insufficient CUDA version. Skipping trie.cu example.")
endif()

if(CUDAToolkit_VERSION VERSION_GREATER 10.2)
add_executable(rtc rtc_example.cpp)
target_link_libraries(rtc CUDA::nvrtc)
target_compile_features(rtc PRIVATE cxx_std_11)
else()
message(STATUS "Insufficient CUDA version. Skipping rtc_example.cpp example.")
endif()

add_executable(hash_map concurrent_hash_table.cu)
target_compile_features(hash_map PRIVATE cxx_std_14 cuda_std_14)
set_property(TARGET hash_map PROPERTY CUDA_ARCHITECTURES 70)
target_compile_options(hash_map PRIVATE --expt-extended-lambda)
260 changes: 260 additions & 0 deletions examples/concurrent_hash_table.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
// Copyright (c) 2018-2020 NVIDIA Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Released under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.

#include <cuda/std/cstdint>
#include <cuda/std/atomic>

// TODO: It would be great if this example could NOT depend on Thrust.
#include <thrust/pair.h>
#include <thrust/functional.h>
#include <thrust/allocate_unique.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <cassert>
#include <random>

#include <iostream>
#include <cstdio>
#include <cassert>

// TODO: This should be upstreamed and then removed.
namespace thrust {

using universal_raw_memory_resource =
thrust::system::cuda::detail::cuda_memory_resource<
thrust::system::cuda::detail::cudaMallocManaged, cudaFree, void*
>;

template <typename T>
using universal_allocator =
thrust::mr::stateless_resource_allocator<T, universal_raw_memory_resource>;

template <typename T>
using universal_vector = thrust::device_vector<T, universal_allocator<T>>;

} // thrust

template <
typename Key, typename Value,
typename Hash = thrust::identity<Key>,
typename KeyEqual = thrust::equal_to<Key>,
typename MemoryResource = thrust::universal_raw_memory_resource
>
struct concurrent_hash_table {
// Elements transition from state_empty -> state_reserved ->
// state_filled; no other transitions are allowed.
enum state_type {
state_empty, state_reserved, state_filled
};

using key_type = Key;
using mapped_type = Value;
using size_type = cuda::std::uint64_t;

using key_allocator = thrust::mr::stateless_resource_allocator<
key_type, MemoryResource
>;
using mapped_allocator = thrust::mr::stateless_resource_allocator<
mapped_type, MemoryResource
>;
using state_allocator = thrust::mr::stateless_resource_allocator<
cuda::std::atomic<state_type>, MemoryResource
>;

using key_iterator = typename key_allocator::pointer;
using value_iterator = typename mapped_allocator::pointer;
using state_iterator = typename state_allocator::pointer;

// This whole thing is silly and should be a lambda, or at least a private
// nested class, but alas, NVCC doesn't like that.
struct element_destroyer {
private:
size_type capacity_;
key_iterator keys_;
value_iterator values_;
state_iterator states_;

public:
__host__ __device__
element_destroyer(size_type capacity,
key_iterator keys,
value_iterator values,
state_iterator states)
: capacity_(capacity), keys_(keys), values_(values), states_(states)
{}

element_destroyer(element_destroyer const&) = default;

__host__ __device__
void operator()(size_type i) {
if (state_empty != states_[i]) {
(keys_ + i)->~key_type();
(values_ + i)->~mapped_type();
}
}
};

private:
size_type capacity_;
key_iterator keys_;
value_iterator values_;
state_iterator states_;
Hash hash_;
KeyEqual key_equal_;

public:
__host__
concurrent_hash_table(size_type capacity,
Hash hash = Hash(),
KeyEqual key_equal = KeyEqual())
: capacity_(capacity)
, keys_(key_allocator{}.allocate(capacity_))
, values_(mapped_allocator{}.allocate(capacity_))
, states_(state_allocator{}.allocate(capacity_))
, hash_(std::move(hash))
, key_equal_(std::move(key_equal))
{
thrust::uninitialized_fill(thrust::device,
states_, states_ + capacity_,
state_empty);
}

__host__
~concurrent_hash_table()
{
thrust::for_each(thrust::device,
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(capacity_),
element_destroyer(capacity_, keys_, values_, states_));
}

// TODO: Change return type to an enum with three possible values, succeeded,
// exists, and full.
template <typename UKey, typename... Args>
__host__ __device__
thrust::pair<value_iterator, bool>
try_emplace(UKey&& key, Args&&... args) {
auto index{hash_(key) % capacity_};
// Linearly probe the storage space up to `capacity_` times; if we haven't
// succeeded by then, the container is full.
for (size_type i = 0; i < capacity_; ++i) {
state_type old = states_[index].load(cuda::std::memory_order_acquire);
while (old == state_empty) {
// As long as the state of this element is empty, attempt to set it to
// reserved.
if (states_[index].compare_exchange_weak(
old, state_reserved, cuda::std::memory_order_acq_rel))
{
// We succeeded; the element is now "locked" as reserved.
new (keys_ + index) key_type(std::forward<UKey>(key));
new (values_ + index) mapped_type(std::forward<Args>(args)...);
states_[index].store(state_filled, cuda::std::memory_order_release);
return thrust::make_pair(values_ + index, true);
}
}
// If we are here, the element we are probing is not empty and we didn't
// fill it, so we need to wait for it to be filled.
while (state_filled != states_[index].load(cuda::std::memory_order_acquire))
;
// Now we know that the element we are probing has been filled by someone
// else, so we check if our key is equal to it.
if (key_equal_(keys_[index], key))
// It is, so the element already exists.
return thrust::make_pair(values_ + index, false);
// Otherwise, the element isn't a match, so move on to the next element.
index = (index + 1) % capacity_;
}
// If we are here, the container is full.
return thrust::make_pair(value_iterator{}, false);
}

__host__ __device__
mapped_type& operator[](key_type const& key) {
return (*try_emplace(key).first);
}
__host__ __device__
mapped_type& operator[](key_type&& key) {
return (*try_emplace(std::move(key)).first);
}
};

template <typename T>
struct identity_modulo {
private:
T const modulo_;

public:
__host__ __device__
identity_modulo(T modulo) : modulo_(std::move(modulo)) {}

identity_modulo(identity_modulo const&) = default;

__host__ __device__
T operator()(T i) { return i % modulo_; }
};

int main() {
{
using table = concurrent_hash_table<int, cuda::std::atomic<int>>;

auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8);

thrust::universal_vector<int> input = [] {
thrust::universal_vector<int> v(2048);
std::mt19937 gen(1337);
std::uniform_int_distribution<long> dis(0, 7);
thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
return v;
}();

thrust::for_each(thrust::device, input.begin(), input.end(),
[freq = freq.get()] __device__ (int i) {
(*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
}
);

thrust::host_vector<int> gold(8);
thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

for (cuda::std::uint64_t i = 0; i < 8; ++i)
std::cout << "i: " << i
<< " gold: " << gold[i]
<< " observed: " << (*freq)[i] << "\n";

assert(cudaSuccess == cudaDeviceSynchronize());
}
{
using table = concurrent_hash_table<int, cuda::std::atomic<int>, identity_modulo<int>>;

auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8, identity_modulo<int>(4));

thrust::universal_vector<int> input = [] {
thrust::universal_vector<int> v(2048);
std::mt19937 gen(1337);
std::uniform_int_distribution<long> dis(0, 7);
thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
return v;
}();

thrust::for_each(thrust::device, input.begin(), input.end(),
[freq = freq.get()] __device__ (int i) {
(*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
}
);

thrust::host_vector<int> gold(8);
thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

for (cuda::std::uint64_t i = 0; i < 8; ++i)
std::cout << "i: " << i
<< " gold: " << gold[i]
<< " observed: " << (*freq)[i] << "\n";

assert(cudaSuccess == cudaDeviceSynchronize());
}
}

Loading