diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000000..972e9c7fe5 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,3 @@ +FROM jekyll/jekyll:4.0 +COPY Gemfile /Gemfile +RUN /bin/bash -l -c "bundle install" \ No newline at end of file diff --git a/docs/Gemfile b/docs/Gemfile index 68f7a4a659..05acb6ed7c 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -6,4 +6,5 @@ group :jekyll_plugins do gem "jekyll-default-layout" # GitHub Pages. gem "jekyll-titles-from-headings" # GitHub Pages. gem "jekyll-relative-links" # GitHub Pages. + gem "jekyll-remote-theme" # GitHub Pages. end diff --git a/docs/_config.yml b/docs/_config.yml index 868d927d07..99e24ca44b 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -13,7 +13,8 @@ search.heading_level: 4 # just-the-docs ignores these filenames by default. include: [ "contributing.md", "code_of_conduct.md" ] -plugins: +plugins_dir: + - jekyll-remote-theme - jekyll-optional-front-matter # GitHub Pages. - jekyll-default-layout # GitHub Pages. - jekyll-titles-from-headings # GitHub Pages. @@ -81,6 +82,14 @@ defaults: has_children: true has_toc: false nav_order: 4 + - + scope: + path: extended_api/memory_access_properties.md + values: + parent: Extended API + has_children: true + has_toc: false + nav_order: 5 - scope: path: releases/changelog.md diff --git a/docs/extended_api.md b/docs/extended_api.md index d31d768c46..bb5ba43399 100644 --- a/docs/extended_api.md +++ b/docs/extended_api.md @@ -17,6 +17,7 @@ nav_order: 3 {% include_relative extended_api/asynchronous_operations.md %} +{% include_relative extended_api/memory_access_properties.md %} [Thread Scopes]: ./extended_api/thread_groups.md [Thread Groups]: ./extended_api/thread_scopes.md diff --git a/docs/extended_api/asynchronous_operations/memcpy_async.md b/docs/extended_api/asynchronous_operations/memcpy_async.md index 8d9e34ed85..ebe3773fc2 100644 --- a/docs/extended_api/asynchronous_operations/memcpy_async.md +++ b/docs/extended_api/asynchronous_operations/memcpy_async.md @@ -28,16 +28,42 @@ Defined in header ``: ```cuda // (3) template +__host__ __device__ void cuda::memcpy_async(void* destination, void const* source, Shape size, cuda::pipeline& pipeline); // (4) template +__host__ __device__ void cuda::memcpy_async(Group const& group, - void * destination, void const* source, Shape size, + void* destination, void const* source, Shape size, cuda::pipeline& pipeline); ``` +Defined in header ``: + +```cuda +// (5) +template +__host__ __device__ +void memcpy_async(Dst* dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (6) +template +__host__ __device__ +void memcpy_async(cuda::annotated_ptr dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (7) +template +__host__ __device__ +void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (8) +template +__host__ __device__ +void memcpy_async(Group const& group, cuda::annotated_ptr dst, cuda::annotated_ptr src, Shape size, Sync& sync); +``` + `cuda::memcpy_async` asynchronously copies `size` bytes from the memory location pointed to by `source` to the memory location pointed to by `destination`. @@ -51,6 +77,8 @@ Both objects are reinterpreted as arrays of `unsigned char`. in the current thread. 4. Binds the asynchronous copy completion to `cuda::pipeline` and cooperatively issues the copy across all threads in `group`. +5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is + either `cuda::barrier` or `cuda::pipeline`. ## Notes diff --git a/docs/extended_api/memory_access_properties.md b/docs/extended_api/memory_access_properties.md new file mode 100644 index 0000000000..a3f34fc253 --- /dev/null +++ b/docs/extended_api/memory_access_properties.md @@ -0,0 +1,13 @@ +## Memory access properties + +| [`cuda::annotated_ptr`] | Binds an access property to a pointer. `(class template)`

1.6.0 / CUDA 11.5 | +| [`cuda::access_property`] | Represents a memory access property. `(class)`

1.6.0 / CUDA 11.5 | +| [`cuda::apply_access_property`] | Applies access property to memory location. `(function template)`

1.6.0 / CUDA 11.5 | +| [`cuda::associate_access_property`] | Associates access property with raw pointer. `(function template)`

1.6.0 / CUDA 11.5 | +| [`cuda::discard_memory`] | Writes indeterminate values to memory. `(function)`

1.6.0 / CUDA 11.5 | + +[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }} +[`cuda::access_property`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }} +[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }} +[`cuda::discard_memory`]: {{ "extended_api/memory_access_properties/discard_memory.html" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/access_property.md b/docs/extended_api/memory_access_properties/access_property.md new file mode 100644 index 0000000000..741789cad2 --- /dev/null +++ b/docs/extended_api/memory_access_properties/access_property.md @@ -0,0 +1,287 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 2 +--- + +# `cuda::access_property` + +Defined in header ``: + +```cuda +namespace cuda { +class access_property; +} // namespace cuda +``` + +The class [`cuda::access_property`] is a [`LiteralType`] that provides an opaque encoding for properties of memory operations. +It is used in combination with [`cuda::annotated_ptr`], [`cuda::associate_access_property`] and [`cuda::apply_access_property`] to _request_ the application of properties to memory operations. + +```cuda +namespace cuda { + +class access_property { + public: + // Static memory space property: + struct shared {}; + struct global {}; + + // Static global memory residence control property: + struct normal { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + struct persisting { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + struct streaming { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + + // Default constructor: + __host__ __device__ constexpr access_property() noexcept; + + // Copy constructor: + constexpr access_property(access_property const&) noexcept = default; + + // Copy assignment: + access_property& operator=(const access_property& other) noexcept = default; + + // Constructors from static global memory residence control properties: + __host__ __device__ constexpr access_property(global) noexcept; + __host__ __device__ constexpr access_property(normal) noexcept; + __host__ __device__ constexpr access_property(streaming) noexcept; + __host__ __device__ constexpr access_property(persisting) noexcept; + + // Dynamic interleaved global memory residence control property constructors: + __host__ __device__ constexpr access_property(normal, float probability); + __host__ __device__ constexpr access_property(streaming, float probability); + __host__ __device__ constexpr access_property(persisting, float probability); + __host__ __device__ constexpr access_property(normal, float probability, streaming); + __host__ __device__ constexpr access_property(persisting, float probability, streaming); + + // Dynamic range global memory residence control property constructors: + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, normal); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, streaming); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, persisting); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, normal, streaming); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, persisting, streaming); +}; + +} // namespace cuda +``` + +## Kinds of access properties + +Access properties are either _static_ compile-time values or _dynamic_ runtime values. +The following properties of a memory access are provided: + +* Static memory space properties: + * [`cuda::access_property::shared`] : memory access to the shared memory space, +* Static global memory space _and_ residence control properties: + * [`cuda::access_property::global`] : memory access to the global memory space without indicating an expected frequency of access to that memory, + * [`cuda::access_property::normal`] : memory access to the global memory space expecting the memory to be accessed as frequent as other memory, + * [`cuda::access_property::persisting`] : memory access to the global memory space expecting the memory to be accessed more frequently than other memory; this priority is suitable for data that should remain persistent in cache, + * [`cuda::access_property::streaming`] : memory access to the global memory space expecting the memory to be accessed infrequently; this priority is suitable for streaming data. +* Dynamic global memory residence control properties: + * `normal`, `persisting`, `streaming`: static memory residence control properties may be specified at runtime, + * `interleaved`: choose a `probability` of memory addresses to be accessed with one property and the remaining `1 - probability` addresses with another, + * `range`: choose a partitioned memory range with memory accesses to the "middle" sub-partition using the _primary_ property, and memory accesess to the head and tail sub-partitions using the _secondary_ property. + +**Note**: the difference between [`cuda::access_property::global`] and [`cuda::access_property::normal`] is subtle. +The [`cuda::access_property::normal`] hints that the pointer points to the global address space _and_ the memory will be accessed with "normal frequency", while [`cuda::access_property::global`] only hints that the pointer points to the global address-space, it does not hint about how frequent the accesses will be. + +> **WARNING**: the behavior of _requesting_ the application of `cuda::access_property` to memory accesses, or their association with memory addresses, outside of the corresponding address space is _undefined_ (note: even if that address is not "used"). + + +## Default constructor + +```cuda +__host__ __device__ constexpr access_property() noexcept; +``` + +**Effects**: as if `access_property(global)`. + + +## Static global memory residence control property constructors + +```cuda +__host__ __device__ constexpr access_property::access_property(global) noexcept; +__host__ __device__ constexpr access_property::access_property(normal) noexcept; +__host__ __device__ constexpr access_property::access_property(streaming) noexcept; +__host__ __device__ constexpr access_property::access_property(persisting) noexcept; +``` + +**Effects**: as-if `access_property(PROPERTY, 1.0)` where `PROPERTY` is one of `global`, `normal`, `streaming`, or `persisting`. + + +## Dynamic interleaved global memory residence control property constructors + +```cuda +__host__ __device__ constexpr access_property::access_property(normal, float probability); +__host__ __device__ constexpr access_property::access_property(streaming, float probability); +__host__ __device__ constexpr access_property::access_property(persisting, float probability); +__host__ __device__ constexpr access_property::access_property(normal, float probability, streaming); +__host__ __device__ constexpr access_property::access_property(persisting, float probability, streaming); +``` + +**Preconditions**: `0 < probability <= 1.0`. + +**Effects**: constructs an _interleaved_ access property that _requests_ the first and third arguments - access properties - to be applied with `probability` and `1 - probability` to memory accesses. +The overloads without a third argument request applying `global` with `1 - probability`. + +## Dynamic range global memory residence control property constructors + +```cuda +__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, normal); +__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, streaming); +__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, persisting); +__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, normal, streaming); +__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, persisting, streaming); +``` + +> note: pointer arithmetic below performed `char* ptr` instead of `void* ptr` + +**Preconditions**: +* `ptr` is a generic pointer that is _valid_ to cast to a pointer to the global memory address space. +* `0 < leading_bytes <= total_bytes <= 4GB`. + +**Postconditions**: memory accesses requesting the application of this property must be in range `[max(0, ptr + leading_bytes - total_bytes), ptr + total_bytes)`. + +**Effects**: the fourth and fifth arguments, access properties, are called _primary_ and _secondary_ properties. +The overloads without a fifth argument use `global` as the _secondary_ property. +Constructs a _range_ access property _requesting_ the properties to be **approximately** applied to memory accesses as follows: + + * secondary property to accesses in address-range: `[max(0, ptr + leading_bytes - total_bytes), ptr)` + * primary property to accesses in address-range: `[ptr, ptr + leading_bytes)` + * secondary property to accesses in address-range: `[ptr + leading_bytes, ptr + total_bytes)` + +**Note**: This property enables three main use cases: + +1. Unary range `[ptr, ptr + total_bytes)` with primary property by using `leading_bytes == total_bytes`. +2. Binary range `[ptr, ptr + leading_bytes)` and `[ptr + leading_bytes, ptr + total_bytes)` with primary and secondary properties by just not using this range to access any memory in range `[max(0, ptr + leading_bytes - total_bytes), ptr)`. +3. Primary range with secondary "halo" ranges (see example below). Given `leading_bytes` for the primary range, and `halo_bytes` for the size of each of the secondary ranges by using `total_bytes == leading_bytes + halo_bytes`: + + ```c++ + ____________________________________________________________ + | halo / secondary | leading / primary | halo / secondary | + ------------------------------------------------------------ + ^ + | ptr + + |<-- halo_bytes -->|<-- leading_bytes -->|<-- halo_bytes -->| + |<-- total_bytes -->| + ``` + +## Conversion operators + +```cuda +__host__ __device__ constexpr access_property::normal::operator cudaAccessProperty() const noexcept; +__host__ __device__ constexpr access_property::streaming::operator cudaAccessProperty() const noexcept; +__host__ __device__ constexpr access_property::persisting::operator cudaAccessProperty() const noexcept; +``` + +**Returns**: corresponding CUDA Runtime [`cudaAccessProperty`] value. + +**Note**: Allows `constexpr cuda::access_property::normal{}`, +`cuda::access_property::streaming{}`, and `cuda::access_property::persisting{}` +to be used in lieu of the corresponding CUDA Runtime [`cudaAccessProperty`] +enumerated values. + +## Mapping of access properties to NVVM-IR and the PTX ISA + +> **WARNING**: The implementation makes **no guarantees** about the content of this section; it can change any time. + +When `cuda::access_property` is applied to memory operation, it sometimes +matches with some of the cache eviction priorities and cache hints introduced in the [PTX ISA +Version 7.4]. See [Cache Eviction Priority Hints] + +* `global`: `evict_unchanged` +* `normal`: `evict_normal` +* `persisting`: `evict_last` +* `streaming`: `evict_first` + +When using `shared` and `global`, the pointer being accessed can be assumed to point to memory in the `shared` and `global` address spaces. +This is exploited for optimization purposes in NVVM-IR. + +## Example + +```cuda +#include + +__global__ void undefined_behavior(int* global) { + // Associating pointers with mismatching address spaces is undefined: + cuda::associate_access_property(global, cuda::access_property::shared{}); // undefined behavior + __shared__ int shmem; + cuda::associate_access_property(&shmem, cuda::access_property::normal{}); // undefined behavior + cuda::associate_access_property(&shmem, cuda::access_property::streaming{}); // undefined behavior + cuda::associate_access_property(&shmem, cuda::access_property::persisting{}); // undefined behavior + + cuda::access_property interleaved_implicit_global(cuda::access_property::streaming{}, 0.5); + cuda::associate_access_property(&shmem, interleaved_implicit_global); // undefined behavior + + cuda::access_property range_implicit_global0(&shmem, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::associate_access_property(&shmem, range_implicit_global0); // undefined behavior + + // Using a zero probability or probability out-of-range (0, 1] is undefined: + cuda::access_property interleaved(cuda::access_property::streaming{}, 0.0); // undefined behavior +} + +__global__ void correct(int* global) { + __shared__ int shmem; + cuda::associate_access_property(&shmem, cuda::access_property::shared{}); + + cuda::access_property global_range0(global, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::associate_access_property(global, global_range0); + + cuda::access_property global_interleaved(cuda::access_property::streaming{}, 1.0); + cuda::associate_access_property(global, global_interleaved); + + // Access properties can be constructed for any address range + cuda::access_property global_range1(global, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::access_property global_range2(nullptr, 0, sizeof(int), cuda::access_property::streaming{}); +} + +__global__ void range(int* g, size_t n) { + // To apply a single property to all elements in the range [g, g+n), set leading_bytes = total_bytes = n + auto range_property = cuda::access_property(g, n, n, cuda::access_property::persisting{}); +} + +__global__ void range_with_halos(int* g, size_t n, size_t halos) { + // In the range [g, g + n), the first and last "halos" elements of `int` type are halos. + // This example applies one property to the halo elements, and another property to the internal elements: + // - halos: streaming (secondary property) + // - internal: persisting (primary property) + + auto internal_property = cuda::access_property::persisting{}; + auto halo_property = cuda::access_property::streaming{}; + + // For the range property, the pointer used to build the property + // must satisfy p = g + halos + int* p = g + halos; + // Then, "total_elements" (total_size * sizeof(int)) must satisfy: + // g + n = p + total_elements + int total_bytes = (g + n - p) * sizeof(int); + // Finally, "leading_elements" (leading_bytes * sizeof(int)) must satisfy: + // g = p + leading_elements - total_elements + int leading_bytes = (g - p) * sizeof(int) + total_bytes; + + // Is a property that we can use for halo exchange: + auto range_property = cuda::access_property(p, leading_bytes, total_bytes, internal_property, halo_property); +} +``` + +[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }} +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }} +[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }} +[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} + +[`LiteralType`]: https://en.cppreference.com/w/cpp/named_req/LiteralType +[`cudaAccessProperty`]: https://docs.nvidia.com/cuda/cuda-runtime-api +[PTX ISA Version 7.4]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-isa-version-7-4 +[Cache Eviction Priority Hints]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-eviction-priority-hints + diff --git a/docs/extended_api/memory_access_properties/annotated_ptr.md b/docs/extended_api/memory_access_properties/annotated_ptr.md new file mode 100644 index 0000000000..a5148547ce --- /dev/null +++ b/docs/extended_api/memory_access_properties/annotated_ptr.md @@ -0,0 +1,290 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 1 +--- + +# `cuda::annotated_ptr` + +Defined in header ``: + +```cuda +namespace cuda { +template +class annotated_ptr; +} // namespace cuda +``` + +**Mandates**: `Property` is one of: + +* [`cuda::access_property::shared`], +* [`cuda::access_property::global`], +* [`cuda::access_property::persisting`], +* [`cuda::access_property::normal`], +* [`cuda::access_property::streaming`], or +* [`cuda::access_property`] (a type-erased property with a runtime value). + +_Note_: if `Property` is [`cuda::access_property`], i.e. a dynamic property with a runtime value, then `sizeof(cuda::annotated_ptr) == 2 * sizeof(Type*)`. Otherwise, its size is `sizeof(Type*)`. + +The class template [`cuda::annotated_ptr`] is a pointer annotated with an access property that _may_ be applied to memory operations performed through the [`cuda::annotated_ptr`]. + +In contrast with [`cuda::associate_access_property`], [`cuda::annotated_ptr`] maintains the association when passed through ABI boundaries, e.g., calling a non-inlined library function with a [`cuda::annotated_ptr`] argument. + +It implements a pointer-like interface: + +| Pointer Expression | `cuda::annotated_ptr` | Description | +|=====================|===========================================|=============================================| +| `T* a` | `cuda::annotated_ptr a` | non-`const` pointer to non-`const` memory | +| `T const * a` | `cuda::annotated_ptr a` | non-`const` pointer to `const` memory | +| `T* const a` | `const cuda::annotated_ptr a` | `const` pointer to non-`const` memory | +| `T const* const a` | `const cuda::annotated_ptr a` | `const` pointer to `const` memory | +| `val = *a;` | `val = *a;` | dereference operator to load an element | +| `*a = val;` | `*a = val;` | dereference operator to store an element | +| `val = a[n];` | `val = a[n];` | subscript operator to load an element | +| `a[n] = val;` | `a[n] = val;` | subscript operator to store an element | +| `T* a = nullptr;` | `annotated_ptr a = nullptr;` | `nullptr` initialization | +| `n = a - b;` | `n = a - b;` | difference operator | +| `if (a) { ... }` | `if (a) { ... }` | explicit bool conversion | + +But it is not a drop-in replacement for pointers since, among others, it does not: + +* model any [`Iterator`] concept, +* implement [`std::pointer_traits`], [`std::iterator_traits`], etc. +* have the same variance as pointer. + +```cuda +namespace cuda { + +template +class annotated_ptr { +public: + using value_type = Type; + using size_type = std::size_t; + using reference = value_type &; + using pointer = value_type *; + using const_pointer = value_type const *; + using difference_type = std::ptrdiff_t; + + __host__ __device__ constexpr annotated_ptr() noexcept; + __host__ __device__ constexpr annotated_ptr(annotated_ptr const&) noexcept = default; + __host__ __device__ constexpr annotated_ptr& operator=(annotated_ptr const&) noexcept = default; + __host__ __device__ explicit annotated_ptr(pointer); + template + __host__ __device__ annotated_ptr(pointer, RuntimeProperty); + template + __host__ __device__ annotated_ptr(annotated_ptr const&); + + __host__ __device__ constexpr explicit operator bool() const noexcept; + __host__ __device__ pointer get() const noexcept; + + __host__ __device__ reference operator*() const; + __host__ __device__ pointer operator->() const; + __host__ __device__ reference operator[](std::ptrdiff_t) const; + __host__ __device__ constexpr difference_type operator-(annotated_ptr); + +private: + pointer ptr; // exposition only + Property prop; // exposition only +}; + +} // namespace cuda +``` + +## Constructors and assignment + +### Default constructor + +```cuda +constexpr annotated_ptr() noexcept; +``` + +**Effects**:  as if constructed by `annotated_ptr(nullptr)`; + +### Constructor from pointer + +```cuda +constexpr explicit annotated_ptr(pointer ptr); +``` + +**Preconditions**: + +* if `Property` is [`cuda::access_property::shared`] then `ptr` must be a generic pointer that is valid to cast to a pointer to the shared memory address space. +* if `Property` is [`cuda::access_property::global`], [`cuda::access_property::normal`], [`cuda::access_property::streaming`], [`cuda::access_property::persisting`], or [`cuda::access_property`] then `ptr` must be a generic pointer that is valid to cast to a pointer to the global memory address space. + +**Effects**:  Constructs an `annotated_ptr` requesting associating `ptr` with `Property`. +If `Property` is [`cuda::access_property`] then `prop` is initialized with [`cuda::access_property::global`]. + +**Note**: in **Preconditions** "valid" means that casting the generic pointer to the corresponding address space does not introduce undefined behavior. + +### Constructor from pointer and access property + +```cuda +template +annotated_ptr(pointer ptr, RuntimeProperty prop); +``` + +**Mandates**: + +* `Property` is [`cuda::access_property`]. +* `RuntimeProperty` is any of [`cuda::access_property::global`], [`cuda::access_property::normal`], [`cuda::access_property::streaming`], [`cuda::access_property::persisting`], or [`cuda::access_property`]. + +**Preconditions**: `ptr` is a pointer to a valid allocation in the global memory address space. + +**Effects**:  Constructs an `annotated_ptr` requesting the association of `ptr` with the property `prop`. + +# Copy constructor from a different `annotated_ptr` + +```cuda +template +constexpr annotated_ptr(annotated_ptr const& a); +``` + +**Mandates**: + +* `annotated_ptr::pointer` is assignable from `annotated_ptr::pointer`. +* `Property` is either [`cuda::access_property`] or `P`. +* `Property` and `P` specify the same memory space. + +**Preconditions**: `pointer` is compatible with `Property`. + +**Effects**: Constructs an `annotated_ptr` for the same pointer as the input `annotated_ptr`. + + +## Explicit conversion operator to `bool` + +```cuda +constexpr operator bool() const noexcept; +``` + +**Returns**: `false` if the pointer is a `nullptr`, `true` otherwise. + + +## Raw pointer access + +```cuda +pointer get() const noexcept; +``` + +**Returns**: A pointer derived from the `annotated_ptr`. + +## Operators + +### Dereference + +```cuda +reference operator*() const; +``` + +**Preconditions**: The `annotated_ptr` is not null and points to a valid `T` value. + +**Returns**: [`*cuda::associate_access_property(ptr, prop)`][`cuda::associate_access_property`] + +### Pointer-to-member + +```cuda +pointer operator->() const; +``` + +**Preconditions**: the `annotated_ptr` is not null. + +**Returns**: [`cuda::associate_access_property(ptr, prop)`][`cuda::associate_access_property`] + +### Subscript + +```cuda +reference operator[](ptrdiff_t i) const; +``` + +**Preconditions**: `ptr` points to a valid allocation of at least size `[ptr, ptr+i]`. + +**Returns**: [`*cuda::associate_access_property(ptr+i,prop)`][`cuda::associate_access_property`] + +### Pointer distance + +```cuda +constexpr difference_type operator-(annotated_ptr p) const; +``` + +**Preconditions**: `ptr` and `p` point to the same allocation. + +**Returns**: as-if `get() - p.get()`. + +## Example + +Given three input and output vectors `x`, `y`, and `z`, and two arrays of coefficients `a` and `b`, all of length `N`: + +```cuda +size_t N; +int* x, *y, *z; +int* a, *b; +``` + +the grid-strided kernel: + +```cuda +__global__ void update(int* const x, int const* const a, int const* const b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int i = g.thread_rank(); idx < N; idx += g.size()) { + x[i] = a[i] * x[i] + b[i]; + } +} +``` + +updates `x`, `y`, and `z` as follows: + +```cuda +update<<>>(x, a, b, N); +update<<>>(y, a, b, N); +update<<>>(z, a, b, N); +``` + +The elements of `a` and `b` are used in all kernels. +If `N` is large enough, elements of `a` and `b` might be evicted from the L2 cache, requiring these to be re-loaded from memory in the next `update`. + +We can make the `update` kernel generic to allow the caller to pass [`cuda::annotated_ptr`] objects that hint at how memory will be accessed: + +```cuda +template +__global__ void update_template(PointerX x, PointerA a, PointerB b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + x[idx] = a[idx] * x[idx] + b[idx]; + } +} +``` + +With [`cuda::annotated_ptr`], the caller can then specify the temporal locality of the memory accesses: + +```cuda +// Frequent accesses to "a" and "b"; infrequent accesses to "x" and "y": +cuda::annotated_ptr a_p {a}, b_p{b}; +cuda::annotated_ptr x_s{x}, y_s{y}; +update_template<<>>(x_s, a_p, b_p, N); +update_template<<>>(y_s, a_p, b_p, N); + +// Infrequent accesses to "a" and "b"; frequent acceses to "z": +cuda::annotated_ptr a_s {a}, b_s{b}; +cuda::annotated_ptr z_p{z}; +update_template<<>>(z_p, a_s, b_s, N); + +// Different kernel, "update_z", uses "z" again one last time. +// Since "z" was accessed as "persisting" by the previous kernel, +// parts of it are more likely to have previously survived in the L2 cache. +update_z<<>>(z, ...); +``` + +Notice how the raw pointers to `a` and `b` can be wrapped by both `annotated_ptr` and `annotated_ptr`, and accesses through each pointer applies the corresponding access property. + +[`Iterator`]: https://en.cppreference.com/w/cpp/iterator +[`std::pointer_traits`]: https://en.cppreference.com/w/cpp/memory/pointer_traits +[`std::iterator_traits`]: https://en.cppreference.com/w/cpp/iterator/iterator_traits + +[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }} +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }} +[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }} +[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/apply_access_property.md b/docs/extended_api/memory_access_properties/apply_access_property.md new file mode 100644 index 0000000000..749d2208b6 --- /dev/null +++ b/docs/extended_api/memory_access_properties/apply_access_property.md @@ -0,0 +1,107 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 3 +--- + +# `cuda::apply_access_property` + +```cuda +template +__host__ __device__ +void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::persisting) noexcept; +template +__host__ __device__ +void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::normal) noexcept; +``` + +**Mandates**: [`ShapeT`] is either [`std::size_t`] or [`cuda::aligned_size_t`]. + +**Preconditions**: `ptr` points to a valid allocation for `shape` in the global memory address space. + +**Effects**: no effects. + +**_Hint_**: to prefetch `shape` bytes of memory starting at `ptr` while applying a property. Two properties are supported: + +* [`cuda::access_property::persisting`] +* [`cuda::access_property::normal`] + + +**Note**: in **Preconditions** "valid allocation for `shape` means that: + +* if `ShapeT` is `aligned_size_t(sz)` then `ptr` is aligned to an `N`-bytes alignment boundary, and +* for all offsets `i` in the extent of `shape`, i.e., `i` in `[0, shape)` then the expression `*(ptr + i)` does not exhibit undefined behavior. + +**Note**: currently `apply_access_property` is ignored by nvcc and nvc++ on the host. + +# Example + +Given three input and output vectors `x`, `y`, and `z`, and two arrays of coefficients `a` and `b`, all of length `N`: + +```cuda +size_t N; +int* x, *y, *z; +int* a, *b; +``` + +the grid-strided kernel: + +```cuda +__global__ void update(int* const x, int const* const a, int const* const b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + x[idx] = a[idx] * x[idx] + b[idx]; + } +} +``` + +updates `x`, `y`, and `z` as follows: + +```cuda +update<<>>(x, a, b, N); +update<<>>(y, a, b, N); +update<<>>(z, a, b, N); +``` + +The elements of `a` and `b` are used in all kernels. +For certain values of `N`, this may prevent parts of `a` and `b` from being evicted from the L2 cache, avoiding reloading these from memory in the subsequent `update` kernel. + +With [`cuda::access_property`] and [`cuda::apply_access_property`], we can write kernels that specify that `a` and `b` are accessed more often than (`pin`) and as often as (`unpin`) other data: + +```cuda +__global__ void pin(int* a, int* b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::persisting{}); + cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::persisting{}); + } +} +__global__ void unpin(int* a, int* b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::normal{}); + cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::normal{}); + } +} +``` + +which we can launch before and after the `update` kernels: + +```cuda +pin<<>>(a, b, N); +update<<>>(x, a, b, N); +update<<>>(y, a, b, N); +update<<>>(z, a, b, N); +unpin<<>>(a, b, N); +``` + +This does not require modifying the `update` kernel, and for certain values of `N` prevents `a` and `b` from having to be re-loaded from memory. + +The `pin` and `unpin` kernels can be fused into the kernels for the `x` and `z` updates by modifying these kernels. + +[`std::size_t`]: https://en.cppreference.com/w/cpp/types/size_t +[`ShapeT`]: {{ "extended_api/shapes.html" | relative_url }} +[`cuda::aligned_size_t`]: {{ "extended_api/shapes/aligned_size_t.html" | relative_url }} +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/associate_access_property.md b/docs/extended_api/memory_access_properties/associate_access_property.md new file mode 100644 index 0000000000..91580e8ed4 --- /dev/null +++ b/docs/extended_api/memory_access_properties/associate_access_property.md @@ -0,0 +1,53 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 4 +--- + +# `cuda::associate_access_property` + +```cuda +template +__host__ __device__ +T* associate_access_property(T* ptr, Property prop); +``` + +**Preconditions**: +* if `Property` is [`cuda::access_property::shared`] then it must be valid to cast the generic pointer `ptr` to a pointer to the shared memory address space. +* if `Property` is one of [`cuda::access_property::global`], [`cuda::access_property::persisting`], [`cuda::access_property::normal`], or [`cuda::access_property::streaming`] then it must be valid to cast the generic pointer `ptr` to a pointer to the global memory address space. +* if `Property` is a [`cuda::access_property`] of "range" kind, then `ptr` must be in the valid range. + +**Mandates**: `Property` is convertible to [`cuda::access_property`]. + +**Effects**: no effects. + +**_Hint_**: to associate an access property with the returned pointer, such that subsequent memory operations with the returned pointer _or_ pointers derived from it _may_ apply the access property. + + * The "association" is _not_ part of the value representation of the pointer. + * The compiler is allowed to drop the association; it does not have a functional consequence. + * The association _may_ hold through simple expressions, sequence of simple statements, or fully inlined function calls where the pointer value or C++ reference is provably unchanged; this includes offset pointers used for array access. + * The association is _not_ expected to hold through the ABI of an unknown function call, e.g., when the pointer is passed through a separately-compiled function interface, unless link-time optimizations are used. + +**Note**: currently `associate_access_property` is ignored by nvcc and nvc++ on the host; but this might change any time. + +# Example + +```cuda +#include +__global__ void memcpy(int const* in_, int* out) { + int const* in = cuda::associate_access_property(in_, cuda::access_property::streaming{}); + auto idx = cooperative_groups::this_grid().thread_rank(); + + __shared__ int shmem[N]; + shmem[threadIdx.x] = in[idx]; // streaming access + + // compute... +} +``` + +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/discard_memory.md b/docs/extended_api/memory_access_properties/discard_memory.md new file mode 100644 index 0000000000..5652b9e394 --- /dev/null +++ b/docs/extended_api/memory_access_properties/discard_memory.md @@ -0,0 +1,46 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 5 +--- + +# `cuda::discard_memory` + +```cuda +__device__ void discard_memory(void volatile* ptr, size_t nbytes); +``` + +**Preconditions**: `ptr` points to a valid allocation of size greater or equal to `nbytes`. + +**Effects**: equivalent to `memset(ptr, _indeterminate_, nbytes)`. + +**Hint**: to discard modified cache lines without writing back the cached data to memory. +Enables using global memory as temporary scratch space. +Does **not** generate any HW store operations. + +# Example + +This kernel needs a scratch pad that does not fit in shared memory, so it uses an allocation in global memory instead: + +```cuda +#include +__device__ int compute(int* scratch, size_t N); + +__global__ void kernel(int const* in, int* out, int* scratch, size_t N) { + // Each thread reads N elements into the scratch pad: + for (int i = 0; i < N; ++i) { + int idx = threadIdx.x + i * blockDim.x; + scratch[idx] = in[idx]; + } + __syncthreads(); + + // All threads compute on the scratch pad: + int result = compute(scratch, N); + + // All threads discard the scratch pad memory to _hint_ that it does not need to be flushed from the cache: + cuda::discard_memory(scratch + threadIdx.x * N, N * sizeof(int)); + __syncthreads(); + + out[threadIdx.x] = result; +} +``` diff --git a/docs/readme.md b/docs/readme.md new file mode 100644 index 0000000000..a3e99a2ff9 --- /dev/null +++ b/docs/readme.md @@ -0,0 +1,9 @@ +libcu++ documentation +=== + +To build and serve the documentation as a website to `http://localhost:4000` just execute the following from libcudacxx root directory: + +```shell +./docs/serve +``` + diff --git a/docs/serve b/docs/serve new file mode 100755 index 0000000000..ff9c5cbb6a --- /dev/null +++ b/docs/serve @@ -0,0 +1,9 @@ +#!/usr/bin/env sh + +set -ex + +( + cd docs + docker build -f Dockerfile -t libcudacxx:docs . + docker run --rm -p 4000:4000 -v $(pwd):/srv/jekyll -u $(id -u):$(id -g) -it libcudacxx:docs bash -c "jekyll serve --watch --host 0.0.0.0" +)