From d31e8d77506db9c45d1c1c64524fee7891be3083 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 29 Jul 2021 06:03:38 -0700 Subject: [PATCH] Document cuda::annotated_ptr and related APIs --- docs/Dockerfile | 3 + docs/Gemfile | 1 + docs/_config.yml | 11 +- docs/extended_api.md | 1 + .../asynchronous_operations/memcpy_async.md | 30 +- docs/extended_api/memory_access_properties.md | 13 + .../access_property.md | 287 +++++++++++++++++ .../memory_access_properties/annotated_ptr.md | 290 ++++++++++++++++++ .../apply_access_property.md | 107 +++++++ .../associate_access_property.md | 53 ++++ .../discard_memory.md | 46 +++ docs/readme.md | 9 + docs/serve | 9 + 13 files changed, 858 insertions(+), 2 deletions(-) create mode 100644 docs/Dockerfile create mode 100644 docs/extended_api/memory_access_properties.md create mode 100644 docs/extended_api/memory_access_properties/access_property.md create mode 100644 docs/extended_api/memory_access_properties/annotated_ptr.md create mode 100644 docs/extended_api/memory_access_properties/apply_access_property.md create mode 100644 docs/extended_api/memory_access_properties/associate_access_property.md create mode 100644 docs/extended_api/memory_access_properties/discard_memory.md create mode 100644 docs/readme.md create mode 100755 docs/serve diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000000..972e9c7fe5 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,3 @@ +FROM jekyll/jekyll:4.0 +COPY Gemfile /Gemfile +RUN /bin/bash -l -c "bundle install" \ No newline at end of file diff --git a/docs/Gemfile b/docs/Gemfile index 68f7a4a659..05acb6ed7c 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -6,4 +6,5 @@ group :jekyll_plugins do gem "jekyll-default-layout" # GitHub Pages. gem "jekyll-titles-from-headings" # GitHub Pages. gem "jekyll-relative-links" # GitHub Pages. + gem "jekyll-remote-theme" # GitHub Pages. end diff --git a/docs/_config.yml b/docs/_config.yml index 868d927d07..99e24ca44b 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -13,7 +13,8 @@ search.heading_level: 4 # just-the-docs ignores these filenames by default. include: [ "contributing.md", "code_of_conduct.md" ] -plugins: +plugins_dir: + - jekyll-remote-theme - jekyll-optional-front-matter # GitHub Pages. - jekyll-default-layout # GitHub Pages. - jekyll-titles-from-headings # GitHub Pages. 
@@ -81,6 +82,14 @@ defaults: has_children: true has_toc: false nav_order: 4 + - + scope: + path: extended_api/memory_access_properties.md + values: + parent: Extended API + has_children: true + has_toc: false + nav_order: 5 - scope: path: releases/changelog.md diff --git a/docs/extended_api.md b/docs/extended_api.md index d31d768c46..bb5ba43399 100644 --- a/docs/extended_api.md +++ b/docs/extended_api.md @@ -17,6 +17,7 @@ nav_order: 3 {% include_relative extended_api/asynchronous_operations.md %} +{% include_relative extended_api/memory_access_properties.md %} [Thread Scopes]: ./extended_api/thread_groups.md [Thread Groups]: ./extended_api/thread_scopes.md diff --git a/docs/extended_api/asynchronous_operations/memcpy_async.md b/docs/extended_api/asynchronous_operations/memcpy_async.md index 8d9e34ed85..ebe3773fc2 100644 --- a/docs/extended_api/asynchronous_operations/memcpy_async.md +++ b/docs/extended_api/asynchronous_operations/memcpy_async.md @@ -28,16 +28,42 @@ Defined in header ``: ```cuda // (3) template +__host__ __device__ void cuda::memcpy_async(void* destination, void const* source, Shape size, cuda::pipeline& pipeline); // (4) template +__host__ __device__ void cuda::memcpy_async(Group const& group, - void * destination, void const* source, Shape size, + void* destination, void const* source, Shape size, cuda::pipeline& pipeline); ``` +Defined in header ``: + +```cuda +// (5) +template +__host__ __device__ +void memcpy_async(Dst* dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (6) +template +__host__ __device__ +void memcpy_async(cuda::annotated_ptr dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (7) +template +__host__ __device__ +void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr src, Shape size, Sync& sync); + +// (8) +template +__host__ __device__ +void memcpy_async(Group const& group, cuda::annotated_ptr dst, cuda::annotated_ptr src, Shape size, Sync& sync); +``` + `cuda::memcpy_async` asynchronously copies `size` bytes from the memory location pointed to by `source` to the memory location pointed to by `destination`. @@ -51,6 +77,8 @@ Both objects are reinterpreted as arrays of `unsigned char`. in the current thread. 4. Binds the asynchronous copy completion to `cuda::pipeline` and cooperatively issues the copy across all threads in `group`. +5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is + either `cuda::barrier` or `cuda::pipeline`. ## Notes diff --git a/docs/extended_api/memory_access_properties.md b/docs/extended_api/memory_access_properties.md new file mode 100644 index 0000000000..a3f34fc253 --- /dev/null +++ b/docs/extended_api/memory_access_properties.md @@ -0,0 +1,13 @@ +## Memory access properties + +| [`cuda::annotated_ptr`] | Binds an access property to a pointer. `(class template)`

1.6.0 / CUDA 11.5 | +| [`cuda::access_property`] | Represents a memory access property. `(class)`

1.6.0 / CUDA 11.5 | +| [`cuda::apply_access_property`] | Applies access property to memory location. `(function template)`

1.6.0 / CUDA 11.5 | +| [`cuda::associate_access_property`] | Associates access property with raw pointer. `(function template)`

1.6.0 / CUDA 11.5 | +| [`cuda::discard_memory`] | Writes indeterminate values to memory. `(function)`

1.6.0 / CUDA 11.5 | + +[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }} +[`cuda::access_property`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }} +[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }} +[`cuda::discard_memory`]: {{ "extended_api/memory_access_properties/discard_memory.html" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/access_property.md b/docs/extended_api/memory_access_properties/access_property.md new file mode 100644 index 0000000000..741789cad2 --- /dev/null +++ b/docs/extended_api/memory_access_properties/access_property.md @@ -0,0 +1,287 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 2 +--- + +# `cuda::access_property` + +Defined in header ``: + +```cuda +namespace cuda { +class access_property; +} // namespace cuda +``` + +The class [`cuda::access_property`] is a [`LiteralType`] that provides an opaque encoding for properties of memory operations. +It is used in combination with [`cuda::annotated_ptr`], [`cuda::associate_access_property`] and [`cuda::apply_access_property`] to _request_ the application of properties to memory operations. + +```cuda +namespace cuda { + +class access_property { + public: + // Static memory space property: + struct shared {}; + struct global {}; + + // Static global memory residence control property: + struct normal { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + struct persisting { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + struct streaming { + __host__ __device__ constexpr operator cudaAccessProperty() const noexcept; + }; + + // Default constructor: + __host__ __device__ constexpr access_property() noexcept; + + // Copy constructor: + constexpr access_property(access_property const&) noexcept = default; + + // Copy assignment: + access_property& operator=(const access_property& other) noexcept = default; + + // Constructors from static global memory residence control properties: + __host__ __device__ constexpr access_property(global) noexcept; + __host__ __device__ constexpr access_property(normal) noexcept; + __host__ __device__ constexpr access_property(streaming) noexcept; + __host__ __device__ constexpr access_property(persisting) noexcept; + + // Dynamic interleaved global memory residence control property constructors: + __host__ __device__ constexpr access_property(normal, float probability); + __host__ __device__ constexpr access_property(streaming, float probability); + __host__ __device__ constexpr access_property(persisting, float probability); + __host__ __device__ constexpr access_property(normal, float probability, streaming); + __host__ __device__ constexpr access_property(persisting, float probability, streaming); + + // Dynamic range global memory residence control property constructors: + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, normal); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, streaming); + __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, persisting); + __host__ __device__ constexpr access_property(void* ptr, size_t 
partition_bytes, size_t total_bytes, normal, streaming);
+  __host__ __device__ constexpr access_property(void* ptr, size_t partition_bytes, size_t total_bytes, persisting, streaming);
+};
+
+} // namespace cuda
+```
+
+## Kinds of access properties
+
+Access properties are either _static_ compile-time values or _dynamic_ runtime values.
+The following properties of a memory access are provided:
+
+* Static memory space properties:
+  * [`cuda::access_property::shared`] : memory access to the shared memory space,
+* Static global memory space _and_ residence control properties:
+  * [`cuda::access_property::global`] : memory access to the global memory space without indicating an expected frequency of access to that memory,
+  * [`cuda::access_property::normal`] : memory access to the global memory space expecting the memory to be accessed as frequently as other memory,
+  * [`cuda::access_property::persisting`] : memory access to the global memory space expecting the memory to be accessed more frequently than other memory; this priority is suitable for data that should remain persistent in cache,
+  * [`cuda::access_property::streaming`] : memory access to the global memory space expecting the memory to be accessed infrequently; this priority is suitable for streaming data.
+* Dynamic global memory residence control properties:
+  * `normal`, `persisting`, `streaming`: the static memory residence control properties may be specified at runtime,
+  * `interleaved`: a fraction `probability` of memory addresses is accessed with one property and the remaining `1 - probability` addresses with another,
+  * `range`: a partitioned memory range in which accesses to the "middle" sub-partition use the _primary_ property, and accesses to the head and tail sub-partitions use the _secondary_ property.
+
+**Note**: the difference between [`cuda::access_property::global`] and [`cuda::access_property::normal`] is subtle.
+[`cuda::access_property::normal`] hints that the pointer points to the global address space _and_ that the memory will be accessed with "normal frequency", while [`cuda::access_property::global`] only hints that the pointer points to the global address space; it does not hint at how frequently the memory will be accessed.
+
+> **WARNING**: the behavior of _requesting_ the application of `cuda::access_property` to memory accesses, or their association with memory addresses, outside of the corresponding address space is _undefined_ (even if that address is never "used").
+
+
+## Default constructor
+
+```cuda
+__host__ __device__ constexpr access_property() noexcept;
+```
+
+**Effects**: as if `access_property(global)`.
+
+
+## Static global memory residence control property constructors
+
+```cuda
+__host__ __device__ constexpr access_property::access_property(global) noexcept;
+__host__ __device__ constexpr access_property::access_property(normal) noexcept;
+__host__ __device__ constexpr access_property::access_property(streaming) noexcept;
+__host__ __device__ constexpr access_property::access_property(persisting) noexcept;
+```
+
+**Effects**: as if `access_property(PROPERTY, 1.0)` where `PROPERTY` is one of `global`, `normal`, `streaming`, or `persisting`.
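+
+As a minimal sketch (the function and variable names below are illustrative, not part of the API), the static tags can be used directly or wrapped in a type-erased `cuda::access_property`:
+
+```cuda
+#include <cuda/annotated_ptr>
+
+__host__ __device__ void property_construction_sketch() {
+  // Default-constructed: behaves as if constructed from `global`.
+  cuda::access_property default_prop;
+
+  // Type-erased copies of the static residence-control properties:
+  cuda::access_property normal_prop{cuda::access_property::normal{}};
+  cuda::access_property streaming_prop{cuda::access_property::streaming{}};
+  cuda::access_property persisting_prop{cuda::access_property::persisting{}};
+}
+```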
+
+
+## Dynamic interleaved global memory residence control property constructors
+
+```cuda
+__host__ __device__ constexpr access_property::access_property(normal, float probability);
+__host__ __device__ constexpr access_property::access_property(streaming, float probability);
+__host__ __device__ constexpr access_property::access_property(persisting, float probability);
+__host__ __device__ constexpr access_property::access_property(normal, float probability, streaming);
+__host__ __device__ constexpr access_property::access_property(persisting, float probability, streaming);
+```
+
+**Preconditions**: `0 < probability <= 1.0`.
+
+**Effects**: constructs an _interleaved_ access property that _requests_ that the first and third arguments (both access properties) be applied to a fraction `probability` and `1 - probability` of memory accesses, respectively.
+The overloads without a third argument request applying `global` to the remaining `1 - probability` fraction.
+
+## Dynamic range global memory residence control property constructors
+
+```cuda
+__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, normal);
+__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, streaming);
+__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, persisting);
+__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, normal, streaming);
+__host__ __device__ constexpr access_property::access_property(void* ptr, size_t leading_bytes, size_t total_bytes, persisting, streaming);
+```
+
+> **Note**: the pointer arithmetic below is performed as if on a `char* ptr` rather than a `void* ptr`.
+
+**Preconditions**:
+* `ptr` is a generic pointer that is _valid_ to cast to a pointer to the global memory address space.
+* `0 < leading_bytes <= total_bytes <= 4GB`.
+
+**Postconditions**: memory accesses requesting the application of this property must be in range `[max(0, ptr + leading_bytes - total_bytes), ptr + total_bytes)`.
+
+**Effects**: the fourth and fifth arguments, access properties, are called the _primary_ and _secondary_ properties.
+The overloads without a fifth argument use `global` as the _secondary_ property.
+Constructs a _range_ access property _requesting_ the properties to be **approximately** applied to memory accesses as follows:
+
+  * secondary property to accesses in the address range `[max(0, ptr + leading_bytes - total_bytes), ptr)`
+  * primary property to accesses in the address range `[ptr, ptr + leading_bytes)`
+  * secondary property to accesses in the address range `[ptr + leading_bytes, ptr + total_bytes)`
+
+**Note**: This property enables three main use cases:
+
+1. Unary range `[ptr, ptr + total_bytes)` with the primary property, by using `leading_bytes == total_bytes`.
+2. Binary range `[ptr, ptr + leading_bytes)` and `[ptr + leading_bytes, ptr + total_bytes)` with the primary and secondary properties, by not accessing any memory in the range `[max(0, ptr + leading_bytes - total_bytes), ptr)` through this property.
+3. Primary range with secondary "halo" ranges (see the example below).
+   Given `leading_bytes` for the primary range and `halo_bytes` for the size of each of the secondary ranges, this is achieved by using `total_bytes == leading_bytes + halo_bytes`:
+
+   ```c++
+    ____________________________________________________________
+   | halo / secondary | leading / primary   | halo / secondary |
+    ------------------------------------------------------------
+                      ^
+                      | ptr
+
+   |<-- halo_bytes -->|<-- leading_bytes -->|<-- halo_bytes -->|
+                      |<--           total_bytes            -->|
+   ```
+
+## Conversion operators
+
+```cuda
+__host__ __device__ constexpr access_property::normal::operator cudaAccessProperty() const noexcept;
+__host__ __device__ constexpr access_property::streaming::operator cudaAccessProperty() const noexcept;
+__host__ __device__ constexpr access_property::persisting::operator cudaAccessProperty() const noexcept;
+```
+
+**Returns**: the corresponding CUDA Runtime [`cudaAccessProperty`] value.
+
+**Note**: Allows `constexpr cuda::access_property::normal{}`,
+`cuda::access_property::streaming{}`, and `cuda::access_property::persisting{}`
+to be used in lieu of the corresponding CUDA Runtime [`cudaAccessProperty`]
+enumerated values.
+
+## Mapping of access properties to NVVM-IR and the PTX ISA
+
+> **WARNING**: The implementation makes **no guarantees** about the content of this section; it can change at any time.
+
+When `cuda::access_property` is applied to a memory operation, it sometimes maps to one of the cache eviction priorities and cache hints introduced in the [PTX ISA Version 7.4]; see [Cache Eviction Priority Hints]:
+
+* `global`: `evict_unchanged`
+* `normal`: `evict_normal`
+* `persisting`: `evict_last`
+* `streaming`: `evict_first`
+
+When using `shared` and `global`, the pointer being accessed can be assumed to point to memory in the `shared` and `global` address spaces, respectively.
+This is exploited for optimization purposes in NVVM-IR.
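+
+As an illustration of the conversion operators above, the following sketch uses the property tags in place of the [`cudaAccessProperty`] enumerators when configuring an L2 access policy window. It assumes the CUDA Runtime stream-attribute API (`cudaStreamAttrValue`, `cudaStreamSetAttribute`, CUDA 11 or newer); `set_l2_policy`, `stream`, `data`, and `num_bytes` are placeholder names:
+
+```cuda
+#include <cuda/annotated_ptr>
+#include <cuda_runtime.h>
+
+void set_l2_policy(cudaStream_t stream, void* data, size_t num_bytes) {
+  cudaStreamAttrValue attr{};
+  attr.accessPolicyWindow.base_ptr  = data;
+  attr.accessPolicyWindow.num_bytes = num_bytes;
+  attr.accessPolicyWindow.hitRatio  = 1.0f;
+  // The conversion operators let the property tags stand in for the
+  // cudaAccessProperty enumerators:
+  attr.accessPolicyWindow.hitProp  = cuda::access_property::persisting{};
+  attr.accessPolicyWindow.missProp = cuda::access_property::streaming{};
+  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
+}
+```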
+ +## Example + +```cuda +#include + +__global__ void undefined_behavior(int* global) { + // Associating pointers with mismatching address spaces is undefined: + cuda::associate_access_property(global, cuda::access_property::shared{}); // undefined behavior + __shared__ int shmem; + cuda::associate_access_property(&shmem, cuda::access_property::normal{}); // undefined behavior + cuda::associate_access_property(&shmem, cuda::access_property::streaming{}); // undefined behavior + cuda::associate_access_property(&shmem, cuda::access_property::persisting{}); // undefined behavior + + cuda::access_property interleaved_implicit_global(cuda::access_property::streaming{}, 0.5); + cuda::associate_access_property(&shmem, interleaved_implicit_global); // undefined behavior + + cuda::access_property range_implicit_global0(&shmem, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::associate_access_property(&shmem, range_implicit_global0); // undefined behavior + + // Using a zero probability or probability out-of-range (0, 1] is undefined: + cuda::access_property interleaved(cuda::access_property::streaming{}, 0.0); // undefined behavior +} + +__global__ void correct(int* global) { + __shared__ int shmem; + cuda::associate_access_property(&shmem, cuda::access_property::shared{}); + + cuda::access_property global_range0(global, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::associate_access_property(global, global_range0); + + cuda::access_property global_interleaved(cuda::access_property::streaming{}, 1.0); + cuda::associate_access_property(global, global_interleaved); + + // Access properties can be constructed for any address range + cuda::access_property global_range1(global, 0, sizeof(int), cuda::access_property::streaming{}); + cuda::access_property global_range2(nullptr, 0, sizeof(int), cuda::access_property::streaming{}); +} + +__global__ void range(int* g, size_t n) { + // To apply a single property to all elements in the range [g, g+n), set leading_bytes = total_bytes = n + auto range_property = cuda::access_property(g, n, n, cuda::access_property::persisting{}); +} + +__global__ void range_with_halos(int* g, size_t n, size_t halos) { + // In the range [g, g + n), the first and last "halos" elements of `int` type are halos. 
+  // This example applies one property to the halo elements, and another property to the internal elements:
+  // - halos: streaming (secondary property)
+  // - internal: persisting (primary property)
+
+  auto internal_property = cuda::access_property::persisting{};
+  auto halo_property = cuda::access_property::streaming{};
+
+  // For the range property, the pointer used to build the property
+  // must satisfy p = g + halos:
+  int* p = g + halos;
+  // Then, "total_bytes" (= total_elements * sizeof(int)) must satisfy:
+  //   g + n = p + total_elements
+  int total_bytes = (g + n - p) * sizeof(int);
+  // Finally, "leading_bytes" (= leading_elements * sizeof(int)) must satisfy:
+  //   g = p + leading_elements - total_elements
+  int leading_bytes = (g - p) * sizeof(int) + total_bytes;
+
+  // The resulting property can be used for the halo exchange:
+  auto range_property = cuda::access_property(p, leading_bytes, total_bytes, internal_property, halo_property);
+}
+```
+
+[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }}
+[`cuda::access_property`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }}
+[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }}
+[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }}
+[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+
+[`LiteralType`]: https://en.cppreference.com/w/cpp/named_req/LiteralType
+[`cudaAccessProperty`]: https://docs.nvidia.com/cuda/cuda-runtime-api
+[PTX ISA Version 7.4]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-isa-version-7-4
+[Cache Eviction Priority Hints]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-eviction-priority-hints
+
diff --git a/docs/extended_api/memory_access_properties/annotated_ptr.md b/docs/extended_api/memory_access_properties/annotated_ptr.md
new file mode 100644
index 0000000000..a5148547ce
--- /dev/null
+++ b/docs/extended_api/memory_access_properties/annotated_ptr.md
@@ -0,0 +1,290 @@
+---
+parent: Memory access properties
+grand_parent: Extended API
+nav_order: 1
+---
+
+# `cuda::annotated_ptr`
+
+Defined in header `<cuda/annotated_ptr>`:
+
+```cuda
+namespace cuda {
+template <typename Type, typename Property>
+class annotated_ptr;
+} // namespace cuda
+```
+
+**Mandates**: `Property` is one of:
+
+* [`cuda::access_property::shared`],
+* [`cuda::access_property::global`],
+* [`cuda::access_property::persisting`],
+* [`cuda::access_property::normal`],
+* [`cuda::access_property::streaming`], or
+* [`cuda::access_property`] (a type-erased property with a runtime value).
+
+_Note_: if `Property` is [`cuda::access_property`], i.e. a dynamic property with a runtime value, then `sizeof(cuda::annotated_ptr) == 2 * sizeof(Type*)`.
Otherwise, its size is `sizeof(Type*)`. + +The class template [`cuda::annotated_ptr`] is a pointer annotated with an access property that _may_ be applied to memory operations performed through the [`cuda::annotated_ptr`]. + +In contrast with [`cuda::associate_access_property`], [`cuda::annotated_ptr`] maintains the association when passed through ABI boundaries, e.g., calling a non-inlined library function with a [`cuda::annotated_ptr`] argument. + +It implements a pointer-like interface: + +| Pointer Expression | `cuda::annotated_ptr` | Description | +|=====================|===========================================|=============================================| +| `T* a` | `cuda::annotated_ptr a` | non-`const` pointer to non-`const` memory | +| `T const * a` | `cuda::annotated_ptr a` | non-`const` pointer to `const` memory | +| `T* const a` | `const cuda::annotated_ptr a` | `const` pointer to non-`const` memory | +| `T const* const a` | `const cuda::annotated_ptr a` | `const` pointer to `const` memory | +| `val = *a;` | `val = *a;` | dereference operator to load an element | +| `*a = val;` | `*a = val;` | dereference operator to store an element | +| `val = a[n];` | `val = a[n];` | subscript operator to load an element | +| `a[n] = val;` | `a[n] = val;` | subscript operator to store an element | +| `T* a = nullptr;` | `annotated_ptr a = nullptr;` | `nullptr` initialization | +| `n = a - b;` | `n = a - b;` | difference operator | +| `if (a) { ... }` | `if (a) { ... }` | explicit bool conversion | + +But it is not a drop-in replacement for pointers since, among others, it does not: + +* model any [`Iterator`] concept, +* implement [`std::pointer_traits`], [`std::iterator_traits`], etc. +* have the same variance as pointer. + +```cuda +namespace cuda { + +template +class annotated_ptr { +public: + using value_type = Type; + using size_type = std::size_t; + using reference = value_type &; + using pointer = value_type *; + using const_pointer = value_type const *; + using difference_type = std::ptrdiff_t; + + __host__ __device__ constexpr annotated_ptr() noexcept; + __host__ __device__ constexpr annotated_ptr(annotated_ptr const&) noexcept = default; + __host__ __device__ constexpr annotated_ptr& operator=(annotated_ptr const&) noexcept = default; + __host__ __device__ explicit annotated_ptr(pointer); + template + __host__ __device__ annotated_ptr(pointer, RuntimeProperty); + template + __host__ __device__ annotated_ptr(annotated_ptr const&); + + __host__ __device__ constexpr explicit operator bool() const noexcept; + __host__ __device__ pointer get() const noexcept; + + __host__ __device__ reference operator*() const; + __host__ __device__ pointer operator->() const; + __host__ __device__ reference operator[](std::ptrdiff_t) const; + __host__ __device__ constexpr difference_type operator-(annotated_ptr); + +private: + pointer ptr; // exposition only + Property prop; // exposition only +}; + +} // namespace cuda +``` + +## Constructors and assignment + +### Default constructor + +```cuda +constexpr annotated_ptr() noexcept; +``` + +**Effects**:  as if constructed by `annotated_ptr(nullptr)`; + +### Constructor from pointer + +```cuda +constexpr explicit annotated_ptr(pointer ptr); +``` + +**Preconditions**: + +* if `Property` is [`cuda::access_property::shared`] then `ptr` must be a generic pointer that is valid to cast to a pointer to the shared memory address space. 
+* if `Property` is [`cuda::access_property::global`], [`cuda::access_property::normal`], [`cuda::access_property::streaming`], [`cuda::access_property::persisting`], or [`cuda::access_property`] then `ptr` must be a generic pointer that is valid to cast to a pointer to the global memory address space. + +**Effects**:  Constructs an `annotated_ptr` requesting associating `ptr` with `Property`. +If `Property` is [`cuda::access_property`] then `prop` is initialized with [`cuda::access_property::global`]. + +**Note**: in **Preconditions** "valid" means that casting the generic pointer to the corresponding address space does not introduce undefined behavior. + +### Constructor from pointer and access property + +```cuda +template +annotated_ptr(pointer ptr, RuntimeProperty prop); +``` + +**Mandates**: + +* `Property` is [`cuda::access_property`]. +* `RuntimeProperty` is any of [`cuda::access_property::global`], [`cuda::access_property::normal`], [`cuda::access_property::streaming`], [`cuda::access_property::persisting`], or [`cuda::access_property`]. + +**Preconditions**: `ptr` is a pointer to a valid allocation in the global memory address space. + +**Effects**:  Constructs an `annotated_ptr` requesting the association of `ptr` with the property `prop`. + +# Copy constructor from a different `annotated_ptr` + +```cuda +template +constexpr annotated_ptr(annotated_ptr const& a); +``` + +**Mandates**: + +* `annotated_ptr::pointer` is assignable from `annotated_ptr::pointer`. +* `Property` is either [`cuda::access_property`] or `P`. +* `Property` and `P` specify the same memory space. + +**Preconditions**: `pointer` is compatible with `Property`. + +**Effects**: Constructs an `annotated_ptr` for the same pointer as the input `annotated_ptr`. + + +## Explicit conversion operator to `bool` + +```cuda +constexpr operator bool() const noexcept; +``` + +**Returns**: `false` if the pointer is a `nullptr`, `true` otherwise. + + +## Raw pointer access + +```cuda +pointer get() const noexcept; +``` + +**Returns**: A pointer derived from the `annotated_ptr`. + +## Operators + +### Dereference + +```cuda +reference operator*() const; +``` + +**Preconditions**: The `annotated_ptr` is not null and points to a valid `T` value. + +**Returns**: [`*cuda::associate_access_property(ptr, prop)`][`cuda::associate_access_property`] + +### Pointer-to-member + +```cuda +pointer operator->() const; +``` + +**Preconditions**: the `annotated_ptr` is not null. + +**Returns**: [`cuda::associate_access_property(ptr, prop)`][`cuda::associate_access_property`] + +### Subscript + +```cuda +reference operator[](ptrdiff_t i) const; +``` + +**Preconditions**: `ptr` points to a valid allocation of at least size `[ptr, ptr+i]`. + +**Returns**: [`*cuda::associate_access_property(ptr+i,prop)`][`cuda::associate_access_property`] + +### Pointer distance + +```cuda +constexpr difference_type operator-(annotated_ptr p) const; +``` + +**Preconditions**: `ptr` and `p` point to the same allocation. + +**Returns**: as-if `get() - p.get()`. 
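+
+For instance, a minimal sketch of the pointer-like interface with the `shared` memory-space property (the kernel and variable names are illustrative; the kernel assumes a 256-thread block):
+
+```cuda
+#include <cuda/annotated_ptr>
+
+__global__ void shared_annotated(int* out) {
+  __shared__ int smem[256];
+  // Annotate a shared-memory pointer so accesses may be optimized for that address space:
+  cuda::annotated_ptr<int, cuda::access_property::shared> s{smem};
+
+  s[threadIdx.x] = static_cast<int>(threadIdx.x);  // subscript store
+  __syncthreads();
+
+  int first = *s;                  // dereference load of element 0
+  out[threadIdx.x] = first + s[threadIdx.x];
+
+  int* raw = s.get();              // recover a raw pointer when needed
+  (void)raw;
+}
+```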
+
+## Example
+
+Given three input and output vectors `x`, `y`, and `z`, and two arrays of coefficients `a` and `b`, all of length `N`:
+
+```cuda
+size_t N;
+int* x, *y, *z;
+int* a, *b;
+```
+
+the grid-strided kernel:
+
+```cuda
+__global__ void update(int* const x, int const* const a, int const* const b, size_t N) {
+  auto g = cooperative_groups::this_grid();
+  for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
+    x[idx] = a[idx] * x[idx] + b[idx];
+  }
+}
+```
+
+updates `x`, `y`, and `z` as follows:
+
+```cuda
+update<<<grid, block>>>(x, a, b, N);
+update<<<grid, block>>>(y, a, b, N);
+update<<<grid, block>>>(z, a, b, N);
+```
+
+The elements of `a` and `b` are used in all kernels.
+If `N` is large enough, elements of `a` and `b` might be evicted from the L2 cache, requiring these to be re-loaded from memory in the next `update`.
+
+We can make the `update` kernel generic to allow the caller to pass [`cuda::annotated_ptr`] objects that hint at how memory will be accessed:
+
+```cuda
+template <typename PointerX, typename PointerA, typename PointerB>
+__global__ void update_template(PointerX x, PointerA a, PointerB b, size_t N) {
+  auto g = cooperative_groups::this_grid();
+  for (int idx = g.thread_rank(); idx < N; idx += g.size()) {
+    x[idx] = a[idx] * x[idx] + b[idx];
+  }
+}
+```
+
+With [`cuda::annotated_ptr`], the caller can then specify the temporal locality of the memory accesses:
+
+```cuda
+// Frequent accesses to "a" and "b"; infrequent accesses to "x" and "y":
+cuda::annotated_ptr<int, cuda::access_property::persisting> a_p{a}, b_p{b};
+cuda::annotated_ptr<int, cuda::access_property::streaming> x_s{x}, y_s{y};
+update_template<<<grid, block>>>(x_s, a_p, b_p, N);
+update_template<<<grid, block>>>(y_s, a_p, b_p, N);
+
+// Infrequent accesses to "a" and "b"; frequent accesses to "z":
+cuda::annotated_ptr<int, cuda::access_property::streaming> a_s{a}, b_s{b};
+cuda::annotated_ptr<int, cuda::access_property::persisting> z_p{z};
+update_template<<<grid, block>>>(z_p, a_s, b_s, N);
+
+// Different kernel, "update_z", uses "z" again one last time.
+// Since "z" was accessed as "persisting" by the previous kernel,
+// parts of it are more likely to have previously survived in the L2 cache.
+update_z<<<grid, block>>>(z, ...);
+```
+
+Notice how the raw pointers to `a` and `b` can be wrapped by both `cuda::annotated_ptr<int, cuda::access_property::persisting>` and `cuda::annotated_ptr<int, cuda::access_property::streaming>`, and accesses through each pointer apply the corresponding access property.
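+
+When the access pattern is only known at runtime, the type-erased `cuda::annotated_ptr<T, cuda::access_property>` can carry a dynamic property instead. A small sketch reusing `update_template` from above (`launch_update`, `grid`, and `block` are placeholder names):
+
+```cuda
+#include <cuda/annotated_ptr>
+
+void launch_update(int* z, int* a, int* b, size_t N, bool reuse_z_soon) {
+  // Select the residence-control property at runtime:
+  cuda::access_property z_prop = reuse_z_soon
+      ? cuda::access_property(cuda::access_property::persisting{})
+      : cuda::access_property(cuda::access_property::streaming{});
+
+  // Type-erased annotated pointers (twice the size of a raw pointer, see the note above):
+  cuda::annotated_ptr<int, cuda::access_property> z_dyn{z, z_prop};
+  cuda::annotated_ptr<int, cuda::access_property> a_dyn{a, cuda::access_property::streaming{}};
+  cuda::annotated_ptr<int, cuda::access_property> b_dyn{b, cuda::access_property::streaming{}};
+
+  update_template<<<grid, block>>>(z_dyn, a_dyn, b_dyn, N);
+}
+```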
+
+[`Iterator`]: https://en.cppreference.com/w/cpp/iterator
+[`std::pointer_traits`]: https://en.cppreference.com/w/cpp/memory/pointer_traits
+[`std::iterator_traits`]: https://en.cppreference.com/w/cpp/iterator/iterator_traits
+
+[`cuda::annotated_ptr`]: {{ "extended_api/memory_access_properties/annotated_ptr.html" | relative_url }}
+[`cuda::access_property`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }}
+[`cuda::associate_access_property`]: {{ "extended_api/memory_access_properties/associate_access_property.html" | relative_url }}
+[`cuda::apply_access_property`]: {{ "extended_api/memory_access_properties/apply_access_property.html" | relative_url }}
+[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
+[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }}
diff --git a/docs/extended_api/memory_access_properties/apply_access_property.md b/docs/extended_api/memory_access_properties/apply_access_property.md
new file mode 100644
index 0000000000..749d2208b6
--- /dev/null
+++ b/docs/extended_api/memory_access_properties/apply_access_property.md
@@ -0,0 +1,107 @@
+---
+parent: Memory access properties
+grand_parent: Extended API
+nav_order: 3
+---
+
+# `cuda::apply_access_property`
+
+```cuda
+template <typename ShapeT>
+__host__ __device__
+void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::persisting) noexcept;
+template <typename ShapeT>
+__host__ __device__
+void apply_access_property(void const volatile* ptr, ShapeT shape, cuda::access_property::normal) noexcept;
+```
+
+**Mandates**: [`ShapeT`] is either [`std::size_t`] or [`cuda::aligned_size_t`].
+
+**Preconditions**: `ptr` points to a valid allocation for `shape` in the global memory address space.
+
+**Effects**: no effects.
+
+**_Hint_**: to prefetch `shape` bytes of memory starting at `ptr` while applying a property. Two properties are supported:
+
+* [`cuda::access_property::persisting`]
+* [`cuda::access_property::normal`]
+
+
+**Note**: in **Preconditions**, a "valid allocation for `shape`" means that:
+
+* if `ShapeT` is `cuda::aligned_size_t<N>(sz)`, then `ptr` is aligned to an `N`-byte alignment boundary, and
+* for all offsets `i` in the extent of `shape`, i.e., `i` in `[0, shape)`, the expression `*(ptr + i)` does not exhibit undefined behavior.
+
+**Note**: currently `apply_access_property` is ignored by nvcc and nvc++ on the host.
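+
+As a brief sketch of the two supported shapes (the helper names and sizes are illustrative; `cuda::aligned_size_t` is assumed to be available via `<cuda/barrier>`):
+
+```cuda
+#include <cuda/annotated_ptr>
+#include <cuda/barrier>  // assumed to provide cuda::aligned_size_t
+
+__device__ void pin_row(float4 const* row, size_t n) {
+  // Plain byte count as the shape:
+  cuda::apply_access_property(row, n * sizeof(float4), cuda::access_property::persisting{});
+}
+
+__device__ void unpin_row(float4 const* row, size_t n) {
+  // float4 data is 16-byte aligned and n * sizeof(float4) is a multiple of 16,
+  // so the more constrained cuda::aligned_size_t<16> shape can be used instead:
+  cuda::apply_access_property(row, cuda::aligned_size_t<16>(n * sizeof(float4)),
+                              cuda::access_property::normal{});
+}
+```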
+ +# Example + +Given three input and output vectors `x`, `y`, and `z`, and two arrays of coefficients `a` and `b`, all of length `N`: + +```cuda +size_t N; +int* x, *y, *z; +int* a, *b; +``` + +the grid-strided kernel: + +```cuda +__global__ void update(int* const x, int const* const a, int const* const b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + x[idx] = a[idx] * x[idx] + b[idx]; + } +} +``` + +updates `x`, `y`, and `z` as follows: + +```cuda +update<<>>(x, a, b, N); +update<<>>(y, a, b, N); +update<<>>(z, a, b, N); +``` + +The elements of `a` and `b` are used in all kernels. +For certain values of `N`, this may prevent parts of `a` and `b` from being evicted from the L2 cache, avoiding reloading these from memory in the subsequent `update` kernel. + +With [`cuda::access_property`] and [`cuda::apply_access_property`], we can write kernels that specify that `a` and `b` are accessed more often than (`pin`) and as often as (`unpin`) other data: + +```cuda +__global__ void pin(int* a, int* b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::persisting{}); + cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::persisting{}); + } +} +__global__ void unpin(int* a, int* b, size_t N) { + auto g = cooperative_groups::this_grid(); + for (int idx = g.thread_rank(); idx < N; idx += g.size()) { + cuda::apply_access_property(a + idx, sizeof(int), cuda::access_property::normal{}); + cuda::apply_access_property(b + idx, sizeof(int), cuda::access_property::normal{}); + } +} +``` + +which we can launch before and after the `update` kernels: + +```cuda +pin<<>>(a, b, N); +update<<>>(x, a, b, N); +update<<>>(y, a, b, N); +update<<>>(z, a, b, N); +unpin<<>>(a, b, N); +``` + +This does not require modifying the `update` kernel, and for certain values of `N` prevents `a` and `b` from having to be re-loaded from memory. + +The `pin` and `unpin` kernels can be fused into the kernels for the `x` and `z` updates by modifying these kernels. + +[`std::size_t`]: https://en.cppreference.com/w/cpp/types/size_t +[`ShapeT`]: {{ "extended_api/shapes.html" | relative_url }} +[`cuda::aligned_size_t`]: {{ "extended_api/shapes/aligned_size_t.html" | relative_url }} +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/associate_access_property.md b/docs/extended_api/memory_access_properties/associate_access_property.md new file mode 100644 index 0000000000..91580e8ed4 --- /dev/null +++ b/docs/extended_api/memory_access_properties/associate_access_property.md @@ -0,0 +1,53 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 4 +--- + +# `cuda::associate_access_property` + +```cuda +template +__host__ __device__ +T* associate_access_property(T* ptr, Property prop); +``` + +**Preconditions**: +* if `Property` is [`cuda::access_property::shared`] then it must be valid to cast the generic pointer `ptr` to a pointer to the shared memory address space. 
+* if `Property` is one of [`cuda::access_property::global`], [`cuda::access_property::persisting`], [`cuda::access_property::normal`], or [`cuda::access_property::streaming`] then it must be valid to cast the generic pointer `ptr` to a pointer to the global memory address space. +* if `Property` is a [`cuda::access_property`] of "range" kind, then `ptr` must be in the valid range. + +**Mandates**: `Property` is convertible to [`cuda::access_property`]. + +**Effects**: no effects. + +**_Hint_**: to associate an access property with the returned pointer, such that subsequent memory operations with the returned pointer _or_ pointers derived from it _may_ apply the access property. + + * The "association" is _not_ part of the value representation of the pointer. + * The compiler is allowed to drop the association; it does not have a functional consequence. + * The association _may_ hold through simple expressions, sequence of simple statements, or fully inlined function calls where the pointer value or C++ reference is provably unchanged; this includes offset pointers used for array access. + * The association is _not_ expected to hold through the ABI of an unknown function call, e.g., when the pointer is passed through a separately-compiled function interface, unless link-time optimizations are used. + +**Note**: currently `associate_access_property` is ignored by nvcc and nvc++ on the host; but this might change any time. + +# Example + +```cuda +#include +__global__ void memcpy(int const* in_, int* out) { + int const* in = cuda::associate_access_property(in_, cuda::access_property::streaming{}); + auto idx = cooperative_groups::this_grid().thread_rank(); + + __shared__ int shmem[N]; + shmem[threadIdx.x] = in[idx]; // streaming access + + // compute... +} +``` + +[`cuda::access_propety`]: {{ "extended_api/memory_access_properties/access_property.html" | relative_url }} +[`cuda::access_property::persisting`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::streaming`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::normal`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::global`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} +[`cuda::access_property::shared`]: {{ "extended_api/memory_access_properties/access_property.html#kinds-of-access-properties" | relative_url }} diff --git a/docs/extended_api/memory_access_properties/discard_memory.md b/docs/extended_api/memory_access_properties/discard_memory.md new file mode 100644 index 0000000000..5652b9e394 --- /dev/null +++ b/docs/extended_api/memory_access_properties/discard_memory.md @@ -0,0 +1,46 @@ +--- +parent: Memory access properties +grand_parent: Extended API +nav_order: 5 +--- + +# `cuda::discard_memory` + +```cuda +__device__ void discard_memory(void volatile* ptr, size_t nbytes); +``` + +**Preconditions**: `ptr` points to a valid allocation of size greater or equal to `nbytes`. + +**Effects**: equivalent to `memset(ptr, _indeterminate_, nbytes)`. + +**Hint**: to discard modified cache lines without writing back the cached data to memory. +Enables using global memory as temporary scratch space. +Does **not** generate any HW store operations. 
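+
+For instance, a minimal sketch of discarding a temporary global buffer after its last read, so that its dirty cache lines need not be written back (the names are placeholders; `discard_memory` is assumed to be provided by `<cuda/annotated_ptr>`):
+
+```cuda
+#include <cuda/annotated_ptr>
+
+// `tmp` is global-memory scratch whose contents are dead after this kernel.
+__global__ void finish_with_scratch(float* out, float const* in, float* tmp, size_t n) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) {
+    out[i] = tmp[i] + in[i];                    // last read of the scratch data
+    // Hint that this thread's scratch bytes are dead, so their cache lines
+    // need not be flushed back to global memory:
+    cuda::discard_memory(tmp + i, sizeof(float));
+  }
+}
+```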
+ +# Example + +This kernel needs a scratch pad that does not fit in shared memory, so it uses an allocation in global memory instead: + +```cuda +#include +__device__ int compute(int* scratch, size_t N); + +__global__ void kernel(int const* in, int* out, int* scratch, size_t N) { + // Each thread reads N elements into the scratch pad: + for (int i = 0; i < N; ++i) { + int idx = threadIdx.x + i * blockDim.x; + scratch[idx] = in[idx]; + } + __syncthreads(); + + // All threads compute on the scratch pad: + int result = compute(scratch, N); + + // All threads discard the scratch pad memory to _hint_ that it does not need to be flushed from the cache: + cuda::discard_memory(scratch + threadIdx.x * N, N * sizeof(int)); + __syncthreads(); + + out[threadIdx.x] = result; +} +``` diff --git a/docs/readme.md b/docs/readme.md new file mode 100644 index 0000000000..a3e99a2ff9 --- /dev/null +++ b/docs/readme.md @@ -0,0 +1,9 @@ +libcu++ documentation +=== + +To build and serve the documentation as a website to `http://localhost:4000` just execute the following from libcudacxx root directory: + +```shell +./docs/serve +``` + diff --git a/docs/serve b/docs/serve new file mode 100755 index 0000000000..ff9c5cbb6a --- /dev/null +++ b/docs/serve @@ -0,0 +1,9 @@ +#!/usr/bin/env sh + +set -ex + +( + cd docs + docker build -f Dockerfile -t libcudacxx:docs . + docker run --rm -p 4000:4000 -v $(pwd):/srv/jekyll -u $(id -u):$(id -g) -it libcudacxx:docs bash -c "jekyll serve --watch --host 0.0.0.0" +)