From 6b015ef95287d6e6442ff9acdf49d5c52aed7770 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 10 May 2022 03:13:56 -0700 Subject: [PATCH 1/8] Memory model documentation --- docs/extended_api.md | 2 +- docs/extended_api/memory_model.md | 186 ++++++++++++++++++ .../synchronization_primitives/atomic.md | 4 +- docs/extended_api/thread_scopes.md | 154 --------------- 4 files changed, 190 insertions(+), 156 deletions(-) create mode 100644 docs/extended_api/memory_model.md delete mode 100644 docs/extended_api/thread_scopes.md diff --git a/docs/extended_api.md b/docs/extended_api.md index d81d9bb63d..952b7c81e5 100644 --- a/docs/extended_api.md +++ b/docs/extended_api.md @@ -21,6 +21,6 @@ nav_order: 3 {% include_relative extended_api/functional.md %} -[Thread Scopes]: ./extended_api/thread_scopes.md +[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes [Thread Groups]: ./extended_api/thread_groups.md diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md new file mode 100644 index 0000000000..7e8c875ac6 --- /dev/null +++ b/docs/extended_api/memory_model.md @@ -0,0 +1,186 @@ +--- +parent: Extended API +nav_order: 0 +--- + +# Memory model + +Standard C++ presents a view that the cost to synchronize threads is uniform and low. + +CUDA C++ is different: the cost to synchronize threads grows as threads are further appart. +It is low across threads within a block, but high across arbitrary threads in the system running on multiple GPUs and CPUs. + +To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. + +## Threads + +In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. 
+ +Examples of threads of execution in CUDA C++ programs: GPU or CPU threads, threads in DMA engines executing asynchronous operations like [`memcpy_async`] or [`cudaMemcpyAsync`], threads in NICs executing RDMA operations, etc. + +[`memcpy_async`]: asynchronous_operations/memcpy_async.md +[`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 + + +## Thread Scopes + +A _thread scope_ specifies the kind of threads that can synchronize with each other using synchronization primitive such as [`atomic`] or [`barrier`]. + +```cuda +namespace cuda { + +enum thread_scope { + thread_scope_system, + thread_scope_device, + thread_scope_block, + thread_scope_thread +}; + +} // namespace cuda +``` + +[`atomic`]: synchronization_primitives/atomic.md +[`barrier`]: synchronization_primitives/barrier.md + +### Scope Relationships + +Each program thread is related to each other program thread by one or more thread scope relations: +- Each thread in the system is related to each other thread in the system by the *system* thread scope: `thread_scope_system`. +- Each GPU thread is related to each other GPU thread in the same CUDA device by the *device* thread scope: `thread_scope_device`. +- Each GPU thread is related to each other GPU thread in the same CUDA thread block by the *block* thread scope: `thread_scope_block`. +- Each thread is related to itself by the `thread` thread scope: `thread_scope_thread`. + +## Synchronization primitives + +Types in namespaces `std::` and `cuda::std::` have the same behavior as corresponding types in namespace `cuda::` when instantiated with a scope of `cuda::thread_scope_system`. 
+ +## Atomicity + +An atomic operation is atomic at the scope it specifies if: +- it specifies a scope other than `thread_scope_system`, **or** + +the scope is `thread_scope_system` and: + +- it affects an object in [unified memory] and [`concurrentManagedAccess`] is `1`, **or** +- it affects an object in CPU memory and [`hostNativeAtomicSupported`] is `1`, **or** +- it is a load or store that affects a naturally-aligned object of sizes `1`, `2`, `4`, or `8` bytes on [mapped memory], **or** +- it affects an object in GPU memory and only GPU threads access it. + +[mapped memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#mapped-memory + +Refer to the [CUDA programming guide] for more information on [unified memory], [mapped memory], CPU memory, and GPU peer memory. + +[mapped memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#mapped-memory +[unified memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd +[CUDA programming guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html +[`concurrentManagedAccess`]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b +[`hostNativeAtomicSupported`]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f + +## Data Races + +Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results inundefined behavior. [...] 
+ +Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 4. Concurrent invocations of the member functions of `barrier`, other than its destructor, do not introduce data races ***as if they were atomic operations***. [...] + +Modify [thread.latch.class paragraph 2] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 2. Concurrent invocations of the member functions of `latch`, other than its destructor, do not introduce data races ***as if they were atomic operations***. + +Modify [thread.sema.cnt paragraph 3] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 3. Concurrent invocations of the member functions of `counting_semaphore`, other than its destructor, do not introduce data races ***as if they were atomic operations***. + +Modify [thread.stoptoken.intro paragraph 5] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> Calls to the functions request_­stop, stop_­requested, and stop_­possible do not introduce data races ***as if they were atomic operations***. [...] + +[thread.stoptoken.intro paragraph 5]: https://eel.is/c++draft/thread#stoptoken.intro-5 + +Modify [atomics.fences paragraph 2 through 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> A release fence A synchronizes with an acquire fence B if there exist atomic +> operations X and Y, both operating on some atomic object M, such that A is +> sequenced before X, X modifies M, Y is sequenced before B, and Y reads the +> value written by X or a value written by any side effect in the hypothetical +> release sequence X would head if it were a release operation, +> ***and each operation (A, B, X, and Y) specifies a scope that includes the thread that performed each other operation***. 
+ +> A release fence A synchronizes with an atomic operation B that performs an +> acquire operation on an atomic object M if there exists an atomic operation X +> such that A is sequenced before X, X modifies M, and B reads the value +> written by X or a value written by any side effect in the hypothetical +> release sequence X would head if it were a release operation, +> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. + +> An atomic operation A that is a release operation on an atomic object M +> synchronizes with an acquire fence B if there exists some atomic operation X +> on M such that X is sequenced before B and reads the value written by A or a +> value written by any side effect in the release sequence headed by A, +> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. + +## Example: Message Passing + +The following example passes a message stored to the `x` variable by a thread in block `0` to a thread in block `1` via the flag `f`: + + + + + + + + + + +
+`int x = 0;`
+`int f = 0;` +
+**Thread 0 Block 0** + +**Thread 0 Block 1** +
+`x = 42;`
+`cuda::atomic_ref flag(f);`
+`flag.store(1, memory_order_release);` +
+`cuda::atomic_ref flag(f);`
+`while(flag.load(memory_order_acquire) != 1);`
+`assert(x == 42);` +
+ +In the following variation of the previous example, two threads concurrently access the `f` object without synchronization, which leads to a **data race**, and exhibits **undefined behavior**: + + + + + + + + + + +
+`int x = 0;`
+`int f = 0;` +
+**Thread 0 Block 0** + +**Thread 0 Block 1** +
+`x = 42;`
+`cuda::atomic_ref<int, cuda::thread_scope_block> flag(f);`
+`flag.store(1, memory_order_release); // UB: data race` +
+`cuda::atomic_ref flag(f);`
+`while(flag.load(memory_order_acquire) != 1); // UB: data race`
+`assert(x == 42);` +
+ +While the memory operations on `f` - the store and the loads - are atomic, the scope of the store operation is "block scope". Since the store is performed by Thread 0 of Block 0, it only includes all other threads of Block 0. However, the thread doing the loads is in Block 1, i.e., it is not in a scope included by the store operation performed in Block 0, causing the store and the load to not be "atomic", and introducing a data-race. + +For more examples see the [PTX memory consistency model litmus tests]. + +[PTX memory consistency model litmus tests]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#axioms +[intro.races paragraph 21]: https://eel.is/c++draft/intro.races#21 +[thread.barrier.class paragraph 4]: https://eel.is/c++draft/thread.barrier.class#4 +[thread.latch.class paragraph 2]: https://eel.is/c++draft/thread.latch.class#2 +[thread.sema.cnt paragraph 3]: https://eel.is/c++draft/thread.sema.cnt#3 +[atomics.fences paragraph 2 through 4]: https://eel.is/c++draft/atomics.fences#2 diff --git a/docs/extended_api/synchronization_primitives/atomic.md b/docs/extended_api/synchronization_primitives/atomic.md index 67b18972f8..7d4e57c58a 100644 --- a/docs/extended_api/synchronization_primitives/atomic.md +++ b/docs/extended_api/synchronization_primitives/atomic.md @@ -44,7 +44,7 @@ Under CUDA Compute Capability 6 (Pascal), an object of type `atomic` may not be - if `is_always_lock_free()` is `false`. Under CUDA Compute Capability prior to 6 (Pascal), objects of type - `cuda::atomic` or [`cuda::std::atomic`] may not be used. + [`cuda::atomic`] or [`cuda::std::atomic`] may not be used. 
## Implementation-Defined Behavior @@ -87,6 +87,8 @@ __global__ void example_kernel() { [`cuda::std::atomic`]: https://en.cppreference.com/w/cpp/atomic/atomic +[`cuda::atomic`]: ./atomic.md + [atomics.types.int]: https://eel.is/c++draft/atomics.types.int [atomics.types.pointer]: https://eel.is/c++draft/atomics.types.pointer diff --git a/docs/extended_api/thread_scopes.md b/docs/extended_api/thread_scopes.md deleted file mode 100644 index 67215090e5..0000000000 --- a/docs/extended_api/thread_scopes.md +++ /dev/null @@ -1,154 +0,0 @@ ---- -parent: Extended API -nav_order: 0 ---- - -# Thread Scopes - -```cuda -namespace cuda { - -// Header ``. - -enum thread_scope { - thread_scope_system, - thread_scope_device, - thread_scope_block, - thread_scope_thread -}; - -template -class atomic; - -void atomic_thread_fence(std::memory_order, thread_scope = thread_scope_system); - -// Header ``. - -template -class barrier; - -// Header ``. - -template -class latch; - -// Header ``. - -template -class binary_semaphore; - -template -class counting_semaphore; - -} -``` - -Standard C++ presents a view that the cost to synchronize threads is uniform - and low. -CUDA C++ is different: the overhead is low among threads within a block, and - high across arbitrary threads in the system. - -To bridge these two realities, libcu++ introduces **thread scopes** - to the Standard's concurrency facilities in the `cuda::` namespace, while - retaining the syntax and semantics of Standard C++ by default. -A thread scope specifies the kind of threads that can synchronize with each - other using a primitive such as an `atomic` or a `barrier`. - -## Scope Relationships - -Each program thread is related to each other program thread by one or more - thread scope relations: -- Each thread (CPU or GPU) is related to each other thread in the computer - system by the *system* thread scope, specified with `thread_scope_system`. 
-- Each GPU thread is related to each other GPU thread in the same CUDA device - by the *device* thread scope, specified with `thread_scope_device`. -- Each GPU thread is related to each other GPU thread in the same CUDA block - by the *block* thread scope, specified with `thread_scope_block`. -- Each thread (CPU or GPU) is related to itself by the `thread` thread scope, - specified with `thread_scope_thread`. - -Objects in namespace `cuda::std::` have the same behavior as corresponding - objects in namespace `cuda::` when instantiated with a scope of - `cuda::thread_scope_system`. - -Refer to the [CUDA programming guide] for more information on how CUDA launches - threads into devices and blocks. - -## Atomicity - -An atomic operation is atomic at the scope it specifies if: -- it specifies a scope other than `thread_scope_system`, or -- it affects an object in unified memory and [`concurrentManagedAccess`] is - `1`, or -- it affects an object in CPU memory and [`hostNativeAtomicSupported`] is `1`, - or -- it affects an object in GPU memory and only GPU threads access it. - -Refer to the [CUDA programming guide] for more information on - unified memory, CPU memory, and GPU peer memory. - -## Data Races - -Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as - follows: -> The execution of a program contains a data race if it contains two -> potentially concurrent conflicting actions, at least one of which is not -> atomic -> ***at a scope that includes the thread that performed the other operation***, -> and neither happens before the other, except for the special -> case for signal handlers described below. Any such data race results in -> undefined behavior. [...] - -Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ - Standard) as follows: -> 4. Concurrent invocations of the member functions of `barrier`, other than its -> destructor, do not introduce data races -> ***as if they were atomic operations***. -> [...] 
- -Modify [thread.latch.class paragraph 2] of ISO/IEC IS 14882 (the C++ Standard) - as follows: -> 2. Concurrent invocations of the member functions of `latch`, other than its -> destructor, do not introduce data races -> ***as if they were atomic operations***. - -Modify [thread.sema.cnt paragraph 3] of ISO/IEC IS 14882 (the C++ Standard) as - follows: -> 3. Concurrent invocations of the member functions of `counting_semaphore`, -> other than its destructor, do not introduce data races -> ***as if they were atomic operations***. - -Modify [atomics.fences paragraph 2 through 4] of ISO/IEC IS 14882 (the C++ - Standard) as follows: -> A release fence A synchronizes with an acquire fence B if there exist atomic -> operations X and Y, both operating on some atomic object M, such that A is -> sequenced before X, X modifies M, Y is sequenced before B, and Y reads the -> value written by X or a value written by any side effect in the hypothetical -> release sequence X would head if it were a release operation, -> ***and each operation (A, B, X, and Y) specifies a scope that includes the thread that performed each other operation***. - -> A release fence A synchronizes with an atomic operation B that performs an -> acquire operation on an atomic object M if there exists an atomic operation X -> such that A is sequenced before X, X modifies M, and B reads the value -> written by X or a value written by any side effect in the hypothetical -> release sequence X would head if it were a release operation, -> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. 
- -> An atomic operation A that is a release operation on an atomic object M -> synchronizes with an acquire fence B if there exists some atomic operation X -> on M such that X is sequenced before B and reads the value written by A or a -> value written by any side effect in the release sequence headed by A, -> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. - - -[intro.races paragraph 21]: https://eel.is/c++draft/intro.races#21 -[thread.barrier.class paragraph 4]: https://eel.is/c++draft/thread.barrier.class#4 -[thread.latch.class paragraph 2]: https://eel.is/c++draft/thread.latch.class#2 -[thread.sema.cnt paragraph 3]: https://eel.is/c++draft/thread.sema.cnt#3 -[atomics.fences paragraph 2 through 4]: https://eel.is/c++draft/atomics.fences#2 - -[CUDA programming guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html -[`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b -[`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f From c31c15dba616345e33b524c2db821ae13526c7f1 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 10 Aug 2022 12:09:09 -0700 Subject: [PATCH 2/8] Make site wider to support tables --- docs/_sass/color_schemes/nvidia.scss | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/_sass/color_schemes/nvidia.scss b/docs/_sass/color_schemes/nvidia.scss index edd520bf6c..6bd1ddcbbf 100644 --- a/docs/_sass/color_schemes/nvidia.scss +++ b/docs/_sass/color_schemes/nvidia.scss @@ -15,6 +15,7 @@ pre.highlight code { font-size: 0.9em !important; } $nav-width: 300px; +$content-width: 1000px; $body-background-color: $grey-dk-300; $sidebar-color: $grey-dk-300; From 485a923265d7eeb1367293fe537947877fa5a33e Mon Sep 17 00:00:00 2001 From: 
Gonzalo Brito Gadeschi Date: Wed, 10 Aug 2022 12:09:21 -0700 Subject: [PATCH 3/8] Fix site build scripts --- docs/Dockerfile | 3 ++- docs/serve | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/Dockerfile b/docs/Dockerfile index 972e9c7fe5..e64302f147 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,3 +1,4 @@ FROM jekyll/jekyll:4.0 COPY Gemfile /Gemfile -RUN /bin/bash -l -c "bundle install" \ No newline at end of file +RUN /bin/bash -l -c "bundle install" +RUN /bin/bash -l -c "bundle add webrick" \ No newline at end of file diff --git a/docs/serve b/docs/serve index ff9c5cbb6a..0a5f5f5dd9 100755 --- a/docs/serve +++ b/docs/serve @@ -2,8 +2,13 @@ set -ex +mkdir -p build/docs +( + cd build/docs + cp ../../docs/Gemfile . + docker build -f ../../docs/Dockerfile -t libcudacxx:docs . +) ( cd docs - docker build -f Dockerfile -t libcudacxx:docs . docker run --rm -p 4000:4000 -v $(pwd):/srv/jekyll -u $(id -u):$(id -g) -it libcudacxx:docs bash -c "jekyll serve --watch --host 0.0.0.0" ) From 7ed814f0d7aa23a422ee8ee271ca455bc3ffd568 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:11:54 +0200 Subject: [PATCH 4/8] Fix typo --- docs/extended_api/memory_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 7e8c875ac6..eff7831fe9 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -7,7 +7,7 @@ nav_order: 0 Standard C++ presents a view that the cost to synchronize threads is uniform and low. -CUDA C++ is different: the cost to synchronize threads grows as threads are further appart. +CUDA C++ is different: the cost to synchronize threads grows as threads are further apart. It is low across threads within a block, but high across arbitrary threads in the system running on multiple GPUs and CPUs. 
To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. From 0dab4eee03aa2fb80fce0b4208be425838fcabad Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:12:18 +0200 Subject: [PATCH 5/8] Update docs/extended_api/memory_model.md --- docs/extended_api/memory_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index eff7831fe9..7a3ae5a44f 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -79,7 +79,7 @@ Refer to the [CUDA programming guide] for more information on [unified memory], ## Data Races Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as follows: -> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results inundefined behavior. [...] +> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results in undefined behavior. [...] Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: > 4. 
Concurrent invocations of the member functions of `barrier`, other than its destructor, do not introduce data races ***as if they were atomic operations***. [...] From 1d10a7fb5311443dd5bee949a8d10fef85931b50 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:13:10 +0200 Subject: [PATCH 6/8] Leaving sentence out --- docs/extended_api/memory_model.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 7a3ae5a44f..54a26a1aa2 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -16,7 +16,6 @@ To account for non-uniform thread synchronization costs that are not always low, In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. -Examples of threads of execution in CUDA C++ programs: GPU or CPU threads, threads in DMA engines executing asynchronous operations like [`memcpy_async`] or [`cudaMemcpyAsync`], threads in NICs executing RDMA operations, etc. 
[`memcpy_async`]: asynchronous_operations/memcpy_async.md [`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 From 383837ec283cf4403ae2f29b6ea8e9a984def674 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:22:46 +0200 Subject: [PATCH 7/8] Remove Threads section --- docs/extended_api/memory_model.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 54a26a1aa2..e0e08a54c1 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -12,15 +12,6 @@ It is low across threads within a block, but high across arbitrary threads in th To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. -## Threads - -In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. - - -[`memcpy_async`]: asynchronous_operations/memcpy_async.md -[`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 - - ## Thread Scopes A _thread scope_ specifies the kind of threads that can synchronize with each other using synchronization primitive such as [`atomic`] or [`barrier`]. 
From 2d465e44b520fc10049f4e405b5104ea6d5a4d1e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 21 Nov 2022 15:33:44 -0800 Subject: [PATCH 8/8] Fix broken links in extended_api --- docs/extended_api/asynchronous_operations/memcpy_async.md | 8 +++----- docs/extended_api/synchronization_primitives/atomic.md | 3 +-- docs/extended_api/synchronization_primitives/barrier.md | 3 +-- .../barrier/barrier_native_handle.md | 3 +-- .../synchronization_primitives/barrier/init.md | 3 +-- .../synchronization_primitives/binary_semaphore.md | 2 +- .../synchronization_primitives/counting_semaphore.md | 2 +- docs/extended_api/synchronization_primitives/latch.md | 3 +-- docs/extended_api/synchronization_primitives/pipeline.md | 3 +-- .../synchronization_primitives/pipeline_shared_state.md | 2 +- 10 files changed, 12 insertions(+), 20 deletions(-) diff --git a/docs/extended_api/asynchronous_operations/memcpy_async.md b/docs/extended_api/asynchronous_operations/memcpy_async.md index ebe3773fc2..f5461e4ebc 100644 --- a/docs/extended_api/asynchronous_operations/memcpy_async.md +++ b/docs/extended_api/asynchronous_operations/memcpy_async.md @@ -77,7 +77,7 @@ Both objects are reinterpreted as arrays of `unsigned char`. in the current thread. 4. Binds the asynchronous copy completion to `cuda::pipeline` and cooperatively issues the copy across all threads in `group`. -5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is +5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is either `cuda::barrier` or `cuda::pipeline`. 
## Notes @@ -141,9 +141,7 @@ __global__ void example_kernel(char* dst, char* src) { [_TriviallyCopyable_]: https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable -[_ThreadGroup_]: ./thread_group.md - [`cuda::std::size_t`]: https://en.cppreference.com/w/c/types/size_t -[`cuda::aligned_size_t`]: ./shapes/aligned_size_t.md -[`cuda::pipeline::quit`]: ./pipelines/pipeline/quit.md +[`cuda::aligned_size_t`]: ../shapes/aligned_size_t.md +[`cuda::pipeline::quit`]: ../synchronization_primitives/pipeline/quit.md diff --git a/docs/extended_api/synchronization_primitives/atomic.md b/docs/extended_api/synchronization_primitives/atomic.md index 7d4e57c58a..71cbcfb851 100644 --- a/docs/extended_api/synchronization_primitives/atomic.md +++ b/docs/extended_api/synchronization_primitives/atomic.md @@ -78,7 +78,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/avo3Evbee){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::atomic_thread_fence`]: ./atomic/atomic_thread_fence.md @@ -94,4 +94,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier.md b/docs/extended_api/synchronization_primitives/barrier.md index 17ad379fc6..c1141cc6bd 100644 --- a/docs/extended_api/synchronization_primitives/barrier.md +++ b/docs/extended_api/synchronization_primitives/barrier.md @@ -94,7 +94,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/ehdrY8Kae){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::barrier::init`]: ./barrier/init.md 
[`cuda::device::barrier_native_handle`]: ./barrier/barrier_native_handle.md @@ -107,4 +107,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md b/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md index 7fdbe81b03..be1652eb26 100644 --- a/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md +++ b/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md @@ -44,7 +44,7 @@ __global__ void example_kernel(cuda::barrier& bar) { [See it on Godbolt](https://godbolt.org/z/dr4798Y76){: .btn } -[`cuda::thread_scope`]: ./thread_scopes.md +[`cuda::thread_scope`]: ./memory_model.md [thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 @@ -52,4 +52,3 @@ __global__ void example_kernel(cuda::barrier& bar) { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier/init.md b/docs/extended_api/synchronization_primitives/barrier/init.md index e2e7aa47bc..1b52023108 100644 --- a/docs/extended_api/synchronization_primitives/barrier/init.md +++ b/docs/extended_api/synchronization_primitives/barrier/init.md @@ -55,7 +55,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/jG8se6Kd8){: .btn } -[`cuda::thread_scope`]: 
./thread_scopes.md +[`cuda::thread_scope`]: ./memory_model.md [thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 @@ -63,4 +63,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/binary_semaphore.md b/docs/extended_api/synchronization_primitives/binary_semaphore.md index d0af259177..0ffbb1b20a 100644 --- a/docs/extended_api/synchronization_primitives/binary_semaphore.md +++ b/docs/extended_api/synchronization_primitives/binary_semaphore.md @@ -69,7 +69,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/eKfjYYz58){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::binary_semaphore`]: https://en.cppreference.com/w/cpp/thread/binary_semaphore diff --git a/docs/extended_api/synchronization_primitives/counting_semaphore.md b/docs/extended_api/synchronization_primitives/counting_semaphore.md index 41dbeaa22b..1052c987f7 100644 --- a/docs/extended_api/synchronization_primitives/counting_semaphore.md +++ b/docs/extended_api/synchronization_primitives/counting_semaphore.md @@ -65,7 +65,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/3YrjjTvG6){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::counting_semaphore`]: https://en.cppreference.com/w/cpp/thread/counting_semaphore diff --git a/docs/extended_api/synchronization_primitives/latch.md b/docs/extended_api/synchronization_primitives/latch.md index ce7512072a..d72e4df00e 100644 --- 
a/docs/extended_api/synchronization_primitives/latch.md +++ b/docs/extended_api/synchronization_primitives/latch.md @@ -62,10 +62,9 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/8v4dcK7fa){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::latch`]: https://en.cppreference.com/w/cpp/thread/latch [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/pipeline.md b/docs/extended_api/synchronization_primitives/pipeline.md index 73b01e9ee0..ccb9697050 100644 --- a/docs/extended_api/synchronization_primitives/pipeline.md +++ b/docs/extended_api/synchronization_primitives/pipeline.md @@ -141,7 +141,7 @@ template void __global__ example_kernel(int*, int*, cuda::std::size_t); [asynchronous operations]: ../asynchronous_operations.md [`cuda::memcpy_async`]: ../asynchronous_operations/memcpy_async.md -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::pipeline_shared_state`]: ./pipeline_shared_state.md [(destructor)]: ./pipeline/destructor.md @@ -152,4 +152,3 @@ template void __global__ example_kernel(int*, int*, cuda::std::size_t); [`consumer_wait_until`]: ./pipeline/consumer_wait.md [`consumer_release`]: ./pipeline/consumer_release.md [`quit`]: ./pipeline/quit.md - diff --git a/docs/extended_api/synchronization_primitives/pipeline_shared_state.md b/docs/extended_api/synchronization_primitives/pipeline_shared_state.md index 10050219e1..9daba5fea6 100644 --- a/docs/extended_api/synchronization_primitives/pipeline_shared_state.md +++ b/docs/extended_api/synchronization_primitives/pipeline_shared_state.md @@ 
-77,6 +77,6 @@ __global__ void example_kernel(char* device_buffer, char* sysmem_buffer) { [See it on Godbolt](https://godbolt.org/z/M9ah7r1Yx){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [(constructor)]: ./pipeline_shared_state/constructor.md