From 6b015ef95287d6e6442ff9acdf49d5c52aed7770 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 10 May 2022 03:13:56 -0700 Subject: [PATCH 1/8] Memory model documentation --- docs/extended_api.md | 2 +- docs/extended_api/memory_model.md | 186 ++++++++++++++++++ .../synchronization_primitives/atomic.md | 4 +- docs/extended_api/thread_scopes.md | 154 --------------- 4 files changed, 190 insertions(+), 156 deletions(-) create mode 100644 docs/extended_api/memory_model.md delete mode 100644 docs/extended_api/thread_scopes.md diff --git a/docs/extended_api.md b/docs/extended_api.md index d81d9bb63d..952b7c81e5 100644 --- a/docs/extended_api.md +++ b/docs/extended_api.md @@ -21,6 +21,6 @@ nav_order: 3 {% include_relative extended_api/functional.md %} -[Thread Scopes]: ./extended_api/thread_scopes.md +[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes [Thread Groups]: ./extended_api/thread_groups.md diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md new file mode 100644 index 0000000000..7e8c875ac6 --- /dev/null +++ b/docs/extended_api/memory_model.md @@ -0,0 +1,186 @@ +--- +parent: Extended API +nav_order: 0 +--- + +# Memory model + +Standard C++ presents a view that the cost to synchronize threads is uniform and low. + +CUDA C++ is different: the cost to synchronize threads grows as threads are further appart. +It is low across threads within a block, but high across arbitrary threads in the system running on multiple GPUs and CPUs. + +To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. + +## Threads + +In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. 
+ +Examples of threads of execution in CUDA C++ programs: GPU or CPU threads, threads in DMA engines executing asynchronous operations like [`memcpy_async`] or [`cudaMemcpyAsync`], threads in NICs executing RDMA operations, etc. + +[`memcpy_async`]: asynchronous_operations/memcpy_async.md +[`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 + + +## Thread Scopes + +A _thread scope_ specifies the kind of threads that can synchronize with each other using synchronization primitive such as [`atomic`] or [`barrier`]. + +```cuda +namespace cuda { + +enum thread_scope { + thread_scope_system, + thread_scope_device, + thread_scope_block, + thread_scope_thread +}; + +} // namespace cuda +``` + +[`atomic`]: synchronization_primitives/atomic.md +[`barrier`]: synchronization_primitives/barrier.md + +### Scope Relationships + +Each program thread is related to each other program thread by one or more thread scope relations: +- Each thread in the system is related to each other thread in the system by the *system* thread scope: `thread_scope_system`. +- Each GPU thread is related to each other GPU thread in the same CUDA device by the *device* thread scope: `thread_scope_device`. +- Each GPU thread is related to each other GPU thread in the same CUDA thread block by the *block* thread scope: `thread_scope_block`. +- Each thread is related to itself by the `thread` thread scope: `thread_scope_thread`. + +## Synchronization primitives + +Types in namespaces `std::` and `cuda::std::` have the same behavior as corresponding types in namespace `cuda::` when instantiated with a scope of `cuda::thread_scope_system`. 
+ +## Atomicity + +An atomic operation is atomic at the scope it specifies if: +- it specifies a scope other than `thread_scope_system`, **or** + +the scope is `thread_scope_system` and: + +- it affects an object in [unified memory] and [`concurrentManagedAccess`] is `1`, **or** +- it affects an object in CPU memory and [`hostNativeAtomicSupported`] is `1`, **or** +- it is a load or store that affects a naturally-aligned object of sizes `1`, `2`, `4`, or `8` bytes on [mapped memory], **or** +- it affects an object in GPU memory and only GPU threads access it. + +[mapped memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#mapped-memory + +Refer to the [CUDA programming guide] for more information on [unified memory], [mapped memory], CPU memory, and GPU peer memory. + +[mapped memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#mapped-memory +[unified memory]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd +[CUDA programming guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html +[`concurrentManagedAccess`]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b +[`hostNativeAtomicSupported`]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f + +## Data Races + +Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results inundefined behavior. [...] 
+ +Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 4. Concurrent invocations of the member functions of `barrier`, other than its destructor, do not introduce data races ***as if they were atomic operations***. [...] + +Modify [thread.latch.class paragraph 2] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 2. Concurrent invocations of the member functions of `latch`, other than its destructor, do not introduce data races ***as if they were atomic operations***. + +Modify [thread.sema.cnt paragraph 3] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> 3. Concurrent invocations of the member functions of `counting_semaphore`, other than its destructor, do not introduce data races ***as if they were atomic operations***. + +Modify [thread.stoptoken.intro paragraph 5] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> Calls to the functions request_­stop, stop_­requested, and stop_­possible do not introduce data races ***as if they were atomic operations***. [...] + +[thread.stoptoken.intro paragraph 5]: https://eel.is/c++draft/thread#stoptoken.intro-5 + +Modify [atomics.fences paragraph 2 through 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: +> A release fence A synchronizes with an acquire fence B if there exist atomic +> operations X and Y, both operating on some atomic object M, such that A is +> sequenced before X, X modifies M, Y is sequenced before B, and Y reads the +> value written by X or a value written by any side effect in the hypothetical +> release sequence X would head if it were a release operation, +> ***and each operation (A, B, X, and Y) specifies a scope that includes the thread that performed each other operation***. 
+ +> A release fence A synchronizes with an atomic operation B that performs an +> acquire operation on an atomic object M if there exists an atomic operation X +> such that A is sequenced before X, X modifies M, and B reads the value +> written by X or a value written by any side effect in the hypothetical +> release sequence X would head if it were a release operation, +> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. + +> An atomic operation A that is a release operation on an atomic object M +> synchronizes with an acquire fence B if there exists some atomic operation X +> on M such that X is sequenced before B and reads the value written by A or a +> value written by any side effect in the release sequence headed by A, +> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. + +## Example: Message Passing + +The following example passes a message stored to the `x` variable by a thread in block `0` to a thread in block `1` via the flag `f`: + + + + + + + + + + +
+`int x = 0;`
+`int f = 0;` +
+**Thread 0 Block 0** + +**Thread 0 Block 1** +
+`x = 42;`
+`cuda::atomic_ref flag(f);`
+`flag.store(1, memory_order_release);` +
+`cuda::atomic_ref flag(f);`
+`while(flag.load(memory_order_acquire) != 1);`
+`assert(x == 42);` +
+ +In the following variation of the previous example, two threads concurrently access the `f` object without synchronization, which leads to a **data race**, and exhibits **undefined behavior**: + + + + + + + + + + +
+`int x = 0;`
+`int f = 0;` +
+**Thread 0 Block 0** + +**Thread 0 Block 1** +
+`x = 42;`
+`cuda::atomic_ref<int, cuda::thread_scope_block> flag(f);`
+`flag.store(1, memory_order_release); // UB: data race` +
+`cuda::atomic_ref flag(f);`
+`while(flag.load(memory_order_acquire) != 1); // UB: data race`
+`assert(x == 42);` +
+ +While the memory operations on `f` - the store and the loads - are atomic, the scope of the store operation is "block scope". Since the store is performed by Thread 0 of Block 0, it only includes all other threads of Block 0. However, the thread doing the loads is in Block 1, i.e., it is not in a scope included by the store operation performed in Block 0, causing the store and the load to not be "atomic", and introducing a data-race. + +For more examples see the [PTX memory consistency model litmus tests]. + +[PTX memory consistency model litmus tests]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#axioms +[intro.races paragraph 21]: https://eel.is/c++draft/intro.races#21 +[thread.barrier.class paragraph 4]: https://eel.is/c++draft/thread.barrier.class#4 +[thread.latch.class paragraph 2]: https://eel.is/c++draft/thread.latch.class#2 +[thread.sema.cnt paragraph 3]: https://eel.is/c++draft/thread.sema.cnt#3 +[atomics.fences paragraph 2 through 4]: https://eel.is/c++draft/atomics.fences#2 diff --git a/docs/extended_api/synchronization_primitives/atomic.md b/docs/extended_api/synchronization_primitives/atomic.md index 67b18972f8..7d4e57c58a 100644 --- a/docs/extended_api/synchronization_primitives/atomic.md +++ b/docs/extended_api/synchronization_primitives/atomic.md @@ -44,7 +44,7 @@ Under CUDA Compute Capability 6 (Pascal), an object of type `atomic` may not be - if `is_always_lock_free()` is `false`. Under CUDA Compute Capability prior to 6 (Pascal), objects of type - `cuda::atomic` or [`cuda::std::atomic`] may not be used. + [`cuda::atomic`] or [`cuda::std::atomic`] may not be used. 
## Implementation-Defined Behavior @@ -87,6 +87,8 @@ __global__ void example_kernel() { [`cuda::std::atomic`]: https://en.cppreference.com/w/cpp/atomic/atomic +[`cuda::atomic`]: ./atomic.md + [atomics.types.int]: https://eel.is/c++draft/atomics.types.int [atomics.types.pointer]: https://eel.is/c++draft/atomics.types.pointer diff --git a/docs/extended_api/thread_scopes.md b/docs/extended_api/thread_scopes.md deleted file mode 100644 index 67215090e5..0000000000 --- a/docs/extended_api/thread_scopes.md +++ /dev/null @@ -1,154 +0,0 @@ ---- -parent: Extended API -nav_order: 0 ---- - -# Thread Scopes - -```cuda -namespace cuda { - -// Header ``. - -enum thread_scope { - thread_scope_system, - thread_scope_device, - thread_scope_block, - thread_scope_thread -}; - -template -class atomic; - -void atomic_thread_fence(std::memory_order, thread_scope = thread_scope_system); - -// Header ``. - -template -class barrier; - -// Header ``. - -template -class latch; - -// Header ``. - -template -class binary_semaphore; - -template -class counting_semaphore; - -} -``` - -Standard C++ presents a view that the cost to synchronize threads is uniform - and low. -CUDA C++ is different: the overhead is low among threads within a block, and - high across arbitrary threads in the system. - -To bridge these two realities, libcu++ introduces **thread scopes** - to the Standard's concurrency facilities in the `cuda::` namespace, while - retaining the syntax and semantics of Standard C++ by default. -A thread scope specifies the kind of threads that can synchronize with each - other using a primitive such as an `atomic` or a `barrier`. - -## Scope Relationships - -Each program thread is related to each other program thread by one or more - thread scope relations: -- Each thread (CPU or GPU) is related to each other thread in the computer - system by the *system* thread scope, specified with `thread_scope_system`. 
-- Each GPU thread is related to each other GPU thread in the same CUDA device - by the *device* thread scope, specified with `thread_scope_device`. -- Each GPU thread is related to each other GPU thread in the same CUDA block - by the *block* thread scope, specified with `thread_scope_block`. -- Each thread (CPU or GPU) is related to itself by the `thread` thread scope, - specified with `thread_scope_thread`. - -Objects in namespace `cuda::std::` have the same behavior as corresponding - objects in namespace `cuda::` when instantiated with a scope of - `cuda::thread_scope_system`. - -Refer to the [CUDA programming guide] for more information on how CUDA launches - threads into devices and blocks. - -## Atomicity - -An atomic operation is atomic at the scope it specifies if: -- it specifies a scope other than `thread_scope_system`, or -- it affects an object in unified memory and [`concurrentManagedAccess`] is - `1`, or -- it affects an object in CPU memory and [`hostNativeAtomicSupported`] is `1`, - or -- it affects an object in GPU memory and only GPU threads access it. - -Refer to the [CUDA programming guide] for more information on - unified memory, CPU memory, and GPU peer memory. - -## Data Races - -Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as - follows: -> The execution of a program contains a data race if it contains two -> potentially concurrent conflicting actions, at least one of which is not -> atomic -> ***at a scope that includes the thread that performed the other operation***, -> and neither happens before the other, except for the special -> case for signal handlers described below. Any such data race results in -> undefined behavior. [...] - -Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ - Standard) as follows: -> 4. Concurrent invocations of the member functions of `barrier`, other than its -> destructor, do not introduce data races -> ***as if they were atomic operations***. -> [...] 
- -Modify [thread.latch.class paragraph 2] of ISO/IEC IS 14882 (the C++ Standard) - as follows: -> 2. Concurrent invocations of the member functions of `latch`, other than its -> destructor, do not introduce data races -> ***as if they were atomic operations***. - -Modify [thread.sema.cnt paragraph 3] of ISO/IEC IS 14882 (the C++ Standard) as - follows: -> 3. Concurrent invocations of the member functions of `counting_semaphore`, -> other than its destructor, do not introduce data races -> ***as if they were atomic operations***. - -Modify [atomics.fences paragraph 2 through 4] of ISO/IEC IS 14882 (the C++ - Standard) as follows: -> A release fence A synchronizes with an acquire fence B if there exist atomic -> operations X and Y, both operating on some atomic object M, such that A is -> sequenced before X, X modifies M, Y is sequenced before B, and Y reads the -> value written by X or a value written by any side effect in the hypothetical -> release sequence X would head if it were a release operation, -> ***and each operation (A, B, X, and Y) specifies a scope that includes the thread that performed each other operation***. - -> A release fence A synchronizes with an atomic operation B that performs an -> acquire operation on an atomic object M if there exists an atomic operation X -> such that A is sequenced before X, X modifies M, and B reads the value -> written by X or a value written by any side effect in the hypothetical -> release sequence X would head if it were a release operation, -> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. 
- -> An atomic operation A that is a release operation on an atomic object M -> synchronizes with an acquire fence B if there exists some atomic operation X -> on M such that X is sequenced before B and reads the value written by A or a -> value written by any side effect in the release sequence headed by A, -> ***and each operation (A, B, and X) specifies a scope that includes the thread that performed each other operation***. - - -[intro.races paragraph 21]: https://eel.is/c++draft/intro.races#21 -[thread.barrier.class paragraph 4]: https://eel.is/c++draft/thread.barrier.class#4 -[thread.latch.class paragraph 2]: https://eel.is/c++draft/thread.latch.class#2 -[thread.sema.cnt paragraph 3]: https://eel.is/c++draft/thread.sema.cnt#3 -[atomics.fences paragraph 2 through 4]: https://eel.is/c++draft/atomics.fences#2 - -[CUDA programming guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html -[`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b -[`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f From c31c15dba616345e33b524c2db821ae13526c7f1 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 10 Aug 2022 12:09:09 -0700 Subject: [PATCH 2/8] Make site wider to support tables --- docs/_sass/color_schemes/nvidia.scss | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/_sass/color_schemes/nvidia.scss b/docs/_sass/color_schemes/nvidia.scss index edd520bf6c..6bd1ddcbbf 100644 --- a/docs/_sass/color_schemes/nvidia.scss +++ b/docs/_sass/color_schemes/nvidia.scss @@ -15,6 +15,7 @@ pre.highlight code { font-size: 0.9em !important; } $nav-width: 300px; +$content-width: 1000px; $body-background-color: $grey-dk-300; $sidebar-color: $grey-dk-300; From 485a923265d7eeb1367293fe537947877fa5a33e Mon Sep 17 00:00:00 2001 From: 
Gonzalo Brito Gadeschi Date: Wed, 10 Aug 2022 12:09:21 -0700 Subject: [PATCH 3/8] Fix site build scripts --- docs/Dockerfile | 3 ++- docs/serve | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/Dockerfile b/docs/Dockerfile index 972e9c7fe5..e64302f147 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,3 +1,4 @@ FROM jekyll/jekyll:4.0 COPY Gemfile /Gemfile -RUN /bin/bash -l -c "bundle install" \ No newline at end of file +RUN /bin/bash -l -c "bundle install" +RUN /bin/bash -l -c "bundle add webrick" \ No newline at end of file diff --git a/docs/serve b/docs/serve index ff9c5cbb6a..0a5f5f5dd9 100755 --- a/docs/serve +++ b/docs/serve @@ -2,8 +2,13 @@ set -ex +mkdir -p build/docs +( + cd build/docs + cp ../../docs/Gemfile . + docker build -f ../../docs/Dockerfile -t libcudacxx:docs . +) ( cd docs - docker build -f Dockerfile -t libcudacxx:docs . docker run --rm -p 4000:4000 -v $(pwd):/srv/jekyll -u $(id -u):$(id -g) -it libcudacxx:docs bash -c "jekyll serve --watch --host 0.0.0.0" ) From 7ed814f0d7aa23a422ee8ee271ca455bc3ffd568 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:11:54 +0200 Subject: [PATCH 4/8] Fix typo --- docs/extended_api/memory_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 7e8c875ac6..eff7831fe9 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -7,7 +7,7 @@ nav_order: 0 Standard C++ presents a view that the cost to synchronize threads is uniform and low. -CUDA C++ is different: the cost to synchronize threads grows as threads are further appart. +CUDA C++ is different: the cost to synchronize threads grows as threads are further apart. It is low across threads within a block, but high across arbitrary threads in the system running on multiple GPUs and CPUs. 
To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. From 0dab4eee03aa2fb80fce0b4208be425838fcabad Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:12:18 +0200 Subject: [PATCH 5/8] Update docs/extended_api/memory_model.md --- docs/extended_api/memory_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index eff7831fe9..7a3ae5a44f 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -79,7 +79,7 @@ Refer to the [CUDA programming guide] for more information on [unified memory], ## Data Races Modify [intro.races paragraph 21] of ISO/IEC IS 14882 (the C++ Standard) as follows: -> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results inundefined behavior. [...] +> The execution of a program contains a data race if it contains two potentially concurrent conflicting actions, at least one of which is not atomic ***at a scope that includes the thread that performed the other operation***, and neither happens before the other, except for the special case for signal handlers described below. Any such data race results in undefined behavior. [...] Modify [thread.barrier.class paragraph 4] of ISO/IEC IS 14882 (the C++ Standard) as follows: > 4. 
Concurrent invocations of the member functions of `barrier`, other than its destructor, do not introduce data races ***as if they were atomic operations***. [...] From 1d10a7fb5311443dd5bee949a8d10fef85931b50 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:13:10 +0200 Subject: [PATCH 6/8] Leaving sentence out --- docs/extended_api/memory_model.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 7a3ae5a44f..54a26a1aa2 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -16,7 +16,6 @@ To account for non-uniform thread synchronization costs that are not always low, In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. -Examples of threads of execution in CUDA C++ programs: GPU or CPU threads, threads in DMA engines executing asynchronous operations like [`memcpy_async`] or [`cudaMemcpyAsync`], threads in NICs executing RDMA operations, etc. 
[`memcpy_async`]: asynchronous_operations/memcpy_async.md [`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 From 383837ec283cf4403ae2f29b6ea8e9a984def674 Mon Sep 17 00:00:00 2001 From: gonzalobg <65027571+gonzalobg@users.noreply.github.com> Date: Mon, 15 Aug 2022 16:22:46 +0200 Subject: [PATCH 7/8] Remove Threads section --- docs/extended_api/memory_model.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/extended_api/memory_model.md b/docs/extended_api/memory_model.md index 54a26a1aa2..e0e08a54c1 100644 --- a/docs/extended_api/memory_model.md +++ b/docs/extended_api/memory_model.md @@ -12,15 +12,6 @@ It is low across threads within a block, but high across arbitrary threads in th To account for non-uniform thread synchronization costs that are not always low, CUDA C++ extends the standard C++ memory model and concurrency facilities in the `cuda::` namespace with **thread scopes**, retaining the syntax and semantics of standard C++ by default. -## Threads - -In CUDA C++, memory can only be modified by threads, so any operation that modifies memory is being executed by a thread. - - -[`memcpy_async`]: asynchronous_operations/memcpy_async.md -[`cudaMemcpyAsync`]: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79 - - ## Thread Scopes A _thread scope_ specifies the kind of threads that can synchronize with each other using synchronization primitive such as [`atomic`] or [`barrier`]. 
From 2d465e44b520fc10049f4e405b5104ea6d5a4d1e Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 21 Nov 2022 15:33:44 -0800 Subject: [PATCH 8/8] Fix broken links in extended_api --- docs/extended_api/asynchronous_operations/memcpy_async.md | 8 +++----- docs/extended_api/synchronization_primitives/atomic.md | 3 +-- docs/extended_api/synchronization_primitives/barrier.md | 3 +-- .../barrier/barrier_native_handle.md | 3 +-- .../synchronization_primitives/barrier/init.md | 3 +-- .../synchronization_primitives/binary_semaphore.md | 2 +- .../synchronization_primitives/counting_semaphore.md | 2 +- docs/extended_api/synchronization_primitives/latch.md | 3 +-- docs/extended_api/synchronization_primitives/pipeline.md | 3 +-- .../synchronization_primitives/pipeline_shared_state.md | 2 +- 10 files changed, 12 insertions(+), 20 deletions(-) diff --git a/docs/extended_api/asynchronous_operations/memcpy_async.md b/docs/extended_api/asynchronous_operations/memcpy_async.md index ebe3773fc2..f5461e4ebc 100644 --- a/docs/extended_api/asynchronous_operations/memcpy_async.md +++ b/docs/extended_api/asynchronous_operations/memcpy_async.md @@ -77,7 +77,7 @@ Both objects are reinterpreted as arrays of `unsigned char`. in the current thread. 4. Binds the asynchronous copy completion to `cuda::pipeline` and cooperatively issues the copy across all threads in `group`. -5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is +5. 5-8: convenience wrappers using `cuda::annotated_ptr` where `Sync` is either `cuda::barrier` or `cuda::pipeline`. 
## Notes @@ -141,9 +141,7 @@ __global__ void example_kernel(char* dst, char* src) { [_TriviallyCopyable_]: https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable -[_ThreadGroup_]: ./thread_group.md - [`cuda::std::size_t`]: https://en.cppreference.com/w/c/types/size_t -[`cuda::aligned_size_t`]: ./shapes/aligned_size_t.md -[`cuda::pipeline::quit`]: ./pipelines/pipeline/quit.md +[`cuda::aligned_size_t`]: ../shapes/aligned_size_t.md +[`cuda::pipeline::quit`]: ../synchronization_primitives/pipeline/quit.md diff --git a/docs/extended_api/synchronization_primitives/atomic.md b/docs/extended_api/synchronization_primitives/atomic.md index 7d4e57c58a..71cbcfb851 100644 --- a/docs/extended_api/synchronization_primitives/atomic.md +++ b/docs/extended_api/synchronization_primitives/atomic.md @@ -78,7 +78,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/avo3Evbee){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::atomic_thread_fence`]: ./atomic/atomic_thread_fence.md @@ -94,4 +94,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier.md b/docs/extended_api/synchronization_primitives/barrier.md index 17ad379fc6..c1141cc6bd 100644 --- a/docs/extended_api/synchronization_primitives/barrier.md +++ b/docs/extended_api/synchronization_primitives/barrier.md @@ -94,7 +94,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/ehdrY8Kae){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::barrier::init`]: ./barrier/init.md 
[`cuda::device::barrier_native_handle`]: ./barrier/barrier_native_handle.md @@ -107,4 +107,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md b/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md index 7fdbe81b03..be1652eb26 100644 --- a/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md +++ b/docs/extended_api/synchronization_primitives/barrier/barrier_native_handle.md @@ -44,7 +44,7 @@ __global__ void example_kernel(cuda::barrier& bar) { [See it on Godbolt](https://godbolt.org/z/dr4798Y76){: .btn } -[`cuda::thread_scope`]: ./thread_scopes.md +[`cuda::thread_scope`]: ./memory_model.md [thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 @@ -52,4 +52,3 @@ __global__ void example_kernel(cuda::barrier& bar) { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/barrier/init.md b/docs/extended_api/synchronization_primitives/barrier/init.md index e2e7aa47bc..1b52023108 100644 --- a/docs/extended_api/synchronization_primitives/barrier/init.md +++ b/docs/extended_api/synchronization_primitives/barrier/init.md @@ -55,7 +55,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/jG8se6Kd8){: .btn } -[`cuda::thread_scope`]: 
./thread_scopes.md +[`cuda::thread_scope`]: ./memory_model.md [thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12 @@ -63,4 +63,3 @@ __global__ void example_kernel() { [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/binary_semaphore.md b/docs/extended_api/synchronization_primitives/binary_semaphore.md index d0af259177..0ffbb1b20a 100644 --- a/docs/extended_api/synchronization_primitives/binary_semaphore.md +++ b/docs/extended_api/synchronization_primitives/binary_semaphore.md @@ -69,7 +69,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/eKfjYYz58){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::binary_semaphore`]: https://en.cppreference.com/w/cpp/thread/binary_semaphore diff --git a/docs/extended_api/synchronization_primitives/counting_semaphore.md b/docs/extended_api/synchronization_primitives/counting_semaphore.md index 41dbeaa22b..1052c987f7 100644 --- a/docs/extended_api/synchronization_primitives/counting_semaphore.md +++ b/docs/extended_api/synchronization_primitives/counting_semaphore.md @@ -65,7 +65,7 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/3YrjjTvG6){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::counting_semaphore`]: https://en.cppreference.com/w/cpp/thread/counting_semaphore diff --git a/docs/extended_api/synchronization_primitives/latch.md b/docs/extended_api/synchronization_primitives/latch.md index ce7512072a..d72e4df00e 100644 --- 
a/docs/extended_api/synchronization_primitives/latch.md +++ b/docs/extended_api/synchronization_primitives/latch.md @@ -62,10 +62,9 @@ __global__ void example_kernel() { [See it on Godbolt](https://godbolt.org/z/8v4dcK7fa){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::std::latch`]: https://en.cppreference.com/w/cpp/thread/latch [`concurrentManagedAccess` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_116f9619ccc85e93bc456b8c69c80e78b [`hostNativeAtomicSupported` property]: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html#structcudaDeviceProp_1ef82fd7d1d0413c7d6f33287e5b6306f - diff --git a/docs/extended_api/synchronization_primitives/pipeline.md b/docs/extended_api/synchronization_primitives/pipeline.md index 73b01e9ee0..ccb9697050 100644 --- a/docs/extended_api/synchronization_primitives/pipeline.md +++ b/docs/extended_api/synchronization_primitives/pipeline.md @@ -141,7 +141,7 @@ template void __global__ example_kernel(int*, int*, cuda::std::size_t); [asynchronous operations]: ../asynchronous_operations.md [`cuda::memcpy_async`]: ../asynchronous_operations/memcpy_async.md -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [`cuda::pipeline_shared_state`]: ./pipeline_shared_state.md [(destructor)]: ./pipeline/destructor.md @@ -152,4 +152,3 @@ template void __global__ example_kernel(int*, int*, cuda::std::size_t); [`consumer_wait_until`]: ./pipeline/consumer_wait.md [`consumer_release`]: ./pipeline/consumer_release.md [`quit`]: ./pipeline/quit.md - diff --git a/docs/extended_api/synchronization_primitives/pipeline_shared_state.md b/docs/extended_api/synchronization_primitives/pipeline_shared_state.md index 10050219e1..9daba5fea6 100644 --- a/docs/extended_api/synchronization_primitives/pipeline_shared_state.md +++ b/docs/extended_api/synchronization_primitives/pipeline_shared_state.md @@ 
-77,6 +77,6 @@ __global__ void example_kernel(char* device_buffer, char* sysmem_buffer) { [See it on Godbolt](https://godbolt.org/z/M9ah7r1Yx){: .btn } -[`cuda::thread_scope`]: ../thread_scopes.md +[`cuda::thread_scope`]: ../memory_model.md [(constructor)]: ./pipeline_shared_state/constructor.md