atomic: Make all operations follow sequentially consistent ordering

stotko · May 25, 2020 · 25af2e0 · 25af2e0
1 parent 89ec1d2
commit 25af2e0
Show file tree

Hide file tree

Showing 9 changed files with 372 additions and 42 deletions.
diff --git a/src/stdgpu/atomic.cuh b/src/stdgpu/atomic.cuh
@@ -59,7 +59,7 @@ namespace stdgpu
  * Differences to std::atomic:
  *  - Atomics must be modeled as containers since threads have to operate on the exact same object (which also requires copy and move constructors)
  *  - Manual allocation and destruction of container required
- *  - load and store are not atomically safe
+ *  - All operations (including load() and store()) explicitly follow sequentially consistent ordering
  *  - Additional min and max functions for all supported integer and floating point types
  *  - Additional increment/decrement + modulo functions for unsigned int
  */
@@ -100,37 +100,33 @@ class atomic
 
 
         /**
-         * \brief Loads and returns the current value of the atomic object
+         * \brief Atomically loads and returns the current value of the atomic object
          * \return The current value of this object
-         * \note This operation is not atomically safe
          */
         STDGPU_HOST_DEVICE T
         load() const;
 
 
         /**
-         * \brief Loads and returns the current value of the atomic object
+         * \brief Atomically loads and returns the current value of the atomic object
          * \return The current value of this object
-         * \note Equivalent to load()
          */
         STDGPU_HOST_DEVICE
         operator T() const; // NOLINT(hicpp-explicit-conversions)
 
 
         /**
-         * \brief Replaces the current value with desired
+         * \brief Atomically replaces the current value with the desired one
          * \param[in] desired The value to store to the atomic object
-         * \note This operation is not atomically safe
          */
         STDGPU_HOST_DEVICE void
         store(const T desired);
 
 
         /**
-         * \brief Replaces the current value with desired
+         * \brief Atomically replaces the current value with the desired one
          * \param[in] desired The value to store to the atomic object
          * \return The desired value
-         * \note Equivalent to store()
          */
         STDGPU_HOST_DEVICE T //NOLINT(misc-unconventional-assign-operator)
         operator=(const T desired);

diff --git a/src/stdgpu/cuda/atomic.cuh b/src/stdgpu/cuda/atomic.cuh
@@ -27,6 +27,25 @@ namespace stdgpu
 namespace cuda
 {
 
+/**
+ * \brief Atomically loads and returns the current value of the atomic object
+ * \param[in] address A pointer to a value
+ * \return The current value stored at address
+ */
+template <typename T>
+STDGPU_DEVICE_ONLY T
+atomic_load(T* address);
+
+/**
+ * \brief Atomically replaces the current value with the desired one
+ * \param[in] address A pointer to a value
+ * \param[in] desired The value to store to the atomic object
+ */
+template <typename T>
+STDGPU_DEVICE_ONLY void
+atomic_store(T* address,
+             const T desired);
+
 /**
  * \brief Atomically exchanges the stored value with the given argument
  * \param[in] address A pointer to a value

diff --git a/src/stdgpu/cuda/impl/atomic_detail.cuh b/src/stdgpu/cuda/impl/atomic_detail.cuh
@@ -17,7 +17,6 @@
 #define STDGPU_CUDA_ATOMIC_DETAIL_H
 
 #include <stdgpu/algorithm.h>
-#include <stdgpu/contract.h>
 #include <stdgpu/limits.h>
 #include <stdgpu/platform.h>
 
@@ -49,7 +48,7 @@ atomicMin(float* address,
     do
     {
         assumed = old;
-        old = atomicCAS(address_as_int, assumed, __float_as_int( fminf(__int_as_float(assumed), value) ));
+        old = atomicCAS(address_as_int, assumed, __float_as_int( stdgpu::min<float>(__int_as_float(assumed), value) ));
 
     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
     }
@@ -69,7 +68,7 @@ atomicMax(float* address,
     do
     {
         assumed = old;
-        old = atomicCAS(address_as_int, assumed, __float_as_int( fmaxf(__int_as_float(assumed), value) ));
+        old = atomicCAS(address_as_int, assumed, __float_as_int( stdgpu::max<float>(__int_as_float(assumed), value) ));
 
     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
     }
@@ -179,12 +178,47 @@ namespace stdgpu
 namespace cuda
 {
 
+template <typename T>
+STDGPU_DEVICE_ONLY T
+atomic_load(T* address)
+{
+    __threadfence();
+
+    volatile T* volatile_address = address;
+    T current = *volatile_address;
+
+    __threadfence();
+
+    return current;
+}
+
+
+template <typename T>
+STDGPU_DEVICE_ONLY void
+atomic_store(T* address,
+             const T desired)
+{
+    __threadfence();
+
+    volatile T* volatile_address = address;
+    *volatile_address = desired;
+
+    __threadfence();
+}
+
+
 template <typename T, typename>
 STDGPU_DEVICE_ONLY T
 atomic_exchange(T* address,
                 const T desired)
 {
-    return atomicExch(address, desired);
+    __threadfence();
+
+    T old = atomicExch(address, desired);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -194,7 +228,13 @@ atomic_compare_exchange(T* address,
                         const T expected,
                         const T desired)
 {
-    return atomicCAS(address, expected, desired);
+    __threadfence();
+
+    T old = atomicCAS(address, expected, desired);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -203,7 +243,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_add(T* address,
                  const T arg)
 {
-    return atomicAdd(address, arg);
+    __threadfence();
+
+    T old = atomicAdd(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -212,7 +258,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_sub(T* address,
                  const T arg)
 {
-    return atomicSub(address, arg);
+    __threadfence();
+
+    T old = atomicSub(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -221,7 +273,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_and(T* address,
                  const T arg)
 {
-    return atomicAnd(address, arg);
+    __threadfence();
+
+    T old = atomicAnd(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -230,7 +288,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_or(T* address,
                  const T arg)
 {
-    return atomicOr(address, arg);
+    __threadfence();
+
+    T old = atomicOr(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -239,7 +303,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_xor(T* address,
                  const T arg)
 {
-    return atomicXor(address, arg);
+    __threadfence();
+
+    T old = atomicXor(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -248,7 +318,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_min(T* address,
                  const T arg)
 {
-    return atomicMin(address, arg);
+    __threadfence();
+
+    T old = atomicMin(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -257,7 +333,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_max(T* address,
                  const T arg)
 {
-    return atomicMax(address, arg);
+    __threadfence();
+
+    T old = atomicMax(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -266,7 +348,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_inc_mod(T* address,
                      const T arg)
 {
-    return atomicInc(address, arg);
+    __threadfence();
+
+    T old = atomicInc(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 
@@ -275,7 +363,13 @@ STDGPU_DEVICE_ONLY T
 atomic_fetch_dec_mod(T* address,
                      const T arg)
 {
-    return atomicDec(address, arg);
+    __threadfence();
+
+    T old = atomicDec(address, arg);
+
+    __threadfence();
+
+    return old;
 }
 
 } // namespace cuda

diff --git a/src/stdgpu/hip/atomic.h b/src/stdgpu/hip/atomic.h
@@ -27,6 +27,25 @@ namespace stdgpu
 namespace hip
 {
 
+/**
+ * \brief Atomically loads and returns the current value of the atomic object
+ * \param[in] address A pointer to a value
+ * \return The current value stored at address
+ */
+template <typename T>
+STDGPU_DEVICE_ONLY T
+atomic_load(T* address);
+
+/**
+ * \brief Atomically replaces the current value with the desired one
+ * \param[in] address A pointer to a value
+ * \param[in] desired The value to store to the atomic object
+ */
+template <typename T>
+STDGPU_DEVICE_ONLY void
+atomic_store(T* address,
+             const T desired);
+
 /**
  * \brief Atomically exchanges the stored value with the given argument
  * \param[in] address A pointer to a value