From 9a3ef24e53bb51e749f1c1914bab5aa01dec9e4a Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Thu, 24 Oct 2019 15:36:02 -0500
Subject: [PATCH 01/15] add value setter to device_scalar

---
 include/rmm/device_scalar.hpp | 17 +++++++++++++++++
 tests/device_scalar_tests.cpp | 10 ++++++++++
 2 files changed, 27 insertions(+)
diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 50a8dc42f..c16c8d333 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -83,6 +83,23 @@ class device_scalar {
    *---------------------------------------------------------------------------**/
   T const *get() const noexcept { return static_cast<T const *>(buff.data()); }
 
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from host to device.
+   *
+   * @return T The value of the scalar after synchronizing its stream
+   *---------------------------------------------------------------------------**/
+  void value(T value) {
+    auto status = cudaMemcpyAsync(buff.data(), &value, sizeof(T),
+                             cudaMemcpyDefault, buff.stream());
+    if (cudaSuccess != status) {
+      throw std::runtime_error{"Device memcpy failed."};
+    }
+    status = cudaStreamSynchronize(buff.stream());
+    if (cudaSuccess != status) {
+      throw std::runtime_error{"Stream sync failed."};
+    }
+  }
+
   device_scalar() = default;
   ~device_scalar() = default;
   device_scalar(device_scalar const &) = default;
diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp
index 2f24c34a2..cdc04ed67 100644
--- a/tests/device_scalar_tests.cpp
+++ b/tests/device_scalar_tests.cpp
@@ -87,3 +87,13 @@ TYPED_TEST(DeviceScalarTest, MoveCtor) {
   EXPECT_EQ(moved_to.value(), original_value);
   EXPECT_EQ(nullptr, scalar.get());
 }
+
+TYPED_TEST(DeviceScalarTest, SetValue) {
+  rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
+  EXPECT_NE(nullptr, scalar.get());
+
+  auto expected = this->distribution(this->generator);
+
+  scalar.value(expected);
+  EXPECT_EQ(expected, scalar.value());
+}

From e57f4d709dc5eb2992525141b4c5c8ebc707215f Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Thu, 24 Oct 2019 15:43:27 -0500
Subject: [PATCH 02/15] changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60921f0e5..6be7f228c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## New Features
 
+ - PR #167 Added value setter to `device_scalar`
+
 ## Improvements
 
 - PR #161 Use `std::atexit` to finalize RMM after Python interpreter shutdown

From 91b2d7ab198323cd83ee497cd123f1a3931014d0 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Thu, 24 Oct 2019 15:44:10 -0500
Subject: [PATCH 03/15] fix doc

---
 include/rmm/device_scalar.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index c16c8d333..f47781ca8 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -85,8 +85,6 @@ class device_scalar {
 
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from host to device.
-   *
-   * @return T The value of the scalar after synchronizing its stream
    *---------------------------------------------------------------------------**/
   void value(T value) {
     auto status = cudaMemcpyAsync(buff.data(), &value, sizeof(T),

From da98387fe90a806d199eb0d5bd9f3e7a45caa792 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Thu, 24 Oct 2019 16:20:54 -0500
Subject: [PATCH 04/15] for device_scalar, rename `T get()` to `T data()`,
 rename `void value(T)` to `void set_value(T)`, adjust docs

---
 include/rmm/device_scalar.hpp | 32 +++++++++++++++++---------------
 tests/device_scalar_tests.cpp | 24 ++++++++++++------------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index f47781ca8..3c14667dd 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -62,7 +62,7 @@ class device_scalar {
   T value() const {
     T host_value{};
     auto status = cudaMemcpyAsync(&host_value, buff.data(), sizeof(T),
-                             cudaMemcpyDefault, buff.stream());
+                                  cudaMemcpyDefault, buff.stream());
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
@@ -74,21 +74,13 @@ class device_scalar {
   }
 
   /**---------------------------------------------------------------------------*
-   * @brief Returns pointer to object in device memory.
-   *---------------------------------------------------------------------------**/
-  T *get() noexcept { return static_cast<T *>(buff.data()); }
-
-  /**---------------------------------------------------------------------------*
-   * @brief Returns pointer to object in device memory.
-   *---------------------------------------------------------------------------**/
-  T const *get() const noexcept { return static_cast<T const *>(buff.data()); }
-
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from host to device.
+   * @brief Copies the value from hostto device.
+   *
+   * @param host_value The value of the scalar after synchronizing its stream
    *---------------------------------------------------------------------------**/
-  void value(T value) {
-    auto status = cudaMemcpyAsync(buff.data(), &value, sizeof(T),
-                             cudaMemcpyDefault, buff.stream());
+  void set_value(T host_value) {
+    auto status = cudaMemcpyAsync(buff.data(), &host_value, sizeof(T),
+                                  cudaMemcpyDefault, buff.stream());
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
@@ -98,6 +90,16 @@ class device_scalar {
     }
   }
 
+  /**---------------------------------------------------------------------------*
+   * @brief Returns pointer to object in device memory.
+   *---------------------------------------------------------------------------**/
+  T *data() noexcept { return static_cast<T *>(buff.data()); }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Returns pointer to object in device memory.
+   *---------------------------------------------------------------------------**/
+  T const *data() const noexcept { return static_cast<T const *>(buff.data()); }
+
   device_scalar() = default;
   ~device_scalar() = default;
   device_scalar(device_scalar const &) = default;
diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp
index cdc04ed67..4f2256159 100644
--- a/tests/device_scalar_tests.cpp
+++ b/tests/device_scalar_tests.cpp
@@ -53,47 +53,47 @@ TYPED_TEST_CASE(DeviceScalarTest, Types);
 
 TYPED_TEST(DeviceScalarTest, DefaultUninitialized) {
   rmm::device_scalar<TypeParam> scalar{};
-  EXPECT_NE(nullptr, scalar.get());
+  EXPECT_NE(nullptr, scalar.data());
 }
 
 TYPED_TEST(DeviceScalarTest, InitialValue) {
   rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
-  EXPECT_NE(nullptr, scalar.get());
+  EXPECT_NE(nullptr, scalar.data());
   EXPECT_EQ(this->value, scalar.value());
 }
 
 TYPED_TEST(DeviceScalarTest, CopyCtor) {
   rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
-  EXPECT_NE(nullptr, scalar.get());
+  EXPECT_NE(nullptr, scalar.data());
   EXPECT_EQ(this->value, scalar.value());
 
   rmm::device_scalar<TypeParam> copy{scalar};
-  EXPECT_NE(nullptr, copy.get());
-  EXPECT_NE(copy.get(), scalar.get());
+  EXPECT_NE(nullptr, copy.data());
+  EXPECT_NE(copy.data(), scalar.data());
   EXPECT_EQ(copy.value(), scalar.value());
 }
 
 TYPED_TEST(DeviceScalarTest, MoveCtor) {
   rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
-  EXPECT_NE(nullptr, scalar.get());
+  EXPECT_NE(nullptr, scalar.data());
   EXPECT_EQ(this->value, scalar.value());
 
-  auto original_pointer = scalar.get();
+  auto original_pointer = scalar.data();
   auto original_value = scalar.value();
 
   rmm::device_scalar<TypeParam> moved_to{std::move(scalar)};
-  EXPECT_NE(nullptr, moved_to.get());
-  EXPECT_EQ(moved_to.get(), original_pointer);
+  EXPECT_NE(nullptr, moved_to.data());
+  EXPECT_EQ(moved_to.data(), original_pointer);
   EXPECT_EQ(moved_to.value(), original_value);
-  EXPECT_EQ(nullptr, scalar.get());
+  EXPECT_EQ(nullptr, scalar.data());
 }
 
 TYPED_TEST(DeviceScalarTest, SetValue) {
   rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
-  EXPECT_NE(nullptr, scalar.get());
+  EXPECT_NE(nullptr, scalar.data());
 
   auto expected = this->distribution(this->generator);
 
-  scalar.value(expected);
+  scalar.set_value(expected);
   EXPECT_EQ(expected, scalar.value());
 }

From c27e29050c961d4a40210e49b92cb15e74fc7b83 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Thu, 24 Oct 2019 17:33:18 -0500
Subject: [PATCH 05/15] Update include/rmm/device_scalar.hpp

Co-Authored-By: Mark Harris <mharris@nvidia.com>
---
 include/rmm/device_scalar.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 3c14667dd..936450456 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -74,7 +74,7 @@ class device_scalar {
   }
 
   /**---------------------------------------------------------------------------*
-   * @brief Copies the value from hostto device.
+   * @brief Copies the value from host to device.
    *
    * @param host_value The value of the scalar after synchronizing its stream
    *---------------------------------------------------------------------------**/

From aff0a4a7a7c4879043d8d553fb43a25f08ecd01d Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 25 Oct 2019 13:16:56 -0500
Subject: [PATCH 06/15] add value_async and set_value_async to device_scalar

---
 include/rmm/device_scalar.hpp | 116 +++++++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 22 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 3c14667dd..953ba503c 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -60,34 +60,75 @@ class device_scalar {
    * @return T The value of the scalar after synchronizing its stream
    *---------------------------------------------------------------------------**/
   T value() const {
-    T host_value{};
-    auto status = cudaMemcpyAsync(&host_value, buff.data(), sizeof(T),
-                                  cudaMemcpyDefault, buff.stream());
-    if (cudaSuccess != status) {
-      throw std::runtime_error{"Device memcpy failed."};
-    }
-    status = cudaStreamSynchronize(buff.stream());
-    if (cudaSuccess != status) {
-      throw std::runtime_error{"Stream sync failed."};
-    }
-    return host_value;
+    return _value<true>(buff.stream());
   }
 
   /**---------------------------------------------------------------------------*
-   * @brief Copies the value from hostto device.
+   * @brief Copies the value from device to host and returns the value.
    *
-   * @param host_value The value of the scalar after synchronizing its stream
+   * @return T The value of the scalar after synchronizing its stream
+   * @param stream CUDA stream on which to perform the copy
+   *---------------------------------------------------------------------------**/
+  T value(cudaStream_t stream) const {
+    return _value<true>(stream);
+  }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from device to host and returns the value.
+   *
+   * @return T The value of the scalar
+   *---------------------------------------------------------------------------**/
+  T value_async() const {
+    return _value<false>(buff.stream());
+  }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from device to host and returns the value.
+   *
+   * @return T The value of the scalar
+   * @param stream CUDA stream on which to perform the copy
+   *---------------------------------------------------------------------------**/
+  T value_async(cudaStream_t stream) const {
+    return _value<false>(stream);
+  }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from host to device and synchronizes.
+   *
+   * @param host_value The host value which will be copied to device
    *---------------------------------------------------------------------------**/
   void set_value(T host_value) {
-    auto status = cudaMemcpyAsync(buff.data(), &host_value, sizeof(T),
-                                  cudaMemcpyDefault, buff.stream());
-    if (cudaSuccess != status) {
-      throw std::runtime_error{"Device memcpy failed."};
-    }
-    status = cudaStreamSynchronize(buff.stream());
-    if (cudaSuccess != status) {
-      throw std::runtime_error{"Stream sync failed."};
-    }
+    _set_value<true>(host_value, buff.stream());
+  }
+
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from host to device and synchronizes.
+   *
+   * @param host_value The host value which will be copied to device
+   * @param stream CUDA stream on which to perform the copy
+   *---------------------------------------------------------------------------**/
+  void set_value(T host_value, cudaStream_t stream) {
+    _set_value<true>(host_value, stream);
+  }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from host to device.
+   *
+   * @param host_value The host value which will be copied to device
+   *---------------------------------------------------------------------------**/
+  void set_value_async(T host_value) {
+    _set_value<false>(host_value, buff.stream());
+  }
+
+  /**---------------------------------------------------------------------------*
+   * @brief Copies the value from host to device.
+   *
+   * @param host_value The host value which will be copied to device
+   * @param stream CUDA stream on which to perform the copy
+   *---------------------------------------------------------------------------**/
+  void set_value_async(T host_value, cudaStream_t stream) {
+    _set_value<false>(host_value, stream);
   }
 
   /**---------------------------------------------------------------------------*
@@ -109,6 +150,37 @@ class device_scalar {
 
  private:
   rmm::device_buffer buff{sizeof(T)};
+
+  template<bool synchronize>
+  inline T _value(cudaStream_t stream) const {
+    T host_value{};
+    _memcpy<synchronize>(&host_value, buff.data(), sizeof(T), stream);
+    return host_value;
+  }
+
+  template<bool synchronize>
+  inline void _set_value(T host_value, cudaStream_t stream) {
+    _memcpy<synchronize>(buff.data(), &host_value, sizeof(T), stream);
+  }
+
+  template<bool synchronize>
+  inline void _memcpy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
+    auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
+
+    if (cudaSuccess != status) {
+      throw std::runtime_error{"Device memcpy failed."};
+    }
+
+    if (false == synchronize) {
+      return;
+    }
+
+    status = cudaStreamSynchronize(stream);
+
+    if (cudaSuccess != status) {
+      throw std::runtime_error{"Stream sync failed."};
+    }
+  }
 };
 
 }  // namespace rmm

From 772fb67628e5a9132623b1129a5b7e70095e2781 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 25 Oct 2019 13:25:01 -0500
Subject: [PATCH 07/15] make device_scalar constructor use _set_value.

---
 include/rmm/device_scalar.hpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 953ba503c..f2596ae30 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -46,12 +46,8 @@ class device_scalar {
       T const &initial_value, cudaStream_t stream_ = 0,
       rmm::mr::device_memory_resource *mr_ = rmm::mr::get_default_resource())
       : buff{sizeof(T), stream_, mr_} {
-    auto status = cudaMemcpyAsync(buff.data(), &initial_value, sizeof(T),
-                                  cudaMemcpyDefault, buff.stream());
-
-    if (cudaSuccess != status) {
-      throw std::runtime_error{"Device memcpy failed."};
-    }
+    
+    _set_value<false>(initial_value, buff.stream());
   }
 
   /**---------------------------------------------------------------------------*

From b03e7f17defb68b18b087b2c19d5105e7e831aee Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sun, 27 Oct 2019 13:41:40 -0500
Subject: [PATCH 08/15] use RMM_CHECK_CUDA in rmm::device_scalar

---
 include/rmm/device_scalar.hpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index f2596ae30..d90e8e7b0 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <rmm/rmm.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/default_memory_resource.hpp>
 #include <rmm/mr/device_memory_resource.hpp>
@@ -161,9 +162,9 @@ class device_scalar {
 
   template<bool synchronize>
   inline void _memcpy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
-    auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
+    auto status = _memcpy_copy(dst, src, count, stream);
 
-    if (cudaSuccess != status) {
+    if (RMM_SUCCESS != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
 
@@ -171,12 +172,22 @@ class device_scalar {
       return;
     }
 
-    status = cudaStreamSynchronize(stream);
+    status = _memcpy_sync(stream);
 
-    if (cudaSuccess != status) {
+    if (RMM_SUCCESS != status) {
       throw std::runtime_error{"Stream sync failed."};
     }
   }
+
+  inline rmmError_t _memcpy_copy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
+    RMM_CHECK_CUDA(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
+    return RMM_SUCCESS;
+  }
+
+  inline rmmError_t _memcpy_sync(cudaStream_t stream) const {
+    RMM_CHECK_CUDA(cudaStreamSynchronize(stream));
+    return RMM_SUCCESS;
+  }
 };
 
 }  // namespace rmm

From a16f5a636aef41490bcf065a7f58ee26e20f825b Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Mon, 28 Oct 2019 14:26:42 -0500
Subject: [PATCH 09/15] revert calls to RMM_CHECK_CUDA

---
 include/rmm/device_scalar.hpp | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index d90e8e7b0..f2596ae30 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <rmm/rmm.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/default_memory_resource.hpp>
 #include <rmm/mr/device_memory_resource.hpp>
@@ -162,9 +161,9 @@ class device_scalar {
 
   template<bool synchronize>
   inline void _memcpy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
-    auto status = _memcpy_copy(dst, src, count, stream);
+    auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
 
-    if (RMM_SUCCESS != status) {
+    if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
 
@@ -172,22 +171,12 @@ class device_scalar {
       return;
     }
 
-    status = _memcpy_sync(stream);
+    status = cudaStreamSynchronize(stream);
 
-    if (RMM_SUCCESS != status) {
+    if (cudaSuccess != status) {
       throw std::runtime_error{"Stream sync failed."};
     }
   }
-
-  inline rmmError_t _memcpy_copy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
-    RMM_CHECK_CUDA(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
-    return RMM_SUCCESS;
-  }
-
-  inline rmmError_t _memcpy_sync(cudaStream_t stream) const {
-    RMM_CHECK_CUDA(cudaStreamSynchronize(stream));
-    return RMM_SUCCESS;
-  }
 };
 
 }  // namespace rmm

From c80baa400b9387bee2c0b44fa6175231d5ca71c7 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Tue, 29 Oct 2019 10:42:30 -0500
Subject: [PATCH 10/15] removing stored-stream overloads of rmm:device_scalar 
 value/set_value

---
 include/rmm/device_scalar.hpp | 57 +++++------------------------------
 1 file changed, 7 insertions(+), 50 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index f2596ae30..821474d25 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -50,53 +50,26 @@ class device_scalar {
     _set_value<false>(initial_value, buff.stream());
   }
 
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from device to host and returns the value.
-   *
-   * @return T The value of the scalar after synchronizing its stream
-   *---------------------------------------------------------------------------**/
-  T value() const {
-    return _value<true>(buff.stream());
-  }
-
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from device to host and returns the value.
    *
    * @return T The value of the scalar after synchronizing its stream
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
-  T value(cudaStream_t stream) const {
+  T value(cudaStream_t stream = 0) const {
     return _value<true>(stream);
   }
 
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from device to host and returns the value.
-   *
-   * @return T The value of the scalar
-   *---------------------------------------------------------------------------**/
-  T value_async() const {
-    return _value<false>(buff.stream());
-  }
-
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from device to host and returns the value.
    *
    * @return T The value of the scalar
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
-  T value_async(cudaStream_t stream) const {
+  T value_async(cudaStream_t stream = 0) const {
     return _value<false>(stream);
   }
 
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from host to device and synchronizes.
-   *
-   * @param host_value The host value which will be copied to device
-   *---------------------------------------------------------------------------**/
-  void set_value(T host_value) {
-    _set_value<true>(host_value, buff.stream());
-  }
-
 
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from host to device and synchronizes.
@@ -104,26 +77,17 @@ class device_scalar {
    * @param host_value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
-  void set_value(T host_value, cudaStream_t stream) {
+  void set_value(T host_value, cudaStream_t stream = 0) {
     _set_value<true>(host_value, stream);
   }
 
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from host to device.
-   *
-   * @param host_value The host value which will be copied to device
-   *---------------------------------------------------------------------------**/
-  void set_value_async(T host_value) {
-    _set_value<false>(host_value, buff.stream());
-  }
-
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from host to device.
    *
    * @param host_value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
-  void set_value_async(T host_value, cudaStream_t stream) {
+  void set_value_async(T host_value, cudaStream_t stream = 0) {
     _set_value<false>(host_value, stream);
   }
 
@@ -160,20 +124,13 @@ class device_scalar {
   }
 
   template<bool synchronize>
-  inline void _memcpy(void *dst, const void *src, size_t count, cudaStream_t stream) const {
+  inline void _memcpy(void *dst, const void *src, size_t count,
+                      cudaStream_t stream) const{
     auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
-
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
-
-    if (false == synchronize) {
-      return;
-    }
-
-    status = cudaStreamSynchronize(stream);
-
-    if (cudaSuccess != status) {
+    if (cudaSuccess != cudaStreamSynchronize(stream)) {
       throw std::runtime_error{"Stream sync failed."};
     }
   }

From 5356a9c9f8a0dc29493cb83cfb723ca6bf375b9f Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Tue, 29 Oct 2019 18:54:58 -0500
Subject: [PATCH 11/15] remove trailing underscores on device_scalar
 constructor args

---
 include/rmm/device_scalar.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 821474d25..485f93e79 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -39,13 +39,13 @@ class device_scalar {
    * @brief Construct a new `device_scalar`
    *
    * @param initial_value The initial value of the object in device memory
-   * @param stream_ Optional, stream on which to perform allocation and copy
-   * @param mr_ Optional, resource with which to allocate
+   * @param stream Optional, stream on which to perform allocation and copy
+   * @param mr Optional, resource with which to allocate
    *---------------------------------------------------------------------------**/
   explicit device_scalar(
-      T const &initial_value, cudaStream_t stream_ = 0,
-      rmm::mr::device_memory_resource *mr_ = rmm::mr::get_default_resource())
-      : buff{sizeof(T), stream_, mr_} {
+      T const &initial_value, cudaStream_t stream = 0,
+      rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
+      : buff{sizeof(T), stream, mr} {
     
     _set_value<false>(initial_value, buff.stream());
   }

From 525b4cadc00baf096a46655eca42bb61fce3b375 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Wed, 30 Oct 2019 13:32:02 -0500
Subject: [PATCH 12/15] remove async versions of device_scalar value/set_value

---
 include/rmm/device_scalar.hpp | 45 ++++-------------------------------
 1 file changed, 5 insertions(+), 40 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 485f93e79..44182a02c 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -47,7 +47,7 @@ class device_scalar {
       rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
       : buff{sizeof(T), stream, mr} {
     
-    _set_value<false>(initial_value, buff.stream());
+    _memcpy(buff.data(), &initial_value, sizeof(T), stream);
   }
 
   /**---------------------------------------------------------------------------*
@@ -57,20 +57,11 @@ class device_scalar {
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
   T value(cudaStream_t stream = 0) const {
-    return _value<true>(stream);
-  }
-
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from device to host and returns the value.
-   *
-   * @return T The value of the scalar
-   * @param stream CUDA stream on which to perform the copy
-   *---------------------------------------------------------------------------**/
-  T value_async(cudaStream_t stream = 0) const {
-    return _value<false>(stream);
+    T host_value{};
+    _memcpy(&host_value, buff.data(), sizeof(T), stream);
+    return host_value;
   }
 
-
   /**---------------------------------------------------------------------------*
    * @brief Copies the value from host to device and synchronizes.
    *
@@ -78,17 +69,7 @@ class device_scalar {
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
   void set_value(T host_value, cudaStream_t stream = 0) {
-    _set_value<true>(host_value, stream);
-  }
-
-  /**---------------------------------------------------------------------------*
-   * @brief Copies the value from host to device.
-   *
-   * @param host_value The host value which will be copied to device
-   * @param stream CUDA stream on which to perform the copy
-   *---------------------------------------------------------------------------**/
-  void set_value_async(T host_value, cudaStream_t stream = 0) {
-    _set_value<false>(host_value, stream);
+    _memcpy(buff.data(), &host_value, sizeof(T), stream);
   }
 
   /**---------------------------------------------------------------------------*
@@ -111,28 +92,12 @@ class device_scalar {
  private:
   rmm::device_buffer buff{sizeof(T)};
 
-  template<bool synchronize>
-  inline T _value(cudaStream_t stream) const {
-    T host_value{};
-    _memcpy<synchronize>(&host_value, buff.data(), sizeof(T), stream);
-    return host_value;
-  }
-
-  template<bool synchronize>
-  inline void _set_value(T host_value, cudaStream_t stream) {
-    _memcpy<synchronize>(buff.data(), &host_value, sizeof(T), stream);
-  }
-
-  template<bool synchronize>
   inline void _memcpy(void *dst, const void *src, size_t count,
                       cudaStream_t stream) const{
     auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
-    if (cudaSuccess != cudaStreamSynchronize(stream)) {
-      throw std::runtime_error{"Stream sync failed."};
-    }
   }
 };
 

From 53ebb7edb03bb964736fd36b06b838ec45114624 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Wed, 30 Oct 2019 15:56:01 -0500
Subject: [PATCH 13/15] device_scalar doc adjustments

---
 include/rmm/device_scalar.hpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 44182a02c..2130f5c1a 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -47,29 +47,30 @@ class device_scalar {
       rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
       : buff{sizeof(T), stream, mr} {
     
-    _memcpy(buff.data(), &initial_value, sizeof(T), stream);
+    _memcpy(buff.data(), &initial_value, stream);
   }
 
   /**---------------------------------------------------------------------------*
-   * @brief Copies the value from device to host and returns the value.
+   * @brief Copies the value from device to host synchronously and returns the
+   * value.
    *
-   * @return T The value of the scalar after synchronizing its stream
+   * @return T The value of the scalar
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
   T value(cudaStream_t stream = 0) const {
     T host_value{};
-    _memcpy(&host_value, buff.data(), sizeof(T), stream);
+    _memcpy(&host_value, buff.data(), stream);
     return host_value;
   }
 
   /**---------------------------------------------------------------------------*
-   * @brief Copies the value from host to device and synchronizes.
+   * @brief Copies the value from host to device synchronously.
    *
    * @param host_value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    *---------------------------------------------------------------------------**/
   void set_value(T host_value, cudaStream_t stream = 0) {
-    _memcpy(buff.data(), &host_value, sizeof(T), stream);
+    _memcpy(buff.data(), &host_value, stream);
   }
 
   /**---------------------------------------------------------------------------*
@@ -92,9 +93,9 @@ class device_scalar {
  private:
   rmm::device_buffer buff{sizeof(T)};
 
-  inline void _memcpy(void *dst, const void *src, size_t count,
-                      cudaStream_t stream) const{
-    auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
+  inline void _memcpy(void *dst, const void *src, cudaStream_t stream) const {
+    auto status = cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault,
+                                  stream);
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }

From 3268c07b11187121e5dac758fcaa09a6ea4c3d9c Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Wed, 30 Oct 2019 19:45:45 -0500
Subject: [PATCH 14/15] device_scalar explicit synchronization in
 value/set_value

---
 include/rmm/device_scalar.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 44182a02c..6b1ba5665 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -95,9 +95,14 @@ class device_scalar {
   inline void _memcpy(void *dst, const void *src, size_t count,
                       cudaStream_t stream) const{
     auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
+
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};
     }
+
+    if (cudaSuccess != cudaStreamSynchronize(stream)) {
+      throw std::runtime_error{"Stream sync failed."};
+    }
   }
 };
 

From c5b8a99850cb2bde4e8ebea1d7bc66d5f597f437 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Wed, 30 Oct 2019 19:53:49 -0500
Subject: [PATCH 15/15] fix build error

---
 include/rmm/device_scalar.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 0795a09e4..ceae5e16e 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -93,9 +93,9 @@ class device_scalar {
  private:
   rmm::device_buffer buff{sizeof(T)};
 
-  inline void _memcpy(void *dst, const void *src, size_t count,
+  inline void _memcpy(void *dst, const void *src,
                       cudaStream_t stream) const{
-    auto status = cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
+    auto status = cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream);
 
     if (cudaSuccess != status) {
       throw std::runtime_error{"Device memcpy failed."};