From 8824c318a140ca4593d9a1552f7b1b6250e0f362 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 5 Jan 2022 17:53:04 -0800
Subject: [PATCH] Fix CUDF_HOST_DEVICE.

---
 cpp/include/cudf/ast/detail/operators.hpp     |  39 ++---
 .../cudf/detail/aggregation/aggregation.hpp   |  16 ++-
 cpp/include/cudf/detail/indexalator.cuh       |  52 ++++---
 cpp/include/cudf/detail/iterator.cuh          |   6 +-
 .../cudf/detail/reduction_operators.cuh       |  41 +++---
 .../detail/utilities/device_operators.cuh     |  30 ++--
 .../utilities/transform_unary_functions.cuh   |  25 ++--
 cpp/include/cudf/fixed_point/fixed_point.hpp  | 135 ++++++++++--------
 cpp/include/cudf/fixed_point/temporary.hpp    |   4 +-
 cpp/include/cudf/lists/list_device_view.cuh   |   8 +-
 .../cudf/lists/lists_column_device_view.cuh   |   2 +-
 cpp/include/cudf/strings/json.hpp             |   4 +-
 cpp/include/cudf/strings/string_view.cuh      |   4 +-
 cpp/include/cudf/strings/string_view.hpp      |  18 +--
 cpp/include/cudf/types.hpp                    |  11 +-
 cpp/include/cudf/utilities/bit.hpp            |  20 +--
 .../cudf/utilities/type_dispatcher.hpp        |  16 ++-
 cpp/include/cudf/wrappers/dictionary.hpp      |  36 ++---
 cpp/src/io/orc/timezone.cuh                   |   8 +-
 cpp/src/quantiles/quantiles_util.hpp          |  32 +++--
 cpp/src/rolling/rolling_detail.hpp            |   8 +-
 cpp/src/strings/json/json_path.cu             |  39 ++---
 cpp/src/strings/regex/regex.cuh               |   2 +-
 cpp/src/strings/regex/regex.inl               |  10 +-
 .../optional_iterator_test_numeric.cu         |   5 +-
 .../iterator/pair_iterator_test_numeric.cu    |   7 +-
 26 files changed, 309 insertions(+), 269 deletions(-)
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index e550a09ec37..d7fd109f12a 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -49,7 +49,7 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable<Op, T>::value;
  * @param args Forwarded arguments to `operator()` of `f`.
  */
 template <typename F, typename... Ts>
-CUDF_HDI constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args)
+CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args)
 {
   switch (op) {
     case ast_operator::ADD:
@@ -920,7 +920,7 @@ struct single_dispatch_binary_operator_types {
             typename F,
             typename... Ts,
             std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
-  CUDF_HDI void operator()(F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
   {
     f.template operator()<OperatorFunctor, LHS, LHS>(std::forward<Ts>(args)...);
   }
@@ -929,7 +929,7 @@ struct single_dispatch_binary_operator_types {
             typename F,
             typename... Ts,
             std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
-  CUDF_HDI void operator()(F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid binary operation.");
@@ -958,7 +958,10 @@ struct type_dispatch_binary_op {
    * @param args Forwarded arguments to `operator()` of `f`.
    */
   template <ast_operator op, typename F, typename... Ts>
-  CUDF_HDI void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type,
+                                          cudf::data_type rhs_type,
+                                          F&& f,
+                                          Ts&&... args)
   {
     // Single dispatch (assume lhs_type == rhs_type)
     type_dispatcher(
@@ -981,7 +984,7 @@ struct type_dispatch_binary_op {
  * @param args Forwarded arguments to `operator()` of `f`.
  */
 template <typename F, typename... Ts>
-CUDF_HDI constexpr void binary_operator_dispatcher(
+CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher(
   ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args)
 {
   ast_operator_dispatcher(op,
@@ -1006,7 +1009,7 @@ struct dispatch_unary_operator_types {
             typename F,
             typename... Ts,
             std::enable_if_t<is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
-  CUDF_HDI void operator()(F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
   {
     f.template operator()<OperatorFunctor, InputT>(std::forward<Ts>(args)...);
   }
@@ -1015,7 +1018,7 @@ struct dispatch_unary_operator_types {
             typename F,
             typename... Ts,
             std::enable_if_t<!is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
-  CUDF_HDI void operator()(F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid unary operation.");
@@ -1030,7 +1033,7 @@ struct dispatch_unary_operator_types {
  */
 struct type_dispatch_unary_op {
   template <ast_operator op, typename F, typename... Ts>
-  CUDF_HDI void operator()(cudf::data_type input_type, F&& f, Ts&&... args)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args)
   {
     type_dispatcher(
       input_type,
@@ -1051,10 +1054,10 @@ struct type_dispatch_unary_op {
  * @param args Forwarded arguments to `operator()` of `f`.
  */
 template <typename F, typename... Ts>
-CUDF_HDI constexpr void unary_operator_dispatcher(ast_operator op,
-                                                  cudf::data_type input_type,
-                                                  F&& f,
-                                                  Ts&&... args)
+CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op,
+                                                                 cudf::data_type input_type,
+                                                                 F&& f,
+                                                                 Ts&&... args)
 {
   ast_operator_dispatcher(op,
                           detail::type_dispatch_unary_op{},
@@ -1079,7 +1082,7 @@ struct return_type_functor {
             typename LHS,
             typename RHS,
             std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
-  CUDF_HDI void operator()(cudf::data_type& result)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
   {
     using Out = cuda::std::invoke_result_t<OperatorFunctor, LHS, RHS>;
     result    = cudf::data_type(cudf::type_to_id<Out>());
@@ -1089,7 +1092,7 @@ struct return_type_functor {
             typename LHS,
             typename RHS,
             std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
-  CUDF_HDI void operator()(cudf::data_type& result)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid binary operation. Return type cannot be determined.");
@@ -1108,7 +1111,7 @@ struct return_type_functor {
   template <typename OperatorFunctor,
             typename T,
             std::enable_if_t<is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
-  CUDF_HDI void operator()(cudf::data_type& result)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
   {
     using Out = cuda::std::invoke_result_t<OperatorFunctor, T>;
     result    = cudf::data_type(cudf::type_to_id<Out>());
@@ -1117,7 +1120,7 @@ struct return_type_functor {
   template <typename OperatorFunctor,
             typename T,
             std::enable_if_t<!is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
-  CUDF_HDI void operator()(cudf::data_type& result)
+  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid unary operation. Return type cannot be determined.");
@@ -1156,7 +1159,7 @@ inline cudf::data_type ast_operator_return_type(ast_operator op,
  */
 struct arity_functor {
   template <ast_operator op>
-  CUDF_HDI void operator()(cudf::size_type& result)
+  CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result)
   {
     // Arity is not dependent on null handling, so just use the false implementation here.
     result = operator_functor<op, false>::arity;
@@ -1169,7 +1172,7 @@ struct arity_functor {
  * @param op Operator used to determine arity.
  * @return Arity of the operator.
  */
-CUDF_HDI cudf::size_type ast_operator_arity(ast_operator op)
+CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op)
 {
   auto result = cudf::size_type(0);
   ast_operator_dispatcher(op, detail::arity_functor{}, result);
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 99f7c239c75..3674efbcc7b 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1324,7 +1324,9 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation);
  */
 #pragma nv_exec_check_disable
 template <typename F, typename... Ts>
-CUDF_HDI decltype(auto) aggregation_dispatcher(aggregation::Kind k, F&& f, Ts&&... args)
+CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind k,
+                                                              F&& f,
+                                                              Ts&&... args)
 {
   switch (k) {
     case aggregation::SUM:
@@ -1416,7 +1418,7 @@ template <typename Element>
 struct dispatch_aggregation {
 #pragma nv_exec_check_disable
   template <aggregation::Kind k, typename F, typename... Ts>
-  CUDF_HDI decltype(auto) operator()(F&& f, Ts&&... args) const
+  CUDF_HOST_DEVICE inline decltype(auto) operator()(F&& f, Ts&&... args) const
   {
     return f.template operator()<Element, k>(std::forward<Ts>(args)...);
   }
@@ -1425,7 +1427,7 @@ struct dispatch_aggregation {
 struct dispatch_source {
 #pragma nv_exec_check_disable
   template <typename Element, typename F, typename... Ts>
-  CUDF_HDI decltype(auto) operator()(aggregation::Kind k, F&& f, Ts&&... args) const
+  CUDF_HOST_DEVICE inline decltype(auto) operator()(aggregation::Kind k, F&& f, Ts&&... args) const
   {
     return aggregation_dispatcher(
       k, dispatch_aggregation<Element>{}, std::forward<F>(f), std::forward<Ts>(args)...);
@@ -1449,10 +1451,10 @@ struct dispatch_source {
  */
 #pragma nv_exec_check_disable
 template <typename F, typename... Ts>
-CUDF_HDI constexpr decltype(auto) dispatch_type_and_aggregation(data_type type,
-                                                                aggregation::Kind k,
-                                                                F&& f,
-                                                                Ts&&... args)
+CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(data_type type,
+                                                                               aggregation::Kind k,
+                                                                               F&& f,
+                                                                               Ts&&... args)
 {
   return type_dispatcher(type, dispatch_source{}, k, std::forward<F>(f), std::forward<Ts>(args)...);
 }
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 2430d356c51..11c82da8097 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -55,7 +55,7 @@ struct base_indexalator {
   /**
    * @brief Prefix increment operator.
    */
-  CUDF_HDI T& operator++()
+  CUDF_HOST_DEVICE inline T& operator++()
   {
     T& derived = static_cast<T&>(*this);
     derived.p_ += width_;
@@ -65,7 +65,7 @@ struct base_indexalator {
   /**
    * @brief Postfix increment operator.
    */
-  CUDF_HDI T operator++(int)
+  CUDF_HOST_DEVICE inline T operator++(int)
   {
     T tmp{static_cast<T&>(*this)};
     operator++();
@@ -75,7 +75,7 @@ struct base_indexalator {
   /**
    * @brief Prefix decrement operator.
    */
-  CUDF_HDI T& operator--()
+  CUDF_HOST_DEVICE inline T& operator--()
   {
     T& derived = static_cast<T&>(*this);
     derived.p_ -= width_;
@@ -85,7 +85,7 @@ struct base_indexalator {
   /**
    * @brief Postfix decrement operator.
    */
-  CUDF_HDI T operator--(int)
+  CUDF_HOST_DEVICE inline T operator--(int)
   {
     T tmp{static_cast<T&>(*this)};
     operator--();
@@ -95,7 +95,7 @@ struct base_indexalator {
   /**
    * @brief Compound assignment by sum operator.
    */
-  CUDF_HDI T& operator+=(difference_type offset)
+  CUDF_HOST_DEVICE inline T& operator+=(difference_type offset)
   {
     T& derived = static_cast<T&>(*this);
     derived.p_ += offset * width_;
@@ -105,7 +105,7 @@ struct base_indexalator {
   /**
    * @brief Increment by offset operator.
    */
-  CUDF_HDI T operator+(difference_type offset) const
+  CUDF_HOST_DEVICE inline T operator+(difference_type offset) const
   {
     auto tmp = T{static_cast<T const&>(*this)};
     tmp.p_ += (offset * width_);
@@ -115,7 +115,7 @@ struct base_indexalator {
   /**
    * @brief Addition assignment operator.
    */
-  CUDF_HDI friend T operator+(difference_type offset, T const& rhs)
+  CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs)
   {
     T tmp{rhs};
     tmp.p_ += (offset * rhs.width_);
@@ -125,7 +125,7 @@ struct base_indexalator {
   /**
    * @brief Compound assignment by difference operator.
    */
-  CUDF_HDI T& operator-=(difference_type offset)
+  CUDF_HOST_DEVICE inline T& operator-=(difference_type offset)
   {
     T& derived = static_cast<T&>(*this);
     derived.p_ -= offset * width_;
@@ -135,7 +135,7 @@ struct base_indexalator {
   /**
    * @brief Decrement by offset operator.
    */
-  CUDF_HDI T operator-(difference_type offset) const
+  CUDF_HOST_DEVICE inline T operator-(difference_type offset) const
   {
     auto tmp = T{static_cast<T const&>(*this)};
     tmp.p_ -= (offset * width_);
@@ -145,7 +145,7 @@ struct base_indexalator {
   /**
    * @brief Subtraction assignment operator.
    */
-  CUDF_HDI friend T operator-(difference_type offset, T const& rhs)
+  CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs)
   {
     T tmp{rhs};
     tmp.p_ -= (offset * rhs.width_);
@@ -155,7 +155,7 @@ struct base_indexalator {
   /**
    * @brief Compute offset from iterator difference operator.
    */
-  CUDF_HDI difference_type operator-(T const& rhs) const
+  CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const
   {
     return (static_cast<T const&>(*this).p_ - rhs.p_) / width_;
   }
@@ -163,27 +163,45 @@ struct base_indexalator {
   /**
    * @brief Equals to operator.
    */
-  CUDF_HDI bool operator==(T const& rhs) const { return rhs.p_ == static_cast<T const&>(*this).p_; }
+  CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const
+  {
+    return rhs.p_ == static_cast<T const&>(*this).p_;
+  }
   /**
    * @brief Not equals to operator.
    */
-  CUDF_HDI bool operator!=(T const& rhs) const { return rhs.p_ != static_cast<T const&>(*this).p_; }
+  CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const
+  {
+    return rhs.p_ != static_cast<T const&>(*this).p_;
+  }
   /**
    * @brief Less than operator.
    */
-  CUDF_HDI bool operator<(T const& rhs) const { return static_cast<T const&>(*this).p_ < rhs.p_; }
+  CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const
+  {
+    return static_cast<T const&>(*this).p_ < rhs.p_;
+  }
   /**
    * @brief Greater than operator.
    */
-  CUDF_HDI bool operator>(T const& rhs) const { return static_cast<T const&>(*this).p_ > rhs.p_; }
+  CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const
+  {
+    return static_cast<T const&>(*this).p_ > rhs.p_;
+  }
   /**
    * @brief Less than or equals to operator.
    */
-  CUDF_HDI bool operator<=(T const& rhs) const { return static_cast<T const&>(*this).p_ <= rhs.p_; }
+  CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const
+  {
+    return static_cast<T const&>(*this).p_ <= rhs.p_;
+  }
   /**
    * @brief Greater than or equals to operator.
    */
-  CUDF_HDI bool operator>=(T const& rhs) const { return static_cast<T const&>(*this).p_ >= rhs.p_; }
+  CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const
+  {
+    return static_cast<T const&>(*this).p_ >= rhs.p_;
+  }
 
  protected:
   /**
diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 93649b44fd9..10d9cda55dd 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -420,8 +420,7 @@ struct scalar_optional_accessor : public scalar_value_accessor<Element> {
    *
    * @return a thrust::optional<Element> for the scalar value.
    */
-  CUDF_HDI
-  const value_type operator()(size_type) const
+  CUDF_HOST_DEVICE inline const value_type operator()(size_type) const
   {
     if (has_nulls) {
       return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()}
@@ -454,8 +453,7 @@ struct scalar_pair_accessor : public scalar_value_accessor<Element> {
    *
    * @return a pair with value and validity of the scalar.
    */
-  CUDF_HDI
-  const value_type operator()(size_type) const
+  CUDF_HOST_DEVICE inline const value_type operator()(size_type) const
   {
 #if defined(__CUDA_ARCH__)
     return {Element(super_t::dscalar.value()), super_t::dscalar.is_valid()};
diff --git a/cpp/include/cudf/detail/reduction_operators.cuh b/cpp/include/cudf/detail/reduction_operators.cuh
index 5e26cd60c25..5a0cb4c1714 100644
--- a/cpp/include/cudf/detail/reduction_operators.cuh
+++ b/cpp/include/cudf/detail/reduction_operators.cuh
@@ -19,7 +19,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/detail/utilities/transform_unary_functions.cuh>
-#include <cudf/types.hpp>  //for CUDF_HDI
+#include <cudf/types.hpp>  //for CUDF_HOST_DEVICE
 
 #include <cmath>
 #include <thrust/functional.h>
@@ -32,14 +32,12 @@ struct var_std {
   ResultType value;          /// the value
   ResultType value_squared;  /// the value of squared
 
-  CUDF_HDI
-  var_std(ResultType _value = 0, ResultType _value_squared = 0)
+  CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0)
     : value(_value), value_squared(_value_squared){};
 
   using this_t = var_std<ResultType>;
 
-  CUDF_HDI
-  this_t operator+(this_t const& rhs) const
+  CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const
   {
     return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared));
   };
@@ -50,8 +48,10 @@ template <typename ResultType>
 struct transformer_var_std {
   using OutputType = var_std<ResultType>;
 
-  CUDF_HDI
-  OutputType operator()(ResultType const& value) { return OutputType(value, value * value); };
+  CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value)
+  {
+    return OutputType(value, value * value);
+  };
 };
 
 // ------------------------------------------------------------------------
@@ -201,9 +201,9 @@ struct compound_op : public simple_op<Derived> {
    * @return transformed output result of compound operator
    */
   template <typename ResultType, typename IntermediateType>
-  CUDF_HDI static ResultType compute_result(const IntermediateType& input,
-                                            const cudf::size_type& count,
-                                            const cudf::size_type& ddof)
+  CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input,
+                                                           const cudf::size_type& count,
+                                                           const cudf::size_type& ddof)
   {
     // Enforced interface
     return Derived::template intermediate<ResultType>::compute_result(input, count, ddof);
@@ -230,10 +230,9 @@ struct mean : public compound_op<mean> {
     using IntermediateType = ResultType;  // sum value
 
     // compute `mean` from intermediate type `IntermediateType`
-    CUDF_HDI
-    static ResultType compute_result(const IntermediateType& input,
-                                     const cudf::size_type& count,
-                                     const cudf::size_type& ddof)
+    CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input,
+                                                             const cudf::size_type& count,
+                                                             const cudf::size_type& ddof)
     {
       return (input / count);
     };
@@ -252,10 +251,9 @@ struct variance : public compound_op<variance> {
     using IntermediateType = var_std<ResultType>;  // with sum of value, and sum of squared value
 
     // compute `variance` from intermediate type `IntermediateType`
-    CUDF_HDI
-    static ResultType compute_result(const IntermediateType& input,
-                                     const cudf::size_type& count,
-                                     const cudf::size_type& ddof)
+    CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input,
+                                                             const cudf::size_type& count,
+                                                             const cudf::size_type& ddof)
     {
       ResultType mean     = input.value / count;
       ResultType asum     = input.value_squared;
@@ -279,10 +277,9 @@ struct standard_deviation : public compound_op<standard_deviation> {
     using IntermediateType = var_std<ResultType>;  // with sum of value, and sum of squared value
 
     // compute `standard deviation` from intermediate type `IntermediateType`
-    CUDF_HDI
-    static ResultType compute_result(const IntermediateType& input,
-                                     const cudf::size_type& count,
-                                     const cudf::size_type& ddof)
+    CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input,
+                                                             const cudf::size_type& count,
+                                                             const cudf::size_type& ddof)
     {
       using intermediateOp = variance::template intermediate<ResultType>;
       ResultType var       = intermediateOp::compute_result(input, count, ddof);
diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 2ae9f3bf675..a59ad4c42ee 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -40,7 +40,7 @@ namespace detail {
 template <typename LHS,
           typename RHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
-CUDF_HDI auto min(LHS const& lhs, RHS const& rhs)
+CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs)
 {
   return std::min(lhs, rhs);
 }
@@ -51,7 +51,7 @@ CUDF_HDI auto min(LHS const& lhs, RHS const& rhs)
 template <typename LHS,
           typename RHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
-CUDF_HDI auto max(LHS const& lhs, RHS const& rhs)
+CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs)
 {
   return std::max(lhs, rhs);
 }
@@ -62,7 +62,7 @@ CUDF_HDI auto max(LHS const& lhs, RHS const& rhs)
  */
 struct DeviceSum {
   template <typename T, typename std::enable_if_t<!cudf::is_timestamp<T>()>* = nullptr>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs)
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs)
   {
     return lhs + rhs;
   }
@@ -94,13 +94,13 @@ struct DeviceSum {
  */
 struct DeviceCount {
   template <typename T, typename std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
-  CUDF_HDI T operator()(const T& lhs, const T& rhs)
+  CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs)
   {
     return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())};
   }
 
   template <typename T, typename std::enable_if_t<!cudf::is_timestamp<T>()>* = nullptr>
-  CUDF_HDI T operator()(const T&, const T& rhs)
+  CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs)
   {
     return rhs + T{1};
   }
@@ -117,7 +117,8 @@ struct DeviceCount {
  */
 struct DeviceMin {
   template <typename T>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::min(lhs, rhs))
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs)
+    -> decltype(cudf::detail::min(lhs, rhs))
   {
     return numeric::detail::min(lhs, rhs);
   }
@@ -141,7 +142,7 @@ struct DeviceMin {
 
   // @brief identity specialized for string_view
   template <typename T, typename std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
-  CUDF_HDI static constexpr T identity()
+  CUDF_HOST_DEVICE inline static constexpr T identity()
   {
     return string_view::max();
   }
@@ -158,7 +159,8 @@ struct DeviceMin {
  */
 struct DeviceMax {
   template <typename T>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::max(lhs, rhs))
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs)
+    -> decltype(cudf::detail::max(lhs, rhs))
   {
     return numeric::detail::max(lhs, rhs);
   }
@@ -181,7 +183,7 @@ struct DeviceMax {
   }
 
   template <typename T, typename std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
-  CUDF_HDI static constexpr T identity()
+  CUDF_HOST_DEVICE inline static constexpr T identity()
   {
     return string_view::min();
   }
@@ -198,7 +200,7 @@ struct DeviceMax {
  */
 struct DeviceProduct {
   template <typename T, typename std::enable_if_t<!cudf::is_timestamp<T>()>* = nullptr>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs)
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs)
   {
     return lhs * rhs;
   }
@@ -222,7 +224,7 @@ struct DeviceProduct {
  */
 struct DeviceAnd {
   template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs)
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs)
   {
     return (lhs & rhs);
   }
@@ -233,7 +235,7 @@ struct DeviceAnd {
  */
 struct DeviceOr {
   template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs)
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs)
   {
     return (lhs | rhs);
   }
@@ -244,7 +246,7 @@ struct DeviceOr {
  */
 struct DeviceXor {
   template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  CUDF_HDI auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs)
+  CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs)
   {
     return (lhs ^ rhs);
   }
@@ -256,7 +258,7 @@ struct DeviceXor {
 struct DeviceLeadLag {
   const size_type row_offset;
 
-  explicit CUDF_HDI DeviceLeadLag(size_type offset_) : row_offset(offset_) {}
+  explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {}
 };
 
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
index 701f501be98..12774f57c6a 100644
--- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
@@ -43,14 +43,13 @@ struct null_replacing_transformer {
   using type = ResultType;
   Functor f;
   type replacement;
-  CUDF_HDI
-  null_replacing_transformer(type null_replacement, Functor transformer)
+  CUDF_HOST_DEVICE inline null_replacing_transformer(type null_replacement, Functor transformer)
     : f(transformer), replacement(null_replacement)
   {
   }
 
   template <typename ElementType>
-  CUDF_HDI type operator()(thrust::pair<ElementType, bool> const& pair_value)
+  CUDF_HOST_DEVICE inline type operator()(thrust::pair<ElementType, bool> const& pair_value)
   {
     if (pair_value.second)
       return f(pair_value.first);
@@ -76,22 +75,21 @@ struct meanvar {
   ElementType value_squared;  /// the value of squared
   cudf::size_type count;      /// the count
 
-  CUDF_HDI
-  meanvar(ElementType _value = 0, ElementType _value_squared = 0, cudf::size_type _count = 0)
+  CUDF_HOST_DEVICE inline meanvar(ElementType _value         = 0,
+                                  ElementType _value_squared = 0,
+                                  cudf::size_type _count     = 0)
     : value(_value), value_squared(_value_squared), count(_count){};
 
   using this_t = cudf::meanvar<ElementType>;
 
-  CUDF_HDI
-  this_t operator+(this_t const& rhs) const
+  CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const
   {
     return this_t((this->value + rhs.value),
                   (this->value_squared + rhs.value_squared),
                   (this->count + rhs.count));
   };
 
-  CUDF_HDI
-  bool operator==(this_t const& rhs) const
+  CUDF_HOST_DEVICE inline bool operator==(this_t const& rhs) const
   {
     return ((this->value == rhs.value) && (this->value_squared == rhs.value_squared) &&
             (this->count == rhs.count));
@@ -113,8 +111,10 @@ struct meanvar {
  */
 template <typename ElementType>
 struct transformer_squared {
-  CUDF_HDI
-  ElementType operator()(ElementType const& value) { return (value * value); };
+  CUDF_HOST_DEVICE inline ElementType operator()(ElementType const& value)
+  {
+    return (value * value);
+  };
 };
 
 /**
@@ -130,8 +130,7 @@ template <typename ElementType>
 struct transformer_meanvar {
   using ResultType = meanvar<ElementType>;
 
-  CUDF_HDI
-  ResultType operator()(thrust::pair<ElementType, bool> const& pair)
+  CUDF_HOST_DEVICE inline ResultType operator()(thrust::pair<ElementType, bool> const& pair)
   {
     ElementType v = pair.first;
     return meanvar<ElementType>(v, v * v, (pair.second) ? 1 : 0);
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 6de9e03ca96..727dce0db9d 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -79,7 +79,7 @@ template <typename Rep,
           typename T,
           typename cuda::std::enable_if_t<(cuda::std::is_same_v<int32_t, T> &&
                                            is_supported_representation_type<Rep>())>* = nullptr>
-CUDF_HDI Rep ipow(T exponent)
+CUDF_HOST_DEVICE inline Rep ipow(T exponent)
 {
   cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible.");
   if (exponent == 0) return static_cast<Rep>(1);
@@ -108,7 +108,7 @@ CUDF_HDI Rep ipow(T exponent)
  * @return Shifted value of type T
  */
 template <typename Rep, Radix Rad, typename T>
-CUDF_HDI constexpr T right_shift(T const& val, scale_type const& scale)
+CUDF_HOST_DEVICE inline constexpr T right_shift(T const& val, scale_type const& scale)
 {
   return val / ipow<Rep, Rad>(static_cast<int32_t>(scale));
 }
@@ -125,7 +125,7 @@ CUDF_HDI constexpr T right_shift(T const& val, scale_type const& scale)
  * @return Shifted value of type T
  */
 template <typename Rep, Radix Rad, typename T>
-CUDF_HDI constexpr T left_shift(T const& val, scale_type const& scale)
+CUDF_HOST_DEVICE inline constexpr T left_shift(T const& val, scale_type const& scale)
 {
   return val * ipow<Rep, Rad>(static_cast<int32_t>(-scale));
 }
@@ -144,7 +144,7 @@ CUDF_HDI constexpr T left_shift(T const& val, scale_type const& scale)
  * @return Shifted value of type T
  */
 template <typename Rep, Radix Rad, typename T>
-CUDF_HDI constexpr T shift(T const& val, scale_type const& scale)
+CUDF_HOST_DEVICE inline constexpr T shift(T const& val, scale_type const& scale)
 {
   if (scale == 0)
     return val;
@@ -179,7 +179,7 @@ template <typename Rep,
 struct scaled_integer {
   Rep value;
   scale_type scale;
-  CUDF_HDI explicit scaled_integer(Rep v, scale_type s) : value{v}, scale{s} {}
+  CUDF_HOST_DEVICE inline explicit scaled_integer(Rep v, scale_type s) : value{v}, scale{s} {}
 };
 
 /**
@@ -210,7 +210,7 @@ class fixed_point {
   template <typename T,
             typename cuda::std::enable_if_t<cuda::std::is_floating_point<T>() &&
                                             is_supported_representation_type<Rep>()>* = nullptr>
-  CUDF_HDI explicit fixed_point(T const& value, scale_type const& scale)
+  CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale)
     : _value{static_cast<Rep>(detail::shift<Rep, Rad>(value, scale))}, _scale{scale}
   {
   }
@@ -226,7 +226,7 @@ class fixed_point {
   template <typename T,
             typename cuda::std::enable_if_t<cuda::std::is_integral<T>() &&
                                             is_supported_representation_type<Rep>()>* = nullptr>
-  CUDF_HDI explicit fixed_point(T const& value, scale_type const& scale)
+  CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale)
     // `value` is cast to `Rep` to avoid overflow in cases where
     // constructing to `Rep` that is wider than `T`
     : _value{detail::shift<Rep, Rad>(static_cast<Rep>(value), scale)}, _scale{scale}
@@ -238,8 +238,10 @@ class fixed_point {
    *
    * @param s scaled_integer that contains scale and already shifted value
    */
-  CUDF_HDI
-  explicit fixed_point(scaled_integer<Rep> s) : _value{s.value}, _scale{s.scale} {}
+  CUDF_HOST_DEVICE inline explicit fixed_point(scaled_integer<Rep> s)
+    : _value{s.value}, _scale{s.scale}
+  {
+  }
 
   /**
    * @brief "Scale-less" constructor that constructs `fixed_point` number with a specified
@@ -247,7 +249,8 @@ class fixed_point {
    */
   template <typename T,
             typename cuda::std::enable_if_t<is_supported_construction_value_type<T>()>* = nullptr>
-  CUDF_HDI fixed_point(T const& value) : _value{static_cast<Rep>(value)}, _scale{scale_type{0}}
+  CUDF_HOST_DEVICE inline fixed_point(T const& value)
+    : _value{static_cast<Rep>(value)}, _scale{scale_type{0}}
   {
   }
 
@@ -255,8 +258,7 @@ class fixed_point {
    * @brief Default constructor that constructs `fixed_point` number with a
    * value and scale of zero
    */
-  CUDF_HDI
-  fixed_point() : _value{0}, _scale{scale_type{0}} {}
+  CUDF_HOST_DEVICE inline fixed_point() : _value{0}, _scale{scale_type{0}} {}
 
   /**
    * @brief Explicit conversion operator for casting to floating point types
@@ -288,28 +290,34 @@ class fixed_point {
     return static_cast<U>(detail::shift<Rep, Rad>(value, scale_type{-_scale}));
   }
 
-  CUDF_HDI operator scaled_integer<Rep>() const { return scaled_integer<Rep>{_value, _scale}; }
+  CUDF_HOST_DEVICE inline operator scaled_integer<Rep>() const
+  {
+    return scaled_integer<Rep>{_value, _scale};
+  }
 
   /**
    * @brief Method that returns the underlying value of the `fixed_point` number
    *
    * @return The underlying value of the `fixed_point` number
    */
-  CUDF_HDI rep value() const { return _value; }
+  CUDF_HOST_DEVICE inline rep value() const { return _value; }
 
   /**
    * @brief Method that returns the scale of the `fixed_point` number
    *
    * @return The scale of the `fixed_point` number
    */
-  CUDF_HDI scale_type scale() const { return _scale; }
+  CUDF_HOST_DEVICE inline scale_type scale() const { return _scale; }
 
   /**
    * @brief Explicit conversion operator to `bool`
    *
    * @return The `fixed_point` value as a boolean (zero is `false`, nonzero is `true`)
    */
-  CUDF_HDI explicit constexpr operator bool() const { return static_cast<bool>(_value); }
+  CUDF_HOST_DEVICE inline explicit constexpr operator bool() const
+  {
+    return static_cast<bool>(_value);
+  }
 
   /**
    * @brief operator +=
@@ -319,7 +327,7 @@ class fixed_point {
    * @return The sum
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI fixed_point<Rep1, Rad1>& operator+=(fixed_point<Rep1, Rad1> const& rhs)
+  CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1>& operator+=(fixed_point<Rep1, Rad1> const& rhs)
   {
     *this = *this + rhs;
     return *this;
@@ -333,7 +341,7 @@ class fixed_point {
    * @return The product
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI fixed_point<Rep1, Rad1>& operator*=(fixed_point<Rep1, Rad1> const& rhs)
+  CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1>& operator*=(fixed_point<Rep1, Rad1> const& rhs)
   {
     *this = *this * rhs;
     return *this;
@@ -347,7 +355,7 @@ class fixed_point {
    * @return The difference
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI fixed_point<Rep1, Rad1>& operator-=(fixed_point<Rep1, Rad1> const& rhs)
+  CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1>& operator-=(fixed_point<Rep1, Rad1> const& rhs)
   {
     *this = *this - rhs;
     return *this;
@@ -361,7 +369,7 @@ class fixed_point {
    * @return The quotient
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI fixed_point<Rep1, Rad1>& operator/=(fixed_point<Rep1, Rad1> const& rhs)
+  CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1>& operator/=(fixed_point<Rep1, Rad1> const& rhs)
   {
     *this = *this / rhs;
     return *this;
@@ -372,8 +380,7 @@ class fixed_point {
    *
    * @return The incremented result
    */
-  CUDF_HDI
-  fixed_point<Rep, Rad>& operator++()
+  CUDF_HOST_DEVICE inline fixed_point<Rep, Rad>& operator++()
   {
     *this = *this + fixed_point<Rep, Rad>{1, scale_type{_scale}};
     return *this;
@@ -391,8 +398,8 @@ class fixed_point {
    * @return The resulting `fixed_point` sum
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend fixed_point<Rep1, Rad1> operator+(fixed_point<Rep1, Rad1> const& lhs,
-                                                    fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend fixed_point<Rep1, Rad1> operator+(
+    fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator - (for subtracting two `fixed_point` numbers)
@@ -406,8 +413,8 @@ class fixed_point {
    * @return The resulting `fixed_point` difference
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend fixed_point<Rep1, Rad1> operator-(fixed_point<Rep1, Rad1> const& lhs,
-                                                    fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend fixed_point<Rep1, Rad1> operator-(
+    fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator * (for multiplying two `fixed_point` numbers)
@@ -419,8 +426,8 @@ class fixed_point {
    * @return The resulting `fixed_point` product
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend fixed_point<Rep1, Rad1> operator*(fixed_point<Rep1, Rad1> const& lhs,
-                                                    fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend fixed_point<Rep1, Rad1> operator*(
+    fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator / (for dividing two `fixed_point` numbers)
@@ -432,8 +439,8 @@ class fixed_point {
    * @return The resulting `fixed_point` quotient
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend fixed_point<Rep1, Rad1> operator/(fixed_point<Rep1, Rad1> const& lhs,
-                                                    fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend fixed_point<Rep1, Rad1> operator/(
+    fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator == (for comparing two `fixed_point` numbers)
@@ -447,8 +454,8 @@ class fixed_point {
    * @return true if `lhs` and `rhs` are equal, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator==(fixed_point<Rep1, Rad1> const& lhs,
-                                  fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator==(fixed_point<Rep1, Rad1> const& lhs,
+                                                 fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator != (for comparing two `fixed_point` numbers)
@@ -462,8 +469,8 @@ class fixed_point {
    * @return true if `lhs` and `rhs` are not equal, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator!=(fixed_point<Rep1, Rad1> const& lhs,
-                                  fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator!=(fixed_point<Rep1, Rad1> const& lhs,
+                                                 fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator <= (for comparing two `fixed_point` numbers)
@@ -477,8 +484,8 @@ class fixed_point {
    * @return true if `lhs` less than or equal to `rhs`, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator<=(fixed_point<Rep1, Rad1> const& lhs,
-                                  fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator<=(fixed_point<Rep1, Rad1> const& lhs,
+                                                 fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator >= (for comparing two `fixed_point` numbers)
@@ -492,8 +499,8 @@ class fixed_point {
    * @return true if `lhs` greater than or equal to `rhs`, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator>=(fixed_point<Rep1, Rad1> const& lhs,
-                                  fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator>=(fixed_point<Rep1, Rad1> const& lhs,
+                                                 fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator < (for comparing two `fixed_point` numbers)
@@ -507,8 +514,8 @@ class fixed_point {
    * @return true if `lhs` less than `rhs`, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator<(fixed_point<Rep1, Rad1> const& lhs,
-                                 fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator<(fixed_point<Rep1, Rad1> const& lhs,
+                                                fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief operator > (for comparing two `fixed_point` numbers)
@@ -522,8 +529,8 @@ class fixed_point {
    * @return true if `lhs` greater than `rhs`, false if not
    */
   template <typename Rep1, Radix Rad1>
-  CUDF_HDI friend bool operator>(fixed_point<Rep1, Rad1> const& lhs,
-                                 fixed_point<Rep1, Rad1> const& rhs);
+  CUDF_HOST_DEVICE inline friend bool operator>(fixed_point<Rep1, Rad1> const& lhs,
+                                                fixed_point<Rep1, Rad1> const& rhs);
 
   /**
    * @brief Method for creating a `fixed_point` number with a new `scale`
@@ -534,7 +541,7 @@ class fixed_point {
    * @param scale The `scale` of the returned `fixed_point` number
    * @return `fixed_point` number with a new `scale`
    */
-  CUDF_HDI fixed_point<Rep, Rad> rescaled(scale_type scale) const
+  CUDF_HOST_DEVICE inline fixed_point<Rep, Rad> rescaled(scale_type scale) const
   {
     if (scale == _scale) return *this;
     Rep const value = detail::shift<Rep, Rad>(_value, scale_type{scale - _scale});
@@ -573,7 +580,7 @@ class fixed_point {
  * @return true if addition causes overflow, false otherwise
  */
 template <typename Rep, typename T>
-CUDF_HDI auto addition_overflow(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto addition_overflow(T lhs, T rhs)
 {
   return rhs > 0 ? lhs > cuda::std::numeric_limits<Rep>::max() - rhs
                  : lhs < cuda::std::numeric_limits<Rep>::min() - rhs;
@@ -588,7 +595,7 @@ CUDF_HDI auto addition_overflow(T lhs, T rhs)
  * @return true if subtraction causes overflow, false otherwise
  */
 template <typename Rep, typename T>
-CUDF_HDI auto subtraction_overflow(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto subtraction_overflow(T lhs, T rhs)
 {
   return rhs > 0 ? lhs < cuda::std::numeric_limits<Rep>::min() + rhs
                  : lhs > cuda::std::numeric_limits<Rep>::max() + rhs;
@@ -603,7 +610,7 @@ CUDF_HDI auto subtraction_overflow(T lhs, T rhs)
  * @return true if division causes overflow, false otherwise
  */
 template <typename Rep, typename T>
-CUDF_HDI auto division_overflow(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto division_overflow(T lhs, T rhs)
 {
   return lhs == cuda::std::numeric_limits<Rep>::min() && rhs == -1;
 }
@@ -617,7 +624,7 @@ CUDF_HDI auto division_overflow(T lhs, T rhs)
  * @return true if multiplication causes overflow, false otherwise
  */
 template <typename Rep, typename T>
-CUDF_HDI auto multiplication_overflow(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto multiplication_overflow(T lhs, T rhs)
 {
   auto const min = cuda::std::numeric_limits<Rep>::min();
   auto const max = cuda::std::numeric_limits<Rep>::max();
@@ -631,8 +638,8 @@ CUDF_HDI auto multiplication_overflow(T lhs, T rhs)
 
 // PLUS Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI fixed_point<Rep1, Rad1> operator+(fixed_point<Rep1, Rad1> const& lhs,
-                                           fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator+(fixed_point<Rep1, Rad1> const& lhs,
+                                                          fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   auto const sum   = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value;
@@ -649,8 +656,8 @@ CUDF_HDI fixed_point<Rep1, Rad1> operator+(fixed_point<Rep1, Rad1> const& lhs,
 
 // MINUS Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI fixed_point<Rep1, Rad1> operator-(fixed_point<Rep1, Rad1> const& lhs,
-                                           fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator-(fixed_point<Rep1, Rad1> const& lhs,
+                                                          fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   auto const diff  = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value;
@@ -667,8 +674,8 @@ CUDF_HDI fixed_point<Rep1, Rad1> operator-(fixed_point<Rep1, Rad1> const& lhs,
 
 // MULTIPLIES Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI fixed_point<Rep1, Rad1> operator*(fixed_point<Rep1, Rad1> const& lhs,
-                                           fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator*(fixed_point<Rep1, Rad1> const& lhs,
+                                                          fixed_point<Rep1, Rad1> const& rhs)
 {
 #if defined(__CUDACC_DEBUG__)
 
@@ -682,8 +689,8 @@ CUDF_HDI fixed_point<Rep1, Rad1> operator*(fixed_point<Rep1, Rad1> const& lhs,
 
 // DIVISION Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI fixed_point<Rep1, Rad1> operator/(fixed_point<Rep1, Rad1> const& lhs,
-                                           fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator/(fixed_point<Rep1, Rad1> const& lhs,
+                                                          fixed_point<Rep1, Rad1> const& rhs)
 {
 #if defined(__CUDACC_DEBUG__)
 
@@ -697,7 +704,8 @@ CUDF_HDI fixed_point<Rep1, Rad1> operator/(fixed_point<Rep1, Rad1> const& lhs,
 
 // EQUALITY COMPARISON Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator==(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator==(fixed_point<Rep1, Rad1> const& lhs,
+                                        fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value;
@@ -705,7 +713,8 @@ CUDF_HDI bool operator==(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, R
 
 // EQUALITY NOT COMPARISON Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator!=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator!=(fixed_point<Rep1, Rad1> const& lhs,
+                                        fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value;
@@ -713,7 +722,8 @@ CUDF_HDI bool operator!=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, R
 
 // LESS THAN OR EQUAL TO Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator<=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator<=(fixed_point<Rep1, Rad1> const& lhs,
+                                        fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value;
@@ -721,7 +731,8 @@ CUDF_HDI bool operator<=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, R
 
 // GREATER THAN OR EQUAL TO Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator>=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator>=(fixed_point<Rep1, Rad1> const& lhs,
+                                        fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value;
@@ -729,7 +740,8 @@ CUDF_HDI bool operator>=(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, R
 
 // LESS THAN Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator<(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator<(fixed_point<Rep1, Rad1> const& lhs,
+                                       fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value;
@@ -737,7 +749,8 @@ CUDF_HDI bool operator<(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Ra
 
 // GREATER THAN Operation
 template <typename Rep1, Radix Rad1>
-CUDF_HDI bool operator>(fixed_point<Rep1, Rad1> const& lhs, fixed_point<Rep1, Rad1> const& rhs)
+CUDF_HOST_DEVICE inline bool operator>(fixed_point<Rep1, Rad1> const& lhs,
+                                       fixed_point<Rep1, Rad1> const& rhs)
 {
   auto const scale = std::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value;
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 4268ad266bb..be900f252f6 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -62,13 +62,13 @@ constexpr auto abs(T value)
 }
 
 template <typename T>
-CUDF_HDI auto min(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto min(T lhs, T rhs)
 {
   return lhs < rhs ? lhs : rhs;
 }
 
 template <typename T>
-CUDF_HDI auto max(T lhs, T rhs)
+CUDF_HOST_DEVICE inline auto max(T lhs, T rhs)
 {
   return lhs > rhs ? lhs : rhs;
 }
diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh
index dbce8484160..5071f046e0c 100644
--- a/cpp/include/cudf/lists/list_device_view.cuh
+++ b/cpp/include/cudf/lists/list_device_view.cuh
@@ -215,7 +215,7 @@ class list_device_view {
      *
      * @param _list The `list_device_view` whose rows are being accessed.
      */
-    explicit CUDF_HDI pair_accessor(list_device_view const& _list) : list{_list} {}
+    explicit CUDF_HOST_DEVICE inline pair_accessor(list_device_view const& _list) : list{_list} {}
 
     /**
      * @brief Accessor for the {data, validity} pair at the specified index
@@ -252,7 +252,9 @@ class list_device_view {
      *
      * @param _list The `list_device_view` whose rows are being accessed.
      */
-    explicit CUDF_HDI pair_rep_accessor(list_device_view const& _list) : list{_list} {}
+    explicit CUDF_HOST_DEVICE inline pair_rep_accessor(list_device_view const& _list) : list{_list}
+    {
+    }
 
     /**
      * @brief Accessor for the {rep_data, validity} pair at the specified index
@@ -286,7 +288,7 @@ class list_device_view {
  */
 struct list_size_functor {
   column_device_view const d_column;
-  CUDF_HDI list_size_functor(column_device_view const& d_col) : d_column(d_col)
+  CUDF_HOST_DEVICE inline list_size_functor(column_device_view const& d_col) : d_column(d_col)
   {
 #if defined(__CUDA_ARCH__)
     cudf_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported");
diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh
index 468877410f2..aff088a7f44 100644
--- a/cpp/include/cudf/lists/lists_column_device_view.cuh
+++ b/cpp/include/cudf/lists/lists_column_device_view.cuh
@@ -46,7 +46,7 @@ class lists_column_device_view {
   /**
    * @brief Fetches number of rows in the lists column
    */
-  CUDF_HDI cudf::size_type size() const { return underlying.size(); }
+  CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); }
 
   /**
    * @brief Fetches the offsets column of the underlying list column.
diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp
index b425b0030ef..8435c47eaf5 100644
--- a/cpp/include/cudf/strings/json.hpp
+++ b/cpp/include/cudf/strings/json.hpp
@@ -48,7 +48,7 @@ class get_json_object_options {
    * @brief Returns true/false depending on whether single-quotes for representing strings
    * are allowed.
    */
-  CUDF_HDI bool get_allow_single_quotes() const { return allow_single_quotes; }
+  CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const { return allow_single_quotes; }
 
   /**
    * @brief Returns true/false depending on whether individually returned string values have
@@ -72,7 +72,7 @@ class get_json_object_options {
    *
    * @endcode
    */
-  CUDF_HDI bool get_strip_quotes_from_single_strings() const
+  CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const
   {
     return strip_quotes_from_single_strings;
   }
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 5b324d7d43d..43a90997c86 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -80,7 +80,7 @@ static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};
  *
  * @return An empty string
  */
-CUDF_HDI string_view string_view::min() { return string_view(); }
+CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); }
 
 /**
  * @brief Return maximum value associated with the string type
@@ -91,7 +91,7 @@ CUDF_HDI string_view string_view::min() { return string_view(); }
  * @return A string value which represents the highest possible valid UTF-8 encoded
  * character.
  */
-CUDF_HDI string_view string_view::max()
+CUDF_HOST_DEVICE inline string_view string_view::max()
 {
   const char* psentinel{nullptr};
 #if defined(__CUDA_ARCH__)
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index 3694473e2e8..22409ab3dc7 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -51,7 +51,7 @@ class string_view {
   /**
    * @brief Return the number of bytes in this string
    */
-  CUDF_HDI size_type size_bytes() const { return _bytes; }
+  CUDF_HOST_DEVICE inline size_type size_bytes() const { return _bytes; }
   /**
    * @brief Return the number of characters in this string
    */
@@ -59,12 +59,12 @@ class string_view {
   /**
    * @brief Return a pointer to the internal device array
    */
-  CUDF_HDI const char* data() const { return _data; }
+  CUDF_HOST_DEVICE inline const char* data() const { return _data; }
 
   /**
    * @brief Return true if string has no characters
    */
-  CUDF_HDI bool empty() const { return size_bytes() == 0; }
+  CUDF_HOST_DEVICE inline bool empty() const { return size_bytes() == 0; }
 
   /**
    * @brief Handy iterator for navigating through encoded characters.
@@ -284,7 +284,7 @@ class string_view {
    *
    * @return An empty string
    */
-  CUDF_HDI static string_view min();
+  CUDF_HOST_DEVICE inline static string_view min();
 
   /**
    * @brief Return maximum value associated with the string type
@@ -295,12 +295,12 @@ class string_view {
    * @return A string value which represents the highest possible valid UTF-8 encoded
    * character.
    */
-  CUDF_HDI static string_view max();
+  CUDF_HOST_DEVICE inline static string_view max();
 
   /**
    * @brief Default constructor represents an empty string.
    */
-  CUDF_HDI string_view() : _data(""), _bytes(0), _length(0) {}
+  CUDF_HOST_DEVICE inline string_view() : _data(""), _bytes(0), _length(0) {}
 
   /**
    * @brief Create instance from existing device char array.
@@ -308,7 +308,7 @@ class string_view {
    * @param data Device char array encoded in UTF8.
    * @param bytes Number of bytes in data array.
    */
-  CUDF_HDI string_view(const char* data, size_type bytes)
+  CUDF_HOST_DEVICE inline string_view(const char* data, size_type bytes)
     : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH)
   {
   }
@@ -386,7 +386,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte)
  * @param[out] character Single char_utf8 value.
  * @return The number of bytes in the character
  */
-CUDF_HDI size_type to_char_utf8(const char* str, char_utf8& character)
+CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character)
 {
   size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
 
@@ -413,7 +413,7 @@ CUDF_HDI size_type to_char_utf8(const char* str, char_utf8& character)
  * @param[out] str Allocated char array with enough space to hold the encoded character.
  * @return The number of bytes in the character
  */
-CUDF_HDI size_type from_char_utf8(char_utf8 character, char* str)
+CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str)
 {
   size_type const chr_width = bytes_in_char_utf8(character);
   for (size_type idx = 0; idx < chr_width; ++idx) {
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 217198bb498..459a4182aa0 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -17,16 +17,9 @@
 #pragma once
 
 #ifdef __CUDACC__
-#define CUDF_HD   __host__ __device__
-#define CUDF_HDI  __host__ __device__ inline
-#define CUDF_HDFI __host__ __device__ __forceinline__
+#define CUDF_HOST_DEVICE __host__ __device__
 #else
-#define CUDF_HD
-#define CUDF_HDI  inline
-// TODO: Should we make these use __attribute__((always_inline))? It's
-// supported by gcc and clang, but it is unlikely to be worthwhile to force
-// inlining of host code in libcudf.
-#define CUDF_HDFI inline
+#define CUDF_HOST_DEVICE
 #endif
 
 #include <cassert>
diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp
index f4c2cfa0831..f4a70463de3 100644
--- a/cpp/include/cudf/utilities/bit.hpp
+++ b/cpp/include/cudf/utilities/bit.hpp
@@ -42,7 +42,7 @@ namespace detail {
 #endif
 
 template <typename T>
-constexpr CUDF_HDI std::size_t size_in_bits()
+constexpr CUDF_HOST_DEVICE inline std::size_t size_in_bits()
 {
   static_assert(CHAR_BIT == 8, "Size of a byte must be 8 bits.");
   return sizeof(T) * CHAR_BIT;
@@ -58,7 +58,7 @@ constexpr CUDF_HDI std::size_t size_in_bits()
 /**
  * @brief Returns the index of the word containing the specified bit.
  */
-constexpr CUDF_HDI size_type word_index(size_type bit_index)
+constexpr CUDF_HOST_DEVICE inline size_type word_index(size_type bit_index)
 {
   return bit_index / detail::size_in_bits<bitmask_type>();
 }
@@ -66,7 +66,7 @@ constexpr CUDF_HDI size_type word_index(size_type bit_index)
 /**
  * @brief Returns the position within a word of the specified bit.
  */
-constexpr CUDF_HDI size_type intra_word_index(size_type bit_index)
+constexpr CUDF_HOST_DEVICE inline size_type intra_word_index(size_type bit_index)
 {
   return bit_index % detail::size_in_bits<bitmask_type>();
 }
@@ -80,7 +80,7 @@ constexpr CUDF_HDI size_type intra_word_index(size_type bit_index)
  * @param bitmask The bitmask containing the bit to set
  * @param bit_index Index of the bit to set
  */
-CUDF_HDI void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
+CUDF_HOST_DEVICE inline void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
 {
   assert(nullptr != bitmask);
   bitmask[word_index(bit_index)] |= (bitmask_type{1} << intra_word_index(bit_index));
@@ -95,7 +95,7 @@ CUDF_HDI void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
  * @param bitmask The bitmask containing the bit to clear
  * @param bit_index The index of the bit to clear
  */
-CUDF_HDI void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
+CUDF_HOST_DEVICE inline void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
 {
   assert(nullptr != bitmask);
   bitmask[word_index(bit_index)] &= ~(bitmask_type{1} << intra_word_index(bit_index));
@@ -109,7 +109,7 @@ CUDF_HDI void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index)
  * @return true The specified bit is `1`
  * @return false  The specified bit is `0`
  */
-CUDF_HDI bool bit_is_set(bitmask_type const* bitmask, size_type bit_index)
+CUDF_HOST_DEVICE inline bool bit_is_set(bitmask_type const* bitmask, size_type bit_index)
 {
   assert(nullptr != bitmask);
   return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index));
@@ -125,7 +125,9 @@ CUDF_HDI bool bit_is_set(bitmask_type const* bitmask, size_type bit_index)
  * @return false  The specified bit is `0`
  * @return `default_value` if `bitmask` is nullptr
  */
-CUDF_HDI bool bit_value_or(bitmask_type const* bitmask, size_type bit_index, bool default_value)
+CUDF_HOST_DEVICE inline bool bit_value_or(bitmask_type const* bitmask,
+                                          size_type bit_index,
+                                          bool default_value)
 {
   return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value;
 }
@@ -138,7 +140,7 @@ CUDF_HDI bool bit_value_or(bitmask_type const* bitmask, size_type bit_index, boo
  * @param n The number of least significant bits to set
  * @return A bitmask word with `n` least significant bits set
  */
-constexpr CUDF_HDI bitmask_type set_least_significant_bits(size_type n)
+constexpr CUDF_HOST_DEVICE inline bitmask_type set_least_significant_bits(size_type n)
 {
   constexpr_assert(0 <= n && n < static_cast<size_type>(detail::size_in_bits<bitmask_type>()));
   return ((bitmask_type{1} << n) - 1);
@@ -152,7 +154,7 @@ constexpr CUDF_HDI bitmask_type set_least_significant_bits(size_type n)
  * @param n The number of most significant bits to set
  * @return A bitmask word with `n` most significant bits set
  */
-constexpr CUDF_HDI bitmask_type set_most_significant_bits(size_type n)
+constexpr CUDF_HOST_DEVICE inline bitmask_type set_most_significant_bits(size_type n)
 {
   constexpr size_type word_size{detail::size_in_bits<bitmask_type>()};
   constexpr_assert(0 <= n && n < word_size);
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index d7d38aba4f3..0c6a6ee244c 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -420,7 +420,9 @@ using scalar_device_type_t = typename type_to_scalar_type_impl<T>::ScalarDeviceT
 template <template <cudf::type_id> typename IdTypeMap = id_to_type_impl,
           typename Functor,
           typename... Ts>
-CUDF_HDFI constexpr decltype(auto) type_dispatcher(cudf::data_type dtype, Functor f, Ts&&... args)
+CUDF_HOST_DEVICE __forceinline__ constexpr decltype(auto) type_dispatcher(cudf::data_type dtype,
+                                                                          Functor f,
+                                                                          Ts&&... args)
 {
   switch (dtype.id()) {
     case type_id::BOOL8:
@@ -531,7 +533,7 @@ template <typename T1>
 struct double_type_dispatcher_second_type {
 #pragma nv_exec_check_disable
   template <typename T2, typename F, typename... Ts>
-  CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const
+  CUDF_HOST_DEVICE __forceinline__ decltype(auto) operator()(F&& f, Ts&&... args) const
   {
     return f.template operator()<T1, T2>(std::forward<Ts>(args)...);
   }
@@ -541,7 +543,9 @@ template <template <cudf::type_id> typename IdTypeMap>
 struct double_type_dispatcher_first_type {
 #pragma nv_exec_check_disable
   template <typename T1, typename F, typename... Ts>
-  CUDF_HDFI decltype(auto) operator()(cudf::data_type type2, F&& f, Ts&&... args) const
+  CUDF_HOST_DEVICE __forceinline__ decltype(auto) operator()(cudf::data_type type2,
+                                                             F&& f,
+                                                             Ts&&... args) const
   {
     return type_dispatcher<IdTypeMap>(type2,
                                       detail::double_type_dispatcher_second_type<T1>{},
@@ -566,10 +570,8 @@ struct double_type_dispatcher_first_type {
  */
 #pragma nv_exec_check_disable
 template <template <cudf::type_id> typename IdTypeMap = id_to_type_impl, typename F, typename... Ts>
-CUDF_HDFI constexpr decltype(auto) double_type_dispatcher(cudf::data_type type1,
-                                                          cudf::data_type type2,
-                                                          F&& f,
-                                                          Ts&&... args)
+CUDF_HOST_DEVICE __forceinline__ constexpr decltype(auto) double_type_dispatcher(
+  cudf::data_type type1, cudf::data_type type2, F&& f, Ts&&... args)
 {
   return type_dispatcher<IdTypeMap>(type1,
                                     detail::double_type_dispatcher_first_type<IdTypeMap>{},
diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp
index fd83ab6d329..9f1c1534e9a 100644
--- a/cpp/include/cudf/wrappers/dictionary.hpp
+++ b/cpp/include/cudf/wrappers/dictionary.hpp
@@ -56,22 +56,22 @@ struct dictionary_wrapper {
   dictionary_wrapper& operator=(const dictionary_wrapper&) = default;
 
   // construct object from type
-  CUDF_HDI constexpr explicit dictionary_wrapper(value_type v) : _value{v} {}
+  CUDF_HOST_DEVICE inline constexpr explicit dictionary_wrapper(value_type v) : _value{v} {}
 
   // conversion operator
-  CUDF_HDI explicit operator value_type() const { return _value; }
+  CUDF_HOST_DEVICE inline explicit operator value_type() const { return _value; }
   // simple accessor
-  CUDF_HDI value_type value() const { return _value; }
+  CUDF_HOST_DEVICE inline value_type value() const { return _value; }
 
-  static CUDF_HDI constexpr value_type max_value()
+  static CUDF_HOST_DEVICE inline constexpr value_type max_value()
   {
     return std::numeric_limits<value_type>::max();
   }
-  static CUDF_HDI constexpr value_type min_value()
+  static CUDF_HOST_DEVICE inline constexpr value_type min_value()
   {
     return std::numeric_limits<value_type>::min();
   }
-  static CUDF_HDI constexpr value_type lowest_value()
+  static CUDF_HOST_DEVICE inline constexpr value_type lowest_value()
   {
     return std::numeric_limits<IndexType>::lowest();
   }
@@ -82,43 +82,43 @@ struct dictionary_wrapper {
 
 // comparison operators
 template <typename Integer>
-CUDF_HDI bool operator==(dictionary_wrapper<Integer> const& lhs,
-                         dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline bool operator==(dictionary_wrapper<Integer> const& lhs,
+                                        dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() == rhs.value();
 }
 
 template <typename Integer>
-CUDF_HDI bool operator!=(dictionary_wrapper<Integer> const& lhs,
-                         dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline bool operator!=(dictionary_wrapper<Integer> const& lhs,
+                                        dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() != rhs.value();
 }
 
 template <typename Integer>
-CUDF_HDI bool operator<=(dictionary_wrapper<Integer> const& lhs,
-                         dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline bool operator<=(dictionary_wrapper<Integer> const& lhs,
+                                        dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() <= rhs.value();
 }
 
 template <typename Integer>
-CUDF_HDI bool operator>=(dictionary_wrapper<Integer> const& lhs,
-                         dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline bool operator>=(dictionary_wrapper<Integer> const& lhs,
+                                        dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() >= rhs.value();
 }
 
 template <typename Integer>
-CUDF_HDI constexpr bool operator<(dictionary_wrapper<Integer> const& lhs,
-                                  dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline constexpr bool operator<(dictionary_wrapper<Integer> const& lhs,
+                                                 dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() < rhs.value();
 }
 
 template <typename Integer>
-CUDF_HDI bool operator>(dictionary_wrapper<Integer> const& lhs,
-                        dictionary_wrapper<Integer> const& rhs)
+CUDF_HOST_DEVICE inline bool operator>(dictionary_wrapper<Integer> const& lhs,
+                                       dictionary_wrapper<Integer> const& rhs)
 {
   return lhs.value() > rhs.value();
 }
diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh
index c4963268881..bd853314d10 100644
--- a/cpp/src/io/orc/timezone.cuh
+++ b/cpp/src/io/orc/timezone.cuh
@@ -57,10 +57,10 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years;
  *
  * @return GMT offset
  */
-CUDF_HDI int32_t get_gmt_offset_impl(int64_t const* ttimes,
-                                     int32_t const* offsets,
-                                     size_t count,
-                                     int64_t ts)
+CUDF_HOST_DEVICE inline int32_t get_gmt_offset_impl(int64_t const* ttimes,
+                                                    int32_t const* offsets,
+                                                    size_t count,
+                                                    int64_t ts)
 {
   // Returns start of the range if all elements are larger than the input timestamp
   auto last_less_equal_ttime_idx = [&](long begin_idx, long end_idx, int64_t ts) {
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 0c7c217c212..7af1e47087b 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -23,7 +23,7 @@
 namespace cudf {
 namespace detail {
 template <typename Result, typename T>
-CUDF_HDI Result get_array_value(T const* devarr, size_type location)
+CUDF_HOST_DEVICE inline Result get_array_value(T const* devarr, size_type location)
 {
   T result;
 #if defined(__CUDA_ARCH__)
@@ -36,7 +36,7 @@ CUDF_HDI Result get_array_value(T const* devarr, size_type location)
 
 namespace interpolate {
 template <typename Result, typename T>
-CUDF_HDI Result linear(T lhs, T rhs, double frac)
+CUDF_HOST_DEVICE inline Result linear(T lhs, T rhs, double frac)
 {
   // TODO: safe operation to avoid overflow/underflow
   // double can fully represent int8-32 value range.
@@ -52,7 +52,7 @@ CUDF_HDI Result linear(T lhs, T rhs, double frac)
 }
 
 template <typename Result, typename T>
-CUDF_HDI Result midpoint(T lhs, T rhs)
+CUDF_HOST_DEVICE inline Result midpoint(T lhs, T rhs)
 {
   // TODO: try std::midpoint (C++20) if available
   double dlhs = static_cast<double>(lhs);
@@ -61,7 +61,7 @@ CUDF_HDI Result midpoint(T lhs, T rhs)
 }
 
 template <typename Result>
-CUDF_HDI Result midpoint(int64_t lhs, int64_t rhs)
+CUDF_HOST_DEVICE inline Result midpoint(int64_t lhs, int64_t rhs)
 {
   // caring to avoid integer overflow and underflow between int64_t and Result( double )
   int64_t half = lhs / 2 + rhs / 2;
@@ -70,7 +70,7 @@ CUDF_HDI Result midpoint(int64_t lhs, int64_t rhs)
 }
 
 template <>
-CUDF_HDI int64_t midpoint(int64_t lhs, int64_t rhs)
+CUDF_HOST_DEVICE inline int64_t midpoint(int64_t lhs, int64_t rhs)
 {
   // caring to avoid integer overflow
   int64_t half   = lhs / 2 + rhs / 2;
@@ -92,8 +92,7 @@ struct quantile_index {
   size_type nearest;
   double fraction;
 
-  CUDF_HDI
-  quantile_index(size_type count, double quantile)
+  CUDF_HOST_DEVICE inline quantile_index(size_type count, double quantile)
   {
     quantile = std::min(std::max(quantile, 0.0), 1.0);
 
@@ -123,8 +122,10 @@ struct quantile_index {
  * @returns Value of the desired quantile.
  */
 template <typename Result, typename ValueAccessor>
-CUDF_HDI Result
-select_quantile(ValueAccessor get_value, size_type size, double q, interpolation interp)
+CUDF_HOST_DEVICE inline Result select_quantile(ValueAccessor get_value,
+                                               size_type size,
+                                               double q,
+                                               interpolation interp)
 {
   if (size < 2) { return get_value(0); }
 
@@ -154,7 +155,10 @@ select_quantile(ValueAccessor get_value, size_type size, double q, interpolation
 }
 
 template <typename Result, typename Iterator>
-CUDF_HDI Result select_quantile_data(Iterator begin, size_type size, double q, interpolation interp)
+CUDF_HOST_DEVICE inline Result select_quantile_data(Iterator begin,
+                                                    size_type size,
+                                                    double q,
+                                                    interpolation interp)
 {
   if (size == 0) return static_cast<Result>(*begin);
 
@@ -183,10 +187,10 @@ CUDF_HDI Result select_quantile_data(Iterator begin, size_type size, double q, i
 }
 
 template <typename Iterator>
-CUDF_HDI bool select_quantile_validity(Iterator begin,
-                                       size_type size,
-                                       double q,
-                                       interpolation interp)
+CUDF_HOST_DEVICE inline bool select_quantile_validity(Iterator begin,
+                                                      size_type size,
+                                                      double q,
+                                                      interpolation interp)
 {
   quantile_index idx(size, q);
 
diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp
index 4a55615476f..80a9397922e 100644
--- a/cpp/src/rolling/rolling_detail.hpp
+++ b/cpp/src/rolling/rolling_detail.hpp
@@ -29,7 +29,7 @@ namespace detail {
 // store functor
 template <typename T, bool is_mean = false>
 struct rolling_store_output_functor {
-  CUDF_HDI void operator()(T& out, T& val, size_type count) { out = val; }
+  CUDF_HOST_DEVICE inline void operator()(T& out, T& val, size_type count) { out = val; }
 };
 
 // Specialization for MEAN
@@ -38,21 +38,21 @@ struct rolling_store_output_functor<_T, true> {
   // SFINAE for non-bool types
   template <typename T                                                             = _T,
             std::enable_if_t<!(cudf::is_boolean<T>() || cudf::is_timestamp<T>())>* = nullptr>
-  CUDF_HDI void operator()(T& out, T& val, size_type count)
+  CUDF_HOST_DEVICE inline void operator()(T& out, T& val, size_type count)
   {
     out = val / count;
   }
 
   // SFINAE for bool type
   template <typename T = _T, std::enable_if_t<cudf::is_boolean<T>()>* = nullptr>
-  CUDF_HDI void operator()(T& out, T& val, size_type count)
+  CUDF_HOST_DEVICE inline void operator()(T& out, T& val, size_type count)
   {
     out = static_cast<int32_t>(val) / count;
   }
 
   // SFINAE for timestamp types
   template <typename T = _T, std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
-  CUDF_HDI void operator()(T& out, T& val, size_type count)
+  CUDF_HOST_DEVICE inline void operator()(T& out, T& val, size_type count)
   {
     out = static_cast<T>(val.time_since_epoch() / count);
   }
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 0f24793e589..c61fb8905f5 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -72,19 +72,22 @@ enum class parse_result {
  */
 class parser {
  protected:
-  CUDF_HDI parser() : input(nullptr), input_len(0), pos(nullptr) {}
-  CUDF_HDI parser(const char* _input, int64_t _input_len)
+  CUDF_HOST_DEVICE inline parser() : input(nullptr), input_len(0), pos(nullptr) {}
+  CUDF_HOST_DEVICE inline parser(const char* _input, int64_t _input_len)
     : input(_input), input_len(_input_len), pos(_input)
   {
     parse_whitespace();
   }
 
-  CUDF_HDI parser(parser const& p) : input(p.input), input_len(p.input_len), pos(p.pos) {}
+  CUDF_HOST_DEVICE inline parser(parser const& p)
+    : input(p.input), input_len(p.input_len), pos(p.pos)
+  {
+  }
 
-  CUDF_HDI bool eof(const char* p) { return p - input >= input_len; }
-  CUDF_HDI bool eof() { return eof(pos); }
+  CUDF_HOST_DEVICE inline bool eof(const char* p) { return p - input >= input_len; }
+  CUDF_HOST_DEVICE inline bool eof() { return eof(pos); }
 
-  CUDF_HDI bool parse_whitespace()
+  CUDF_HOST_DEVICE inline bool parse_whitespace()
   {
     while (!eof()) {
       if (is_whitespace(*pos)) {
@@ -96,12 +99,12 @@ class parser {
     return false;
   }
 
-  CUDF_HDI bool is_hex_digit(char c)
+  CUDF_HOST_DEVICE inline bool is_hex_digit(char c)
   {
     return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
   }
 
-  CUDF_HDI int64_t chars_left() { return input_len - ((pos - input) + 1); }
+  CUDF_HOST_DEVICE inline int64_t chars_left() { return input_len - ((pos - input) + 1); }
 
   /**
    * @brief Parse an escape sequence.
@@ -111,7 +114,7 @@ class parser {
    *
    * @returns True on success or false on fail.
    */
-  CUDF_HDI bool parse_escape_seq()
+  CUDF_HOST_DEVICE inline bool parse_escape_seq()
   {
     if (*pos != '\\') { return false; }
     char c = *++pos;
@@ -144,7 +147,7 @@ class parser {
    * indicates allowing either single or double quotes (but not a mixture of both).
    * @returns A result code indicating success, failure or other result.
    */
-  CUDF_HDI parse_result parse_string(string_view& str, bool can_be_empty, char quote)
+  CUDF_HOST_DEVICE inline parse_result parse_string(string_view& str, bool can_be_empty, char quote)
   {
     str = string_view(nullptr, 0);
 
@@ -178,7 +181,7 @@ class parser {
   int64_t input_len;
   char const* pos;
 
-  CUDF_HDI bool is_whitespace(char c) { return c <= ' '; }
+  CUDF_HOST_DEVICE inline bool is_whitespace(char c) { return c <= ' '; }
 };
 
 /**
@@ -365,7 +368,7 @@ class json_state : private parser {
    * to not be present.
    * @returns A result code indicating success, failure or other result.
    */
-  CUDF_HDI parse_result parse_name(string_view& name, bool can_be_empty)
+  CUDF_HOST_DEVICE inline parse_result parse_name(string_view& name, bool can_be_empty)
   {
     char const quote = options.get_allow_single_quotes() ? 0 : '\"';
 
@@ -394,7 +397,7 @@ class json_state : private parser {
    * @param val (Output) The string containing the parsed value
    * @returns A result code indicating success, failure or other result.
    */
-  CUDF_HDI parse_result parse_non_string_value(string_view& val)
+  CUDF_HOST_DEVICE inline parse_result parse_non_string_value(string_view& val)
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
 
@@ -478,7 +481,7 @@ class json_state : private parser {
     return parse_result::SUCCESS;
   }
 
-  CUDF_HDI bool is_quote(char c)
+  CUDF_HOST_DEVICE inline bool is_quote(char c)
   {
     return (c == '\"') || (options.get_allow_single_quotes() && (c == '\''));
   }
@@ -498,8 +501,12 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR,
  * an array of these operators applied to the incoming json string,
  */
 struct path_operator {
-  CUDF_HDI path_operator() : type(path_operator_type::ERROR), index(-1), expected_type{NONE} {}
-  CUDF_HDI path_operator(path_operator_type _type, json_element_type _expected_type = NONE)
+  CUDF_HOST_DEVICE inline path_operator()
+    : type(path_operator_type::ERROR), index(-1), expected_type{NONE}
+  {
+  }
+  CUDF_HOST_DEVICE inline path_operator(path_operator_type _type,
+                                        json_element_type _expected_type = NONE)
     : type(_type), index(-1), expected_type{_expected_type}
   {
   }
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index c6e0dc12df5..d6b8307c3fb 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -142,7 +142,7 @@ class reprog_device {
   /**
    * @brief Returns the number of regex groups found in the expression.
    */
-  CUDF_HDI int32_t group_counts() const { return _num_capturing_groups; }
+  CUDF_HOST_DEVICE inline int32_t group_counts() const { return _num_capturing_groups; }
 
   /**
    * @brief Returns the regex instruction object for a given index.
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 3ea90813ce2..0e11e9c1bbd 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -45,12 +45,12 @@ struct alignas(8) relist {
   int16_t* inst_ids{};  // one per instruction
   u_char* mask{};       // bit per instruction
 
-  CUDF_HDI static int32_t data_size_for(int32_t insts)
+  CUDF_HOST_DEVICE inline static int32_t data_size_for(int32_t insts)
   {
     return ((sizeof(ranges[0]) + sizeof(inst_ids[0])) * insts) + ((insts + 7) / 8);
   }
 
-  CUDF_HDI static int32_t alloc_size(int32_t insts)
+  CUDF_HOST_DEVICE inline static int32_t alloc_size(int32_t insts)
   {
     int32_t size = sizeof(relist);
     size += data_size_for(insts);
@@ -58,9 +58,9 @@ struct alignas(8) relist {
     return size;
   }
 
-  CUDF_HDI relist() {}
+  CUDF_HOST_DEVICE inline relist() {}
 
-  CUDF_HDI relist(int16_t insts, u_char* data = nullptr) : listsize(insts)
+  CUDF_HOST_DEVICE inline relist(int16_t insts, u_char* data = nullptr) : listsize(insts)
   {
     auto ptr = data == nullptr ? reinterpret_cast<u_char*>(this) + sizeof(relist) : data;
     ranges   = reinterpret_cast<int2*>(ptr);
@@ -71,7 +71,7 @@ struct alignas(8) relist {
     reset();
   }
 
-  CUDF_HDI void reset()
+  CUDF_HOST_DEVICE inline void reset()
   {
     memset(mask, 0, (listsize + 7) / 8);
     size = 0;
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index f3e12a02bbe..e8102dee2a2 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -39,8 +39,7 @@ template <typename ElementType>
 struct transformer_optional_meanvar {
   using ResultType = thrust::optional<cudf::meanvar<ElementType>>;
 
-  CUDF_HDI
-  ResultType operator()(thrust::optional<ElementType> const& optional)
+  CUDF_HOST_DEVICE inline ResultType operator()(thrust::optional<ElementType> const& optional)
   {
     if (optional.has_value()) {
       auto v = *optional;
@@ -52,7 +51,7 @@ struct transformer_optional_meanvar {
 
 template <typename T>
 struct optional_to_meanvar {
-  CUDF_HDI T operator()(const thrust::optional<T>& v) { return v.value_or(T{0}); }
+  CUDF_HOST_DEVICE inline T operator()(const thrust::optional<T>& v) { return v.value_or(T{0}); }
 };
 
 // TODO: enable this test also at __CUDACC_DEBUG__
diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu
index 9edda806a43..99c3bfc2eb4 100644
--- a/cpp/tests/iterator/pair_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu
@@ -36,8 +36,7 @@ template <typename ElementType>
 struct transformer_pair_meanvar {
   using ResultType = thrust::pair<cudf::meanvar<ElementType>, bool>;
 
-  CUDF_HDI
-  ResultType operator()(thrust::pair<ElementType, bool> const& pair)
+  CUDF_HOST_DEVICE inline ResultType operator()(thrust::pair<ElementType, bool> const& pair)
   {
     ElementType v = pair.first;
     return {{v, static_cast<ElementType>(v * v), (pair.second) ? 1 : 0}, pair.second};
@@ -46,8 +45,8 @@ struct transformer_pair_meanvar {
 
 struct sum_if_not_null {
   template <typename T>
-  CUDF_HDI thrust::pair<T, bool> operator()(const thrust::pair<T, bool>& lhs,
-                                            const thrust::pair<T, bool>& rhs)
+  CUDF_HOST_DEVICE inline thrust::pair<T, bool> operator()(const thrust::pair<T, bool>& lhs,
+                                                           const thrust::pair<T, bool>& rhs)
   {
     if (lhs.second & rhs.second)
       return {lhs.first + rhs.first, true};