fixed_point + cudf::binary_operation API Changes (rapidsai#7435)
This resolves rapidsai#7442

While recently working with @razajafri on `fixed_point` binary ops, it became clear that `cudf::binary_operation` breaks the "easy to use, **hard to misuse**" #1 design guideline. I knew about this but had slotted it as technical debt to be cleaned up later. Long story short, after discussions with @razajafri and @jrhemstad and the comments on rapidsai#7442, we will implement the following:

* [x] For `fixed_point` + `cudf::binary_operation` + `DIV` always **use** the `cudf::data_type output_type` parameter
* [x] ~~For `fixed_point` + `cudf::binary_operation` + `TRUE_DIV`, require that the columns/scalars provided as arguments (`lhs` and `rhs`) will result in the specified `data_type`/`scale`~~
* [x] Provide a convenience function (something like `binary_operation_fixed_point_scale()`) that will compute the "expected" scale given two input columns/scalars and a `binary_operator` (see the usage sketch after this list)
* [x] Remove `TRUE_DIV`
* [x] Add unit tests for different output data_types
* [x] Update Python/Cython 

**This will be a breaking change for all `fixed_point` + `cudf::binary_operation`.**
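
As a rough sketch of the new calling pattern (the values, scales, and the `div_with_explicit_output_type` wrapper function below are illustrative and not taken from this PR; the two helpers it calls are the ones declared in the `binaryop.hpp` diff further down):

```cpp
// Illustrative sketch only: column values, scales, and the wrapper function are made up.
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/types.hpp>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::column> div_with_explicit_output_type()
{
  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int32_t>;
  using numeric::scale_type;

  auto const lhs = fp_wrapper{{100, 200, 300}, scale_type{-2}};  // 1.00, 2.00, 3.00
  auto const rhs = fp_wrapper{{20, 20, 20}, scale_type{-1}};     // 2.0, 2.0, 2.0

  // Ask libcudf for the expected result type instead of guessing the scale.
  auto const out_type = cudf::binary_operation_fixed_point_output_type(
    cudf::binary_operator::DIV,
    cudf::data_type{cudf::type_id::DECIMAL32, -2},
    cudf::data_type{cudf::type_id::DECIMAL32, -1});

  // The output_type argument is now always honored for fixed_point DIV;
  // passing a default-constructed data_type (`{}`) is no longer the pattern.
  return cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, out_type);
}
```

When only the scale is needed, `binary_operation_fixed_point_scale()` returns the `int32_t` scale and the caller constructs the `cudf::data_type` directly.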

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - Keith Kraus (@kkraus14)
  - Mike Wilson (@hyperbolic2346)

URL: rapidsai#7435
codereport authored and hyperbolic2346 committed Mar 23, 2021
1 parent 7d3420a commit 9db65f8
Showing 12 changed files with 420 additions and 293 deletions.
24 changes: 24 additions & 0 deletions cpp/include/cudf/binaryop.hpp
@@ -178,5 +178,29 @@ std::unique_ptr<column> binary_operation(
  data_type output_type,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param left_scale Scale of left `fixed_point` number
* @param right_scale Scale of right `fixed_point` number
* @return The resulting `scale` of the computed `fixed_point` number
*/
int32_t binary_operation_fixed_point_scale(binary_operator op,
                                           int32_t left_scale,
                                           int32_t right_scale);

/**
* @brief Computes the `data_type` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param lhs `cudf::data_type` of left `fixed_point` number
* @param rhs `cudf::data_type` of right `fixed_point` number
* @return The resulting `cudf::data_type` of the computed `fixed_point` number
*/
cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
                                                          cudf::data_type const& lhs,
                                                          cudf::data_type const& rhs);

/** @} */ // end of group
} // namespace cudf
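
For intuition about what these helpers compute without opening `binaryop.cpp` (whose large diff is not rendered below), here is a standalone sketch of the conventional fixed_point scale rules; this is an illustrative assumption, not the libcudf implementation:

```cpp
// Standalone illustration (NOT the libcudf implementation): conventional
// fixed_point scale arithmetic, where a value is rep * 10^scale.
#include <algorithm>
#include <cstdint>

enum class fp_op { ADD, SUB, MUL, DIV };

int32_t sketch_fixed_point_scale(fp_op op, int32_t left_scale, int32_t right_scale)
{
  switch (op) {
    case fp_op::MUL: return left_scale + right_scale;           // 10^a * 10^b = 10^(a+b)
    case fp_op::DIV: return left_scale - right_scale;           // 10^a / 10^b = 10^(a-b)
    default:         return std::min(left_scale, right_scale);  // ADD/SUB align to the finer scale
  }
}
```

The Python `_binop_scale` helper added in `decimal.py` below encodes the same add/sub/mul rules with the sign flipped, since `Decimal64Dtype.scale` counts fractional digits while libcudf's scale is the power-of-ten exponent (note the `-dtype.scale` in `dtype_to_data_type`).
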
260 changes: 87 additions & 173 deletions cpp/src/binaryop/binaryop.cpp

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions cpp/src/unary/cast_ops.cu
@@ -178,11 +178,13 @@ std::unique_ptr<column> rescale(column_view input,

if (input.type().scale() > scale) {
auto const scalar = make_fixed_point_scalar<T>(0, scale_type{scale});
return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr);
} else {
auto const diff = input.type().scale() - scale;
auto const scalar = make_fixed_point_scalar<T>(std::pow(10, -diff), scale_type{diff});
return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr);
}
};

312 changes: 239 additions & 73 deletions cpp/tests/binaryop/binop-integration-test.cpp

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions cpp/tests/fixed_point/fixed_point_tests.cu
@@ -596,8 +596,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL32, 6};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -618,8 +619,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL64, 100};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -630,6 +632,7 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
// This is testing fixed_point values with an extremely large negative scale. The fixed_point
// implementation should be able to handle any scale representable by an int32_t

using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
using fp_wrapper = cudf::test::fixed_point_column_wrapper<TypeParam>;

auto const a = fp_wrapper{{10}, scale_type{-201}};
@@ -639,8 +642,11 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
auto const expected2 = fp_wrapper{{5}, scale_type{-201}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type1 = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);

auto const type2 = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
11 changes: 2 additions & 9 deletions python/cudf/cudf/_lib/binaryop.pyx
@@ -13,7 +13,7 @@ from cudf._lib.replace import replace_nulls
from cudf._lib.scalar import as_device_scalar
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.scalar.scalar cimport scalar
@@ -174,15 +174,8 @@ def binaryop(lhs, rhs, op, dtype):
cdef binary_operator c_op = <binary_operator> (
<underlying_type_t_binary_operator> op
)
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)

cdef data_type c_dtype = data_type(tid)
cdef data_type c_dtype = dtype_to_data_type(dtype)

if is_scalar(lhs) or lhs is None:
is_string_col = is_string_dtype(rhs.dtype)
26 changes: 5 additions & 21 deletions python/cudf/cudf/_lib/column.pyx
@@ -32,7 +32,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer
from cudf._lib.types import np_to_cudf_types, cudf_to_np_types
from cudf._lib.types cimport (
underlying_type_t_type_id,
dtype_from_column_view
dtype_from_column_view,
dtype_to_data_type
)
from cudf._lib.null_mask import bitmask_allocation_size_bytes

@@ -378,29 +379,12 @@ cdef class Column:
cdef column_view _view(self, libcudf_types.size_type null_count) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
else:
col = self
data_dtype = self.dtype

data_dtype = col.dtype
cdef libcudf_types.type_id tid

if is_list_dtype(self.dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(self.dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(self.dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(data_dtype)]
)
)
cdef libcudf_types.data_type dtype = (
libcudf_types.data_type(tid, -self.dtype.scale)
if tid == libcudf_types.type_id.DECIMAL64
else libcudf_types.data_type(tid)
)
cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
cdef vector[column_view] children
cdef void* data
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/types.pxd
@@ -4,7 +4,7 @@ from libc.stdint cimport int32_t
from libcpp cimport bool
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view

cimport cudf._lib.cpp.types as libcudf_types

ctypedef bool underlying_type_t_order
ctypedef bool underlying_type_t_null_order
@@ -14,3 +14,5 @@ ctypedef int32_t underlying_type_t_type_id
ctypedef bool underlying_type_t_null_policy

cdef dtype_from_column_view(column_view cv)

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
21 changes: 19 additions & 2 deletions python/cudf/cudf/_lib/types.pyx
@@ -15,6 +15,7 @@ from cudf._lib.types cimport (
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype

cimport cudf._lib.cpp.types as libcudf_types

@@ -192,8 +193,7 @@ cdef dtype_from_structs_column_view(column_view cv):

cdef dtype_from_decimal_column_view(column_view cv):
scale = -cv.type().scale()
precision = 18 # max of 64 bit integer
return Decimal64Dtype(precision=precision, scale=scale)
return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)

cdef dtype_from_column_view(column_view cv):
cdef libcudf_types.type_id tid = cv.type().id()
@@ -208,3 +208,20 @@ cdef dtype_from_column_view(column_view cv):
"Use decimal64 instead")
else:
return cudf_to_np_types[<underlying_type_t_type_id>(tid)]

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
if is_list_dtype(dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]))

if tid == libcudf_types.type_id.DECIMAL64:
return libcudf_types.data_type(tid, -dtype.scale)
else:
return libcudf_types.data_type(tid)
18 changes: 17 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
@@ -64,7 +64,11 @@ def to_arrow(self):
def binary_operator(self, op, other, reflect=False):
if reflect:
self, other = other, self
result = libcudf.binaryop.binaryop(self, other, op, "int32")
scale = _binop_scale(self.dtype, other.dtype, op)
output_type = Decimal64Dtype(
scale=scale, precision=Decimal64Dtype.MAX_PRECISION
) # precision will be ignored, libcudf has no notion of precision
result = libcudf.binaryop.binaryop(self, other, op, output_type)
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

@@ -99,6 +103,18 @@ def as_string_column(
)


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
# binary_operation_fixed_point_scale
s1, s2 = l_dtype.scale, r_dtype.scale
if op in ("add", "sub"):
return max(s1, s2)
elif op == "mul":
return s1 + s2
else:
raise NotImplementedError()


def _binop_precision(l_dtype, r_dtype, op):
"""
Returns the result precision when performing the
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/dtypes.py
@@ -235,7 +235,7 @@ class Decimal64Dtype(ExtensionDtype):

name = "decimal"
_metadata = ("precision", "scale")
_MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))

def __init__(self, precision, scale=0):
"""
@@ -303,10 +303,10 @@ def __hash__(self):

@classmethod
def _validate(cls, precision, scale=0):
if precision > Decimal64Dtype._MAX_PRECISION:
if precision > Decimal64Dtype.MAX_PRECISION:
raise ValueError(
f"Cannot construct a {cls.__name__}"
f" with precision > {cls._MAX_PRECISION}"
f" with precision > {cls.MAX_PRECISION}"
)
if abs(scale) > precision:
raise ValueError(f"scale={scale} exceeds precision={precision}")
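
A side note on the renamed constant (not part of the diff): `MAX_PRECISION` evaluates to 18, the largest number of decimal digits guaranteed to fit in a signed 64-bit integer; the equivalent arithmetic in C++ is:

```cpp
// floor(log10(INT64_MAX)) == 18: INT64_MAX = 9223372036854775807 has 19 digits,
// but not every 19-digit value fits in int64_t, so 18 is the safe precision.
#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
  std::cout << std::floor(std::log10(static_cast<double>(INT64_MAX))) << '\n';  // prints 18
}
```
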
7 changes: 5 additions & 2 deletions python/cudf/cudf/utils/dtypes.py
@@ -13,7 +13,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
from cudf._lib.scalar import DeviceScalar
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"
@@ -331,7 +331,10 @@ def to_cudf_compatible_scalar(val, dtype=None):
If `val` is None, returns None.
"""
if _is_null_host_scalar(val) or isinstance(val, cudf.Scalar):

if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
val, cudf.Scalar
):
return val

if not is_scalar(val):
