Preserve float16 upcasting (#9069)

Fixes: #9065. This PR uses `np.dtype` only in the `__cuda_array_interface__` scenario in `as_column`. The dtype in this array interface is guaranteed to be numeric, which `np.dtype` can handle. Also, `float16` dtype upcasting logic is already in place below, i.e., at line 1760. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: #9069
rapidsai · Aug 26, 2021 · 263190a · 263190a
1 parent 0ad36ff
commit 263190a
Show file tree

Hide file tree

Showing 26 changed files with 177 additions and 134 deletions.
diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py
@@ -13,7 +13,7 @@
     pyarrow_to_pandas,
 )
 from cudf.testing import dataset_generator as dg
-from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
+from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -100,7 +100,7 @@ def set_rand_params(self, params):
                         dtype_val = {
                             col_name: "category"
                             if cudf.utils.dtypes.is_categorical_dtype(dtype)
-                            else pandas_dtypes_to_cudf_dtypes[dtype]
+                            else pandas_dtypes_to_np_dtypes[dtype]
                             for col_name, dtype in dtype_val.items()
                         }
                     params_dict[param] = dtype_val

diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
@@ -14,7 +14,7 @@
     pyarrow_to_pandas,
 )
 from cudf.testing import dataset_generator as dg
-from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
+from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -31,7 +31,7 @@ def _get_dtype_param_value(dtype_val):
                 processed_dtypes[col_name] = "category"
             else:
                 processed_dtypes[col_name] = str(
-                    pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
+                    pandas_dtypes_to_np_dtypes.get(dtype, dtype)
                 )
         return processed_dtypes
     return dtype_val

diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -11,7 +11,7 @@
 import cudf
 from cudf.testing._utils import assert_eq
 from cudf.utils.dtypes import (
-    pandas_dtypes_to_cudf_dtypes,
+    pandas_dtypes_to_np_dtypes,
     pyarrow_dtypes_to_pandas_dtypes,
 )
 
@@ -218,7 +218,7 @@ def convert_nulls_to_none(records, df):
     scalar_columns_convert = [
         col
         for col in df.columns
-        if df[col].dtype in pandas_dtypes_to_cudf_dtypes
+        if df[col].dtype in pandas_dtypes_to_np_dtypes
         or pd.api.types.is_datetime64_dtype(df[col].dtype)
         or pd.api.types.is_timedelta64_dtype(df[col].dtype)
     ]
@@ -263,7 +263,7 @@ def _null_to_None(value):
     has_nulls_or_nullable_dtype = any(
         [
             True
-            if df[col].dtype in pandas_dtypes_to_cudf_dtypes
+            if df[col].dtype in pandas_dtypes_to_np_dtypes
             or df[col].isnull().any()
             else False
             for col in df.columns

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
@@ -11,7 +11,11 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.types import NullHandling, cudf_to_np_types, np_to_cudf_types
+from cudf._lib.types import (
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
+    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
+    NullHandling,
+)
 from cudf.utils import cudautils
 
 from cudf._lib.types cimport (
@@ -281,15 +285,15 @@ cdef class Aggregation:
         compiled_op = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf.dtype(compiled_op[1])
         cpp_str = compiled_op[0].encode('UTF-8')
-        if output_np_dtype not in np_to_cudf_types:
+        if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
             raise TypeError(
                 "Result of window function has unsupported dtype {}"
                 .format(op[1])
             )
         tid = (
             <libcudf_types.type_id> (
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[output_np_dtype]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
                 )
             )
         )
@@ -425,15 +429,15 @@ cdef class RollingAggregation:
         compiled_op = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf.dtype(compiled_op[1])
         cpp_str = compiled_op[0].encode('UTF-8')
-        if output_np_dtype not in np_to_cudf_types:
+        if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
             raise TypeError(
                 "Result of window function has unsupported dtype {}"
                 .format(op[1])
             )
         tid = (
             <libcudf_types.type_id> (
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[output_np_dtype]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
                 )
             )
         )

diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
@@ -16,7 +16,7 @@ from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -212,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
     cdef type_id tid = (
         <type_id> (
             <underlying_type_t_type_id> (
-                np_to_cudf_types[cudf.dtype(dtype)]
+                SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype(dtype)]
             )
         )
     )

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
@@ -30,7 +30,10 @@ from cudf._lib.cpp.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
 )
 
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
+from cudf._lib.types import (
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
+    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
+)
 
 from cudf._lib.types cimport (
     dtype_from_column_view,

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
@@ -8,11 +8,8 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-
-from cudf.utils.dtypes import is_struct_dtype
-
 from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
@@ -45,15 +42,15 @@ from cudf._lib.io.utils cimport (
 )
 from cudf._lib.table cimport Table
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.types cimport underlying_type_t_type_id
 
 import numpy as np
 
 from cudf._lib.utils cimport data_from_unique_ptr, get_column_names
 
-from cudf._lib.utils import _index_level_name, generate_pandas_metadata
+from cudf._lib.utils import generate_pandas_metadata
 
 
 cpdef read_raw_orc_statistics(filepath_or_buffer):
@@ -97,7 +94,9 @@ cpdef read_orc(object filepaths_or_buffers,
             if timestamp_type is None else
             <type_id>(
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[cudf.dtype(timestamp_type)]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                        cudf.dtype(timestamp_type)
+                    ]
                 )
             )
         ),

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
@@ -1,7 +1,6 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import cudf
-from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils.dtypes import is_decimal_dtype
 
 from cudf._lib.column cimport Column
@@ -12,7 +11,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.types cimport data_type, type_id
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
@@ -76,7 +75,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         scale = -c_result.get()[0].type().scale()
         precision = _reduce_precision(col_dtype, reduction_op, len(incol))
         py_result = DeviceScalar.from_unique_ptr(
-            move(c_result), dtype=Decimal64Dtype(precision, scale)
+            move(c_result), dtype=cudf.Decimal64Dtype(precision, scale)
         )
     else:
         py_result = DeviceScalar.from_unique_ptr(move(c_result))
@@ -160,4 +159,4 @@ def _reduce_precision(dtype, op, nrows):
         new_p = 2 * p + nrows
     else:
         raise NotImplementedError()
-    return max(min(new_p, Decimal64Dtype.MAX_PRECISION), 0)
+    return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0)
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
@@ -21,7 +21,7 @@ from libcpp.utility cimport move
 
 import cudf
 from cudf._lib.types import (
-    cudf_to_np_types,
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
     datetime_unit_map,
     duration_unit_map,
 )
@@ -199,7 +199,7 @@ cdef class DeviceScalar:
                 )
             else:
                 s._dtype = ListDtype(
-                    cudf_to_np_types[
+                    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                         <underlying_type_t_type_id>(
                             (<list_scalar*>s.get_raw_ptr())[0]
                             .view().type().id()
@@ -210,7 +210,7 @@ cdef class DeviceScalar:
             if dtype is not None:
                 s._dtype = dtype
             else:
-                s._dtype = cudf_to_np_types[
+                s._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                     <underlying_type_t_type_id>(cdtype.id())
                 ]
         return s

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
@@ -8,7 +8,7 @@ from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -72,7 +72,7 @@ def string_to_floating(Column input_col, object out_type):
     cdef unique_ptr[column] c_result
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[out_type]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
         )
     )
     cdef data_type c_out_type = data_type(tid)
@@ -165,7 +165,7 @@ def string_to_integer(Column input_col, object out_type):
     cdef unique_ptr[column] c_result
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[out_type]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
         )
     )
     cdef data_type c_out_type = data_type(tid)
@@ -552,7 +552,7 @@ def timestamp2int(Column input_col, dtype, format):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[dtype]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
         )
     )
     cdef data_type out_type = data_type(tid)
@@ -617,7 +617,7 @@ def timedelta2int(Column input_col, dtype, format):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[dtype]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
         )
     )
     cdef data_type out_type = data_type(tid)
@@ -744,7 +744,9 @@ def htoi(Column input_col, **kwargs):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                kwargs.get('dtype', cudf.dtype("int64"))
+            ]
         )
     )
     cdef data_type c_out_type = data_type(tid)

diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -4,7 +4,7 @@ import numpy as np
 
 from cudf._lib.column cimport Column
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
@@ -20,7 +20,7 @@ from cudf.core.buffer import Buffer
 
 from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -103,7 +103,9 @@ def transform(Column input, op):
 
     try:
         c_tid = <type_id> (
-            <underlying_type_t_type_id> np_to_cudf_types[np_dtype]
+            <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                np_dtype
+            ]
         )
         c_dtype = data_type(c_tid)
 
@@ -131,7 +133,9 @@ def masked_udf(Table incols, op, output_type):
     cdef data_type c_dtype
 
     c_tid = <type_id> (
-        <underlying_type_t_type_id> np_to_cudf_types[output_type]
+        <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+            output_type
+        ]
     )
     c_dtype = data_type(c_tid)