Skip to content

Commit

Permalink
Preserve float16 upscaling (#9069)
Browse files Browse the repository at this point in the history
Fixes: #9065 

This PR makes `as_column` use `np.dtype` only in the `__cuda_array_interface__` scenario. The dtype in this array interface is guaranteed to be numeric, which `np.dtype` can handle. The `float16` upcasting logic is also already in place further down, i.e., at line 1760.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #9069
  • Loading branch information
galipremsagar authored Aug 26, 2021
1 parent 0ad36ff commit 263190a
Show file tree
Hide file tree
Showing 26 changed files with 177 additions and 134 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
pyarrow_to_pandas,
)
from cudf.testing import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand Down Expand Up @@ -100,7 +100,7 @@ def set_rand_params(self, params):
dtype_val = {
col_name: "category"
if cudf.utils.dtypes.is_categorical_dtype(dtype)
else pandas_dtypes_to_cudf_dtypes[dtype]
else pandas_dtypes_to_np_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
params_dict[param] = dtype_val
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
pyarrow_to_pandas,
)
from cudf.testing import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand All @@ -31,7 +31,7 @@ def _get_dtype_param_value(dtype_val):
processed_dtypes[col_name] = "category"
else:
processed_dtypes[col_name] = str(
pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
pandas_dtypes_to_np_dtypes.get(dtype, dtype)
)
return processed_dtypes
return dtype_val
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
pandas_dtypes_to_cudf_dtypes,
pandas_dtypes_to_np_dtypes,
pyarrow_dtypes_to_pandas_dtypes,
)

Expand Down Expand Up @@ -218,7 +218,7 @@ def convert_nulls_to_none(records, df):
scalar_columns_convert = [
col
for col in df.columns
if df[col].dtype in pandas_dtypes_to_cudf_dtypes
if df[col].dtype in pandas_dtypes_to_np_dtypes
or pd.api.types.is_datetime64_dtype(df[col].dtype)
or pd.api.types.is_timedelta64_dtype(df[col].dtype)
]
Expand Down Expand Up @@ -263,7 +263,7 @@ def _null_to_None(value):
has_nulls_or_nullable_dtype = any(
[
True
if df[col].dtype in pandas_dtypes_to_cudf_dtypes
if df[col].dtype in pandas_dtypes_to_np_dtypes
or df[col].isnull().any()
else False
for col in df.columns
Expand Down
14 changes: 9 additions & 5 deletions python/cudf/cudf/_lib/aggregation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.types import NullHandling, cudf_to_np_types, np_to_cudf_types
from cudf._lib.types import (
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
NullHandling,
)
from cudf.utils import cudautils

from cudf._lib.types cimport (
Expand Down Expand Up @@ -281,15 +285,15 @@ cdef class Aggregation:
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
raise TypeError(
"Result of window function has unsupported dtype {}"
.format(op[1])
)
tid = (
<libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[output_np_dtype]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
)
)
)
Expand Down Expand Up @@ -425,15 +429,15 @@ cdef class RollingAggregation:
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
raise TypeError(
"Result of window function has unsupported dtype {}"
.format(op[1])
)
tid = (
<libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[output_np_dtype]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
)
)
)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ from cudf._lib.scalar import as_device_scalar

from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
Expand Down Expand Up @@ -212,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[cudf.dtype(dtype)]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype(dtype)]
)
)
)
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ from cudf._lib.cpp.strings.convert.convert_integers cimport (
from_integers as cpp_from_integers,
)

from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
from cudf._lib.types import (
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
)

from cudf._lib.types cimport (
dtype_from_column_view,
Expand Down
13 changes: 6 additions & 7 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,8 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.cpp.column.column cimport column

from cudf.utils.dtypes import is_struct_dtype

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.io.orc cimport (
chunked_orc_writer_options,
orc_chunked_writer,
Expand Down Expand Up @@ -45,15 +42,15 @@ from cudf._lib.io.utils cimport (
)
from cudf._lib.table cimport Table

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from cudf._lib.types cimport underlying_type_t_type_id

import numpy as np

from cudf._lib.utils cimport data_from_unique_ptr, get_column_names

from cudf._lib.utils import _index_level_name, generate_pandas_metadata
from cudf._lib.utils import generate_pandas_metadata


cpdef read_raw_orc_statistics(filepath_or_buffer):
Expand Down Expand Up @@ -97,7 +94,9 @@ cpdef read_orc(object filepaths_or_buffers,
if timestamp_type is None else
<type_id>(
<underlying_type_t_type_id> (
np_to_cudf_types[cudf.dtype(timestamp_type)]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
cudf.dtype(timestamp_type)
]
)
)
),
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/_lib/reduce.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import cudf
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype

from cudf._lib.column cimport Column
Expand All @@ -12,7 +11,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.types cimport data_type, type_id
from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move, pair
Expand Down Expand Up @@ -76,7 +75,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
scale = -c_result.get()[0].type().scale()
precision = _reduce_precision(col_dtype, reduction_op, len(incol))
py_result = DeviceScalar.from_unique_ptr(
move(c_result), dtype=Decimal64Dtype(precision, scale)
move(c_result), dtype=cudf.Decimal64Dtype(precision, scale)
)
else:
py_result = DeviceScalar.from_unique_ptr(move(c_result))
Expand Down Expand Up @@ -160,4 +159,4 @@ def _reduce_precision(dtype, op, nrows):
new_p = 2 * p + nrows
else:
raise NotImplementedError()
return max(min(new_p, Decimal64Dtype.MAX_PRECISION), 0)
return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0)
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ from libcpp.utility cimport move

import cudf
from cudf._lib.types import (
cudf_to_np_types,
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
datetime_unit_map,
duration_unit_map,
)
Expand Down Expand Up @@ -199,7 +199,7 @@ cdef class DeviceScalar:
)
else:
s._dtype = ListDtype(
cudf_to_np_types[
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
<underlying_type_t_type_id>(
(<list_scalar*>s.get_raw_ptr())[0]
.view().type().id()
Expand All @@ -210,7 +210,7 @@ cdef class DeviceScalar:
if dtype is not None:
s._dtype = dtype
else:
s._dtype = cudf_to_np_types[
s._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
<underlying_type_t_type_id>(cdtype.id())
]
return s
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from cudf._lib.scalar import as_device_scalar

from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
Expand Down Expand Up @@ -72,7 +72,7 @@ def string_to_floating(Column input_col, object out_type):
cdef unique_ptr[column] c_result
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[out_type]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
)
)
cdef data_type c_out_type = data_type(tid)
Expand Down Expand Up @@ -165,7 +165,7 @@ def string_to_integer(Column input_col, object out_type):
cdef unique_ptr[column] c_result
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[out_type]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
)
)
cdef data_type c_out_type = data_type(tid)
Expand Down Expand Up @@ -552,7 +552,7 @@ def timestamp2int(Column input_col, dtype, format):
cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[dtype]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
)
)
cdef data_type out_type = data_type(tid)
Expand Down Expand Up @@ -617,7 +617,7 @@ def timedelta2int(Column input_col, dtype, format):
cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[dtype]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
)
)
cdef data_type out_type = data_type(tid)
Expand Down Expand Up @@ -744,7 +744,9 @@ def htoi(Column input_col, **kwargs):
cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))]
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
kwargs.get('dtype', cudf.dtype("int64"))
]
)
)
cdef data_type c_out_type = data_type(tid)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import numpy as np

from cudf._lib.column cimport Column

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/_lib/transform.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ from cudf.core.buffer import Buffer

from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id

from cudf._lib.types import np_to_cudf_types
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
Expand Down Expand Up @@ -103,7 +103,9 @@ def transform(Column input, op):

try:
c_tid = <type_id> (
<underlying_type_t_type_id> np_to_cudf_types[np_dtype]
<underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
np_dtype
]
)
c_dtype = data_type(c_tid)

Expand Down Expand Up @@ -131,7 +133,9 @@ def masked_udf(Table incols, op, output_type):
cdef data_type c_dtype

c_tid = <type_id> (
<underlying_type_t_type_id> np_to_cudf_types[output_type]
<underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
output_type
]
)
c_dtype = data_type(c_tid)

Expand Down
Loading

0 comments on commit 263190a

Please sign in to comment.