From 89fdb6a762e4a64b9654ac2d6da4aabc83240841 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 14:07:06 -0800 Subject: [PATCH 01/10] Fix a comment. --- python/cudf/cudf/core/single_column_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..727f3095696 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): From 310029979a207573e2700684249c2cae3c58d2a1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 14:28:20 -0800 Subject: [PATCH 02/10] Leverage more binop machinery for ufuncs. --- python/cudf/cudf/core/series.py | 93 ++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 90ebeba5087..709c7747e54 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,6 @@ import pickle import warnings from collections import abc as abc -from itertools import repeat from numbers import Number from shutil import get_terminal_size from typing import Any, MutableMapping, Optional, Set, Union @@ -1021,10 +1020,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): op = f"__{'r' if reflect else ''}{op}__" # pandas bitwise operations return bools if indexes are misaligned. - # TODO: Generalize for other types of Frames if ( "bitwise" in fname - and isinstance(other, Series) + and isinstance(other, IndexedFrame) and not self.index.equals(other.index) ): return getattr(self, op)(other).astype(bool) @@ -1057,40 +1055,46 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): cupy_func = getattr(cupy, fname) if cupy_func: # Indices must be aligned before converting to arrays. - if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))): - inputs = _align_indices(inputs, allow_non_unique=True) - index = inputs[0].index + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) else: - index = self.index + inputs = {self.name: (self._column, None, False, None)} + index = self._index - cupy_inputs = [] mask = None - for inp in inputs: - # TODO: Generalize for other types of Frames - if isinstance(inp, Series) and inp.has_nulls: - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and of - # bitmasks. Once we expose cudf::detail::bitwise_and, then - # we can use that instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume that - # the end result propagates nulls via a bitwise and, so - # these elements are irrelevant. - inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - - def make_frame(arr): - return self.__class__._from_data( - {self.name: as_column(arr).set_mask(mask)}, index=index - ) - - if ufunc.nout > 1: - return tuple(make_frame(out) for out in cp_output) - return make_frame(cp_output) + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if isinstance(inp, column.ColumnBase) and inp.has_nulls(): + new_mask = as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cupy.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = as_column(out).set_mask(mask) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out return NotImplemented @@ -1342,7 +1346,7 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, other: Frame, fn: str, @@ -1376,9 +1380,24 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented From 814462f25ff9a66bc0177a5c4822c73c8450ef8f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 15:18:30 -0800 Subject: [PATCH 03/10] Disable another test that can't work reliably. --- python/cudf/cudf/tests/test_array_ufunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index a384ddecca6..31f0c3a6bcb 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -109,7 +109,7 @@ def test_ufunc_series(ufunc, has_nulls, indexed): @pytest.mark.parametrize("reflect", [True, False]) def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): fname = ufunc.__name__ - if fname in ("greater", "greater_equal") and has_nulls: + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: pytest.xfail( "The way cudf casts nans in arrays to nulls during binops with " "cudf objects is currently incompatible with pandas." From 1f7d77cd435be41461aec8875d7db0aa8d417bc0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 15:29:12 -0800 Subject: [PATCH 04/10] Rewrite unary inputs creation to work for multiple columns. --- python/cudf/cudf/core/series.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 709c7747e54..a4abd741369 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1059,7 +1059,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): other = inputs[self is inputs[0]] inputs, index = self._prep_for_binop(other, fname) else: - inputs = {self.name: (self._column, None, False, None)} + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } index = self._index mask = None From b0009f166bd364943ca800d6c05fcd13e81ca83d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 15:30:09 -0800 Subject: [PATCH 05/10] Separate binop prep for DataFrame. --- python/cudf/cudf/core/dataframe.py | 24 +++++++++++++++++++++--- python/cudf/cudf/core/series.py | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 371404ca477..57aa5f63cef 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1864,8 +1864,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1948,11 +1947,30 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a4abd741369..c110809cc28 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1351,7 +1351,7 @@ def __repr__(self): def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, From b89aa9714a93d7aba99e84bdcae464b3011a3745 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 15:34:23 -0800 Subject: [PATCH 06/10] Move __array_ufunc__ to IndexedFrame. --- python/cudf/cudf/core/dataframe.py | 8 -- python/cudf/cudf/core/indexed_frame.py | 148 +++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 143 ------------------------ 3 files changed, 148 insertions(+), 151 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 57aa5f63cef..e1dd5c15b30 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False): {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7788a5346c8..e1ff3984948 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1697,6 +1697,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c110809cc28..3aef4447a28 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -958,149 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. - if method != "__call__" or kwargs or ufunc.nin > 2: - return NotImplemented - - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } - - # First look for methods of the class. - fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. - ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" - - # pandas bitwise operations return bools if indexes are misaligned. - if ( - "bitwise" in fname - and isinstance(other, IndexedFrame) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cupy, fname) - if cupy_func: - # Indices must be aligned before converting to arrays. - if ufunc.nin == 2: - other = inputs[self is inputs[0]] - inputs, index = self._prep_for_binop(other, fname) - else: - inputs = { - name: (col, None, False, None) - for name, col in self._data.items() - } - index = self._index - - mask = None - data = [{} for _ in range(ufunc.nout)] - for name, (left, right, _, _) in inputs.items(): - cupy_inputs = [] - # TODO: I'm jumping through multiple hoops to get the unary - # behavior to match up with the binary. I should see if there - # are better patterns to employ here. - for inp in (left, right) if ufunc.nin == 2 else (left,): - if isinstance(inp, column.ColumnBase) and inp.has_nulls(): - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and - # of bitmasks. Once we expose - # cudf::detail::bitwise_and, then we can use that - # instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume - # that the end result propagates nulls via a bitwise - # and, so these elements are irrelevant. - inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - if ufunc.nout == 1: - cp_output = (cp_output,) - for i, out in enumerate(cp_output): - data[i][name] = as_column(out).set_mask(mask) - - out = tuple( - self.__class__._from_data(out, index=index) for out in data - ) - return out[0] if ufunc.nout == 1 else out - - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: From d52cd45fbd7db583fd646b8a4ebd2b82454c051a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 3 Feb 2022 14:53:02 -0800 Subject: [PATCH 07/10] Get DataFrame ufuncs working without testing nulls or indexes. --- python/cudf/cudf/core/dataframe.py | 1 + python/cudf/cudf/tests/test_array_ufunc.py | 88 ++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e1dd5c15b30..aab3b989047 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1876,6 +1876,7 @@ def _prep_for_binop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). operands = { diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 31f0c3a6bcb..a8e717c0179 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -181,3 +181,91 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [False]) +@pytest.mark.parametrize("indexed", [False]) +# @pytest.mark.parametrize("has_nulls", [True, False]) +# @pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + if indexed and fname in ( + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + ): + pytest.skip("Comparison operators do not support misaligned indexes.") + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + cols = [arg["foo"] for arg in args] + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for col in cols: + set_random_null_mask_inplace(col) + pandas_args = [arg["foo"].fillna(0) for arg in args] + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.DataFrame._align_indices(args, allow_non_unique=True) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise From 7ca61033e7bb6f67b231a9506895d22c52687b81 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 16:11:09 -0800 Subject: [PATCH 08/10] Fix test to handle nullability correctly and verify that nullable DataFrames work. --- python/cudf/cudf/tests/test_array_ufunc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index a8e717c0179..6a50d8bde73 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -185,9 +185,8 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # Skip matmul since it requires aligned shapes. @pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) -@pytest.mark.parametrize("has_nulls", [False]) @pytest.mark.parametrize("indexed", [False]) -# @pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("has_nulls", [True, False]) # @pytest.mark.parametrize("indexed", [True, False]) def test_ufunc_dataframe(ufunc, has_nulls, indexed): # Note: This test assumes that all ufuncs are unary or binary. @@ -214,14 +213,15 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): for _ in range(ufunc.nin) ] - cols = [arg["foo"] for arg in args] if has_nulls: # Converting nullable integer cudf.Series to pandas will produce a # float pd.Series, so instead we replace nulls with an arbitrary # integer value, precompute the mask, and then reapply it afterwards. - for col in cols: - set_random_null_mask_inplace(col) - pandas_args = [arg["foo"].fillna(0) for arg in args] + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) # Note: Different indexes must be aligned before the mask is computed. # This requires using an internal function (_align_indices), and that From e9a266b22f3e6eb0291211d0944fdeaccf8be8ac Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 16:40:19 -0800 Subject: [PATCH 09/10] Track down failures in indexing and document the underlying pandas limitations. --- python/cudf/cudf/tests/test_array_ufunc.py | 35 ++++++++++++++-------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 6a50d8bde73..f1aad1af9e6 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -185,26 +185,37 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # Skip matmul since it requires aligned shapes. @pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) -@pytest.mark.parametrize("indexed", [False]) @pytest.mark.parametrize("has_nulls", [True, False]) -# @pytest.mark.parametrize("indexed", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) def test_ufunc_dataframe(ufunc, has_nulls, indexed): # Note: This test assumes that all ufuncs are unary or binary. fname = ufunc.__name__ - if indexed and fname in ( - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - "equal", - ): - pytest.skip("Comparison operators do not support misaligned indexes.") + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. + if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 N = 100 # Avoid zeros in either array to skip division by 0 errors. Also limit the # scale to avoid issues with overflow, etc. We use ints because some # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. pandas_args = args = [ cudf.DataFrame( {"foo": cp.random.randint(low=1, high=10, size=N)}, @@ -227,7 +238,7 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): # This requires using an internal function (_align_indices), and that # is unlikely to change for the foreseeable future. aligned = ( - cudf.DataFrame._align_indices(args, allow_non_unique=True) + cudf.core.dataframe._align_indices(*args) if indexed and ufunc.nin == 2 else args ) From 4928004e3172dd2676b94d386242a943aae92a0f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Feb 2022 14:34:18 -0800 Subject: [PATCH 10/10] Fix copyright. --- python/cudf/cudf/core/single_column_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 727f3095696..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations