diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5239cf9d648..926dfaa4c86 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1878,6 +1878,7 @@ def _binaryop( fn: str, fill_value: Any = None, reflect: bool = False, + can_reindex: bool = False, *args, **kwargs, ): @@ -1898,14 +1899,17 @@ def _binaryop( for right, (name, left) in zip(rhs, lhs._data.items()) } elif isinstance(rhs, DataFrame): - if fn in cudf.utils.utils._EQUALITY_OPS: - if not lhs.columns.equals(rhs.columns) or not lhs.index.equals( - rhs.index - ): - raise ValueError( - "Can only compare identically-labeled " - "DataFrame objects" - ) + if ( + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and ( + not lhs.columns.equals(rhs.columns) + or not lhs.index.equals(rhs.index) + ) + ): + raise ValueError( + "Can only compare identically-labeled " "DataFrame objects" + ) lhs, rhs = _align_indices(lhs, rhs) @@ -6481,14 +6485,6 @@ def unstack(self, level=-1, fill_value=None): self, level=level, fill_value=fill_value ) - def equals(self, other): - if not isinstance(other, DataFrame): - return False - for self_name, other_name in zip(self._data.names, other._data.names): - if self_name != other_name: - return False - return super().equals(other) - def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6536,14 +6532,27 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) -def make_binop_func(op): +def make_binop_func(op, postprocess=None): + # This function is used to wrap binary operations in Frame with an + # appropriate API for DataFrame as required for pandas compatibility. The + # main effect is reordering and error-checking parameters in + # DataFrame-specific ways. 
The postprocess argument is a callable that may + # optionally be provided to modify the result of the binop if additional + # processing is needed for pandas compatibility. The callable must have the + # signature + # def postprocess(left, right, output) + # where left and right are the inputs to the binop and output is the result + # of calling the wrapped Frame binop. wrapped_func = getattr(Frame, op) @functools.wraps(wrapped_func) def wrapper(self, other, axis="columns", level=None, fill_value=None): if axis not in (1, "columns"): raise NotImplementedError("Only axis=1 supported at this time.") - return wrapped_func(self, other, axis, level, fill_value) + output = wrapped_func(self, other, axis, level, fill_value) + if postprocess is None: + return output + return postprocess(self, other, output) # functools.wraps copies module level attributes to `wrapper` and sets # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature @@ -6560,6 +6569,7 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None): return wrapper +# Wrap arithmetic Frame binop functions with the expected API for DataFrame. for binop in [ "add", "radd", @@ -6583,6 +6593,57 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None): setattr(DataFrame, binop, make_binop_func(binop)) +def _make_replacement_func(value): + # This function generates a postprocessing function suitable for use with + # make_binop_func that fills null columns with the desired fill value. + + def func(left, right, output): + # This function may be passed as the postprocess argument to + # make_binop_func. Columns that are only present in one of the inputs + # will be null in the output. This function postprocesses the output to + # replace those nulls with some desired output. 
+ if isinstance(right, Series): + uncommon_columns = set(left._column_names) ^ set(right.index) + elif isinstance(right, DataFrame): + uncommon_columns = set(left._column_names) ^ set( + right._column_names + ) + elif _is_scalar_or_zero_d_array(right): + for name, col in output._data.items(): + output._data[name] = col.fillna(value) + return output + else: + return output + + for name in uncommon_columns: + output._data[name] = column.full( + size=len(output), fill_value=value, dtype="bool" + ) + return output + + return func + + +# The ne comparator needs special postprocessing because elements that are +# missing in one operand should be treated as null and result in True in the +# output rather than simply propagating nulls. +DataFrame.ne = make_binop_func("ne", _make_replacement_func(True)) + + +# All other comparison operators need to return False when one of the operands +# is missing in the input. +for binop in [ + "eq", + "lt", + "le", + "gt", + "ge", +]: + setattr( + DataFrame, binop, make_binop_func(binop, _make_replacement_func(False)) + ) + + def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0b895460410..b14a4d91831 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -463,6 +463,10 @@ def equals(self, other, **kwargs): if other is None or len(self) != len(other): return False + for self_name, other_name in zip(self._data.names, other._data.names): + if self_name != other_name: + return False + # check data: for self_col, other_col in zip( self._data.values(), other._data.values() @@ -6250,6 +6254,456 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + def eq(self, other, axis="columns", level=None, fill_value=None): + """Equal to, element-wise (binary operator eq). 
+ + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.eq(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.eq(right, fill_value=7) + a b c d + 0 True True True False + 1 True True False False + 2 True True False False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.eq(b, fill_value=2) + a False + b False + c False + d False + e + f False + g False + dtype: bool + """ + return self._binaryop( + other=other, fn="eq", fill_value=fill_value, can_reindex=True + ) + + def ne(self, other, axis="columns", level=None, fill_value=None): + """Not equal to, element-wise (binary operator ne). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... 
) + >>> left.ne(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.ne(right, fill_value=7) + a b c d + 0 False False False True + 1 False False True True + 2 False False True True + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.ne(b, fill_value=2) + a True + b True + c True + d True + e + f True + g True + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="ne", fill_value=fill_value, can_reindex=True + ) + + def lt(self, other, axis="columns", level=None, fill_value=None): + """Less than, element-wise (binary operator lt). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.lt(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.lt(right, fill_value=7) + a b c d + 0 False False False True + 1 False False False True + 2 False False False True + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... 
index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.lt(b, fill_value=-10) + a False + b True + c False + d False + e + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="lt", fill_value=fill_value, can_reindex=True + ) + + def le(self, other, axis="columns", level=None, fill_value=None): + """Less than or equal, element-wise (binary operator le). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.le(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.le(right, fill_value=7) + a b c d + 0 True True True True + 1 True True False True + 2 True True False True + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.le(b, fill_value=-10) + a False + b True + c False + d False + e + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="le", fill_value=fill_value, can_reindex=True + ) + + def gt(self, other, axis="columns", level=None, fill_value=None): + """Greater than, element-wise (binary operator gt). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. 
If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.gt(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.gt(right, fill_value=7) + a b c d + 0 False False False False + 1 False False True False + 2 False False True False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.gt(b) + a True + b False + c True + d False + e False + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="gt", fill_value=fill_value, can_reindex=True + ) + + def ge(self, other, axis="columns", level=None, fill_value=None): + """Greater than or equal, element-wise (binary operator ge). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... 
) + >>> left.ge(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.ge(right, fill_value=7) + a b c d + 0 True True True False + 1 True True True False + 2 True True True False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.ge(b) + a True + b False + c True + d False + e False + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="ge", fill_value=fill_value, can_reindex=True + ) + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2a9adf65283..c990961c7a2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1358,7 +1358,7 @@ def _binaryop( ) ): raise ValueError( - "Can only compare identically-labeled " "Series objects" + "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) else: @@ -1385,318 +1385,6 @@ def logical_or(self, other): def logical_not(self): return self._unaryop("not") - def eq(self, other, fill_value=None, axis=0): - """Equal to of series and other, element-wise - (binary operator eq). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. 
- - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.eq(b, fill_value=2) - a False - b False - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="eq", fill_value=fill_value, can_reindex=True - ) - - def ne(self, other, fill_value=None, axis=0): - """Not equal to of series and other, element-wise - (binary operator ne). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.ne(b, fill_value=2) - a True - b True - c True - d True - e - f True - g True - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="ne", fill_value=fill_value, can_reindex=True - ) - - def lt(self, other, fill_value=None, axis=0): - """Less than of series and other, element-wise - (binary operator lt). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. 
If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.lt(b, fill_value=-10) - a False - b True - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="lt", fill_value=fill_value, can_reindex=True - ) - - def le(self, other, fill_value=None, axis=0): - """Less than or equal to of series and other, element-wise - (binary operator le). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.le(b, fill_value=-10) - a False - b True - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="le", fill_value=fill_value, can_reindex=True - ) - - def gt(self, other, fill_value=None, axis=0): - """Greater than of series and other, element-wise - (binary operator gt). 
- - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.gt(b) - a True - b False - c True - d False - e False - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="gt", fill_value=fill_value, can_reindex=True - ) - - def ge(self, other, fill_value=None, axis=0): - """Greater than or equal to of series and other, element-wise - (binary operator ge). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. 
- - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.ge(b) - a True - b False - c True - d False - e False - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="ge", fill_value=fill_value, can_reindex=True - ) - @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): @@ -4221,6 +3909,10 @@ def pct_change( def make_binop_func(op): + # This function is used to wrap binary operations in Frame with an + # appropriate API for Series as required for pandas compatibility. The + # main effect is reordering and error-checking parameters in + # Series-specific ways. wrapped_func = getattr(Frame, op) @functools.wraps(wrapped_func) @@ -4243,6 +3935,7 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): return wrapper +# Wrap all Frame binop functions with the expected API for Series. 
for binop in ( "add", "radd", @@ -4262,6 +3955,12 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): "div", "rtruediv", "rdiv", + "eq", + "ne", + "lt", + "le", + "gt", + "ge", ): setattr(Series, binop, make_binop_func(binop)) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 96281c139f2..228296de83d 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -833,6 +833,42 @@ def gen_df(): utils.assert_eq(expect, got) +@pytest.mark.parametrize("func", _operators_comparison) +@pytest.mark.parametrize("nulls", _nulls) +@pytest.mark.parametrize("other", ["df", "scalar"]) +def test_logical_operator_func_dataframe(func, nulls, other): + np.random.seed(0) + num_rows = 100 + num_cols = 3 + + def gen_df(): + pdf = pd.DataFrame() + from string import ascii_lowercase + + cols = np.random.choice(num_cols + 5, num_cols, replace=False) + + for i in range(num_cols): + colname = ascii_lowercase[cols[i]] + data = utils.gen_rand("float64", num_rows) * 10000 + if nulls == "some": + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) + data[idx] = np.nan + pdf[colname] = data + return pdf + + pdf1 = gen_df() + pdf2 = gen_df() if other == "df" else 59.0 + gdf1 = cudf.DataFrame.from_pandas(pdf1) + gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0 + + got = getattr(gdf1, func)(gdf2) + expect = getattr(pdf1, func)(pdf2)[list(got._data)] + + utils.assert_eq(expect, got) + + @pytest.mark.parametrize("func", _operators_arithmetic + _operators_comparison) @pytest.mark.parametrize("rhs", [0, 1, 2, 128]) def test_binop_bool_uint(func, rhs):