From 0b373d999a356183f1bcdf9d86cd90c2d739d196 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Jul 2021 16:29:34 -0700 Subject: [PATCH 01/23] Move logic for any/all down from Series into Column. --- python/cudf/cudf/core/column/column.py | 33 ++++++++++++++++--- python/cudf/cudf/core/series.py | 44 ++++++++++++++------------ 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 48e6293c3f4..2e3eb412e4a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -172,11 +172,36 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: def _null_equals(self, other: ColumnBase) -> ColumnBase: return self.binary_operator("NULL_EQUALS", other) - def all(self) -> bool: - return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + if self.null_count == self.size: + return True + + # We don't want to call _process_for_reduction if skipna is False + # because all is not a reduction where the final output is also + # nullified by any nulls in the input. + result_col = self._process_for_reduction(True) if skipna else self + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + else: + return result_col - def any(self) -> bool: - return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. + if not skipna and self.has_nulls: + return True + elif skipna and self.null_count == self.size: + return False + + # We don't want to call _process_for_reduction if skipna is False + # because any is not a reduction where the final output is also + # nullified by any nulls in the input. 
+ result_col = self._process_for_reduction(True) if skipna else self + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + else: + return result_col def __sizeof__(self) -> int: n = 0 diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 413fcefc2bc..80e4a18f1e7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2761,13 +2761,15 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): "bool_only parameter is not implemented yet" ) - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return True - else: - result_series = self - return result_series._column.all() + return self._column.all(skipna=skipna) + + # if skipna: + # result_series = self.nans_to_nulls() + # if len(result_series) == result_series.null_count: + # return True + # else: + # result_series = self + # return result_series._column.all() def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): """ @@ -2809,21 +2811,23 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): raise NotImplementedError( "bool_only parameter is not implemented yet" ) - + # TODO: I think we can remove this, pandas no longer supports None. 
skipna = False if skipna is None else skipna - if skipna is False and self.has_nulls: - return True - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return False - - else: - result_series = self - - return result_series._column.any() + return self._column.any(skipna=skipna) + + # if skipna is False and self.has_nulls: + # return True + # + # if skipna: + # result_series = self.nans_to_nulls() + # if len(result_series) == result_series.null_count: + # return False + # + # else: + # result_series = self + # + # return result_series._column.any() def to_pandas(self, index=True, nullable=False, **kwargs): """ From 3a0e11eccd7368c32885f5709ecb17ee07480cd0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Jul 2021 16:42:43 -0700 Subject: [PATCH 02/23] Simplify _apply_support_method for improved performance. --- python/cudf/cudf/core/dataframe.py | 31 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bc068413efb..74eb88e701a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7138,26 +7138,37 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): **kwargs, ) + _support_axis_lookup = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } + def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) + axis = self._support_axis_lookup[axis] - if axis in (None, 0): + if axis == 0: + kwargs.pop("level", None) + kwargs.pop("numeric_only", None) result = [ - getattr(self[col], method)(*args, **kwargs) + getattr(self._data[col], method)(*args, **kwargs) for col in self._data.names ] if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] + result 
= self._from_data( + {col: result[i] for i, col in enumerate(self._data.names)}, + result[0].index, + ) else: - result = Series(result) - result = result.set_index(self._data.names) + result = Series._from_data( + {None: result}, as_index(self._data.names) + ) return result - elif axis == 1: + else: # for dask metadata compatibility skipna = kwargs.pop("skipna", None) if method not in _cupy_nan_methods_map and skipna not in ( From e6a1e2ad2b57260e0453cb9e2d2930db8ce9b8eb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Jul 2021 09:32:44 -0700 Subject: [PATCH 03/23] Alias DataFrame.product as DataFrame.prod. --- python/cudf/cudf/core/dataframe.py | 58 ++---------------------------- python/cudf/cudf/core/series.py | 1 + 2 files changed, 3 insertions(+), 56 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 74eb88e701a..4681d5a35f5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6480,62 +6480,8 @@ def product( **kwargs, ) - def prod( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.prod() - a 24 - b 5040 - dtype: int64 - """ - return self.product( - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) + # Alias for pandas compatibility. + prod = product def cummin(self, axis=None, skipna=True, *args, **kwargs): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 80e4a18f1e7..3a8eb6fff90 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4242,6 +4242,7 @@ def product( skipna=skipna, dtype=dtype, min_count=min_count ) + # Alias for pandas compatibility. prod = product def cummin(self, axis=None, skipna=True, *args, **kwargs): From a70eebd0b263a5d088caa859fe96b6e1d7b3a9d0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Jul 2021 10:12:46 -0700 Subject: [PATCH 04/23] Revert "Simplify _apply_support_method for improved performance." This reverts commit 61f09eb36445beabc7a01dc065cb9c2666e8ee3c. 
--- python/cudf/cudf/core/dataframe.py | 31 ++++++++++-------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4681d5a35f5..eebb39d63eb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7084,37 +7084,26 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): **kwargs, ) - _support_axis_lookup = { - 0: 0, - 1: 1, - None: 0, - "index": 0, - "columns": 1, - } - def _apply_support_method(self, method, axis=0, *args, **kwargs): - axis = self._support_axis_lookup[axis] + assert axis in (None, 0, 1) - if axis == 0: - kwargs.pop("level", None) - kwargs.pop("numeric_only", None) + if axis in (None, 0): result = [ - getattr(self._data[col], method)(*args, **kwargs) + getattr(self[col], method)(*args, **kwargs) for col in self._data.names ] if isinstance(result[0], Series): - result = self._from_data( - {col: result[i] for i, col in enumerate(self._data.names)}, - result[0].index, - ) + support_result = result + result = DataFrame(index=support_result[0].index) + for idx, col in enumerate(self._data.names): + result[col] = support_result[idx] else: - result = Series._from_data( - {None: result}, as_index(self._data.names) - ) + result = Series(result) + result = result.set_index(self._data.names) return result - else: + elif axis == 1: # for dask metadata compatibility skipna = kwargs.pop("skipna", None) if method not in _cupy_nan_methods_map and skipna not in ( From 3d19fece60f52ed7a0c3deabca77768e8ec4f743 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Jul 2021 10:44:16 -0700 Subject: [PATCH 05/23] Standardize reduction operation argument handling. 
--- python/cudf/cudf/core/series.py | 192 ++++++++++++++------------------ 1 file changed, 86 insertions(+), 106 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3a8eb6fff90..e70b60d976e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3994,6 +3994,21 @@ def applymap(self, udf, out_dtype=None): # # Stats # + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + def count(self, level=None, **kwargs): """ Return number of non-NA/null observations in the Series @@ -4056,19 +4071,15 @@ def min( >>> ser.min() 1 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.min(skipna=skipna, dtype=dtype) + return self._reduce( + "min", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + **kwargs, + ) def max( self, @@ -4106,19 +4117,15 @@ def max( >>> ser.max() 5 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.max(skipna=skipna, dtype=dtype) + return self._reduce( + "max", + axis=axis, + 
skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + **kwargs, + ) def sum( self, @@ -4165,20 +4172,15 @@ def sum( >>> ser.sum() 15 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.sum( - skipna=skipna, dtype=dtype, min_count=min_count + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, ) def product( @@ -4226,20 +4228,15 @@ def product( >>> ser.product() 120 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.product( - skipna=skipna, dtype=dtype, min_count=min_count + return self._reduce( + "product", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, ) # Alias for pandas compatibility. 
@@ -4507,19 +4504,14 @@ def mean( >>> ser.mean() 15.5 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.mean(skipna=skipna) + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) def std( self, @@ -4572,19 +4564,15 @@ def std( >>> series.std(ddof=2) 15.05545305418162 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.std(skipna=skipna, ddof=ddof) + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + ddof=ddof, + **kwargs, + ) def var( self, @@ -4635,22 +4623,18 @@ def var( >>> series.var() 33.7 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.var(skipna=skipna, ddof=ddof) + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + ddof=ddof, + **kwargs, + ) def sum_of_squares(self, dtype=None): - return self._column.sum_of_squares(dtype=dtype) + return self._reduce("sum_of_squares", dtype=dtype) def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs @@ -4688,18 +4672,14 @@ def median( >>> 
ser.median() 17.0 """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.median(skipna=skipna) + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) def mode(self, dropna=True): """ From b437f50a6d71b1ce862b87c5beae8801cf241fbe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Jul 2021 10:47:24 -0700 Subject: [PATCH 06/23] Replace asserts with exception handlers. --- python/cudf/cudf/core/series.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e70b60d976e..00799b2f932 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4324,8 +4324,6 @@ def cummax(self, axis=0, skipna=True, *args, **kwargs): 3 5 4 5 """ - assert axis in (None, 0) - if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -4937,7 +4935,11 @@ def corr(self, other, method="pearson", min_periods=None): -0.20454263717316112 """ - assert method in ("pearson",) and min_periods in (None,) + if method not in ("pearson",): + raise ValueError(f"Unknown method {method}") + + if min_periods not in (None,): + raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -5386,7 +5388,8 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ - assert stop > 0 + if not stop > 0: + raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None hashed_values = Series(self._hash(initial_hash)) From 
ad025b62d7c905be86eb35a10ac11c1a154748e9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Jul 2021 13:12:20 -0700 Subject: [PATCH 07/23] Split axis 0 and axis 1 support methods into separate functions. --- python/cudf/cudf/core/dataframe.py | 203 +++++++++++++++-------------- 1 file changed, 103 insertions(+), 100 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eebb39d63eb..7b9438cb67f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7084,116 +7084,119 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): **kwargs, ) - def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) - - if axis in (None, 0): - result = [ - getattr(self[col], method)(*args, **kwargs) - for col in self._data.names - ] - - if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] - else: - result = Series(result) - result = result.set_index(self._data.names) - return result + def _apply_support_method_axis_0(self, method, *args, **kwargs): + result = [ + getattr(self[col], method)(*args, **kwargs) + for col in self._data.names + ] - elif axis == 1: - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" - f" currently do not support `skipna=False`." 
- ) + if isinstance(result[0], Series): + support_result = result + result = DataFrame(index=support_result[0].index) + for idx, col in enumerate(self._data.names): + result[col] = support_result[idx] + else: + result = Series(result) + result = result.set_index(self._data.names) + return result - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." - ) + def _apply_support_method_axis_1(self, method, *args, **kwargs): + # for dask metadata compatibility + skipna = kwargs.pop("skipna", None) + if method not in _cupy_nan_methods_map and skipna not in ( + None, + True, + 1, + ): + raise NotImplementedError( + f"Row-wise operation to calculate '{method}'" + f" currently do not support `skipna=False`." + ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." - ) + level = kwargs.pop("level", None) + if level not in (None,): + raise NotImplementedError( + "Row-wise operations currently do not support `level`." + ) - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `min_count`." - ) + numeric_only = kwargs.pop("numeric_only", None) + if numeric_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " + "support `numeric_only=False`." + ) - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `bool_only`." - ) + min_count = kwargs.pop("min_count", None) + if min_count not in (None, 0): + raise NotImplementedError( + "Row-wise operations currently do not " "support `min_count`." 
+ ) - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna + bool_only = kwargs.pop("bool_only", None) + if bool_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " "support `bool_only`." ) - for col in prepared._data.names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if not is_datetime_dtype(common_dtype) - else np.dtype("float64") + + prepared, mask, common_dtype = self._prepare_for_rowwise_op( + method, skipna + ) + for col in prepared._data.names: + if prepared._data[col].nullable: + prepared._data[col] = ( + prepared._data[col] + .astype( + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] ) - .fillna(np.nan) + if not is_datetime_dtype(common_dtype) + else np.dtype("float64") ) - arr = cupy.asarray(prepared.as_gpu_matrix()) - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) - else None + .fillna(np.nan) ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series(result, index=self.index, dtype=result_dtype,) - else: - result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns - return result_df + arr = cupy.asarray(prepared.as_gpu_matrix()) + + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] + + result = getattr(cupy, method)(arr, axis=1, **kwargs) + + if result.ndim == 1: + type_coerced_methods = { 
+ "count", + "min", + "max", + "sum", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + } + result_dtype = ( + common_dtype + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) + else None + ) + result = column.as_column(result, dtype=result_dtype) + if mask is not None: + result = result.set_mask( + cudf._lib.transform.bools_to_mask(mask._column) + ) + return Series(result, index=self.index, dtype=result_dtype,) + else: + result_df = DataFrame(result).set_index(self.index) + result_df.columns = prepared.columns + return result_df + + def _apply_support_method(self, method, axis=0, *args, **kwargs): + assert axis in (None, 0, 1) + + if axis in (None, 0): + return self._apply_support_method_axis_0(method, *args, **kwargs) + elif axis == 1: + return self._apply_support_method_axis_1(method, *args, **kwargs) def _columns_view(self, columns): """ From 823f558132f2dc5a40ac6b3e2a84aed6532a1acc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Jul 2021 09:04:16 -0700 Subject: [PATCH 08/23] Switch reductions to use a new helper method. 
--- python/cudf/cudf/core/dataframe.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7b9438cb67f..65ca35b5e7d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6364,6 +6364,23 @@ def max( **kwargs, ) + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + assert axis in (None, 0, 1) + + if axis in (None, 0): + return self._apply_support_method_axis_0(op, **kwargs) + elif axis == 1: + return self._apply_support_method_axis_1(op, **kwargs) + def sum( self, axis=None, @@ -6411,7 +6428,7 @@ def sum( b 34 dtype: int64 """ - return self._apply_support_method( + return self._reduce( "sum", axis=axis, skipna=skipna, @@ -6469,7 +6486,7 @@ def product( b 5040 dtype: int64 """ - return self._apply_support_method( + return self._reduce( "prod", axis=axis, skipna=skipna, @@ -6667,7 +6684,7 @@ def mean( b 8.5 dtype: float64 """ - return self._apply_support_method( + return self._reduce( "mean", axis=axis, skipna=skipna, @@ -6822,7 +6839,7 @@ def std( dtype: float64 """ - return self._apply_support_method( + return self._reduce( "std", axis=axis, skipna=skipna, @@ -6877,7 +6894,7 @@ def var( b 1.666667 dtype: float64 """ - return self._apply_support_method( + return self._reduce( "var", axis=axis, skipna=skipna, From 54421895cfe540ed8b7fa20ef1a4e64269e8ea49 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Jul 2021 09:22:02 -0700 Subject: [PATCH 09/23] Reimplement axis 0 reductions as column rather than Series ops. 
--- python/cudf/cudf/core/dataframe.py | 46 ++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 65ca35b5e7d..62246e3cc66 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6263,7 +6263,8 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Single 5 dtype: int64 """ - if axis not in (0, "index", None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") return self._apply_support_method( @@ -6310,7 +6311,7 @@ def min( b 7 dtype: int64 """ - return self._apply_support_method( + return self._reduce( "min", axis=axis, skipna=skipna, @@ -6355,7 +6356,7 @@ def max( b 10 dtype: int64 """ - return self._apply_support_method( + return self._reduce( "max", axis=axis, skipna=skipna, @@ -6364,6 +6365,23 @@ def max( **kwargs, ) + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } + + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + raise ValueError( + "Invalid axis argument, must be 0, 1, 'index', 'columns'." 
+ ) + def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -6374,10 +6392,17 @@ def _reduce( raise NotImplementedError( "numeric_only parameter is not implemented yet" ) - assert axis in (None, 0, 1) + axis = self._get_axis_from_axis_arg(axis) + + if axis == 0: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] - if axis in (None, 0): - return self._apply_support_method_axis_0(op, **kwargs) + return Series._from_data( + {None: result}, as_index(self._data.names) + ) elif axis == 1: return self._apply_support_method_axis_1(op, **kwargs) @@ -6486,8 +6511,11 @@ def product( b 5040 dtype: int64 """ + axis = self._get_axis_from_axis_arg(axis) return self._reduce( - "prod", + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", axis=axis, skipna=skipna, dtype=dtype, @@ -7208,9 +7236,9 @@ def _apply_support_method_axis_1(self, method, *args, **kwargs): return result_df def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) + axis = self._get_axis_from_axis_arg(axis) - if axis in (None, 0): + if axis == 0: return self._apply_support_method_axis_0(method, *args, **kwargs) elif axis == 1: return self._apply_support_method_axis_1(method, *args, **kwargs) From 30781d89f17e24592c1e04edbf5d478e59b27860 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Jul 2021 10:03:29 -0700 Subject: [PATCH 10/23] Move most reductions up to the Frame level. 
--- python/cudf/cudf/core/dataframe.py | 367 ---------------------------- python/cudf/cudf/core/frame.py | 373 +++++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 364 ---------------------------- 3 files changed, 373 insertions(+), 731 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 62246e3cc66..246f9ca2b29 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6275,96 +6275,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): **kwargs, ) - def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._reduce( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. 
- level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._reduce( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - _SUPPORT_AXIS_LOOKUP = { 0: 0, 1: 1, @@ -6406,128 +6316,6 @@ def _reduce( elif axis == 1: return self._apply_support_method_axis_1(op, **kwargs) - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - axis = self._get_axis_from_axis_arg(axis) - return self._reduce( - # cuDF columns use "product" as the op name, but cupy uses "prod" - # and we need cupy if axis == 1. - "product" if axis == 0 else "prod", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - # Alias for pandas compatibility. - prod = product - def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the DataFrame. 
@@ -6677,50 +6465,6 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "cumprod", axis=axis, skipna=skipna, *args, **kwargs ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -6821,117 +6565,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. 
The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._reduce( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._reduce( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 14b8ebe801f..7665446461a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3604,6 +3604,379 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + # Reductions + def _reduce(self, *args, **kwargs): + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." + ) + + def min( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the minimum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """ + return self._reduce( + "min", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def max( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the maximum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """ + return self._reduce( + "max", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def sum( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. 
This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + def product( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + """ + axis = self._get_axis_from_axis_arg(axis) + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. 
+ prod = product + + def mean( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. Not implemented for + Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. + + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. 
+ + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 00799b2f932..33b7c18b48d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4035,213 +4035,6 @@ def count(self, level=None, **kwargs): return self.valid_count - def min( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the minimum of the values in the Series. 
- - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.min() - 1 - """ - return self._reduce( - "min", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the maximum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.max() - 5 - """ - return self._reduce( - "max", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.sum() - 15 - """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.product() - 120 - """ - return self._reduce( - "product", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - # Alias for pandas compatibility. - prod = product - def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series. @@ -4474,163 +4267,6 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): index=self.index, ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values in the series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser.mean() - 15.5 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the Series. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 10, 20, 30, 40]) - >>> series - 0 10 - 1 10 - 2 20 - 3 30 - 4 40 - dtype: int64 - >>> series.std() - 13.038404810405298 - >>> series.std(ddof=2) - 15.05545305418162 - """ - return self._reduce( - "std", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - ddof=ddof, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the Series. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0, 1]) - >>> series - 0 10 - 1 11 - 2 12 - 3 0 - 4 1 - dtype: int64 - >>> series.var() - 33.7 - """ - return self._reduce( - "var", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - ddof=ddof, - **kwargs, - ) - def sum_of_squares(self, dtype=None): return self._reduce("sum_of_squares", dtype=dtype) From e2e37c19442286134a61b222c450fb5cdcb7b192 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Jul 2021 10:27:03 -0700 Subject: [PATCH 11/23] Fix behavior of bool_only to not fail for bool_only=False for DataFrame any/all and clean up Series any/all implementations. --- python/cudf/cudf/core/dataframe.py | 38 +++--------------- python/cudf/cudf/core/series.py | 49 ++++-------------------- python/cudf/cudf/tests/test_dataframe.py | 4 -- 3 files changed, 14 insertions(+), 77 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 246f9ca2b29..6f9fadb6203 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6695,22 +6695,9 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): b False dtype: bool """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + obj = self.select_dtypes(include="bool") if bool_only else self + return obj._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, ) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): @@ -6744,22 +6731,9 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): b True 
dtype: bool """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + obj = self.select_dtypes(include="bool") if bool_only else self + return obj._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, ) def _apply_support_method_axis_0(self, method, *args, **kwargs): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 33b7c18b48d..198ab9fff94 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2750,26 +2750,13 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): True """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." 
) - - return self._column.all(skipna=skipna) - - # if skipna: - # result_series = self.nans_to_nulls() - # if len(result_series) == result_series.null_count: - # return True - # else: - # result_series = self - # return result_series._column.all() + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): """ @@ -2801,33 +2788,13 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): True """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - # TODO: I think we can remove this, pandas no longer supports None. - skipna = False if skipna is None else skipna - - return self._column.any(skipna=skipna) - - # if skipna is False and self.has_nulls: - # return True - # - # if skipna: - # result_series = self.nans_to_nulls() - # if len(result_series) == result_series.null_count: - # return False - # - # else: - # result_series = self - # - # return result_series._column.any() + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) def to_pandas(self, index=True, nullable=False, **kwargs): """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 21683d4bdd0..8f42f5179e8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3423,8 +3423,6 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(bool_only=False) with pytest.raises(NotImplementedError): gdata.all(level="a") @@ -3484,8 +3482,6 @@ def test_any(data, axis): expected = 
pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(bool_only=False) with pytest.raises(NotImplementedError): gdata.any(level="a") From f9d68f431e82442473afaf97fed6dfb995ec9808 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Jul 2021 10:37:17 -0700 Subject: [PATCH 12/23] Move any and all into Frame. --- python/cudf/cudf/core/dataframe.py | 68 +---------------------------- python/cudf/cudf/core/frame.py | 70 ++++++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 66 +--------------------------- 3 files changed, 74 insertions(+), 130 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6f9fadb6203..f65042baddb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6665,76 +6665,12 @@ def skew( ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - """ obj = self.select_dtypes(include="bool") if bool_only else self - return obj._reduce( - "all", axis=axis, skipna=skipna, level=level, **kwargs, - ) + return super(DataFrame, obj).all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in DataFrame. 
- - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - """ obj = self.select_dtypes(include="bool") if bool_only else self - return obj._reduce( - "any", axis=axis, skipna=skipna, level=level, **kwargs, - ) + return super(DataFrame, obj).any(axis, skipna, level, **kwargs) def _apply_support_method_axis_0(self, method, *args, **kwargs): result = [ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7665446461a..d16b52892ea 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3977,6 +3977,76 @@ def var( **kwargs, ) + def all(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether all elements are True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be True, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.all() + a True + b False + dtype: bool + """ + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def any(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether any elements is True in DataFrame. 
+ + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be False, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.any() + a True + b True + dtype: bool + """ + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 198ab9fff94..1a0de243853 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2721,80 +2721,18 @@ def nans_to_nulls(self): return self._copy_construct(data=result_col) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.all() - True - """ - if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." 
) - return self._reduce( - "all", axis=axis, skipna=skipna, level=level, **kwargs, - ) + return super().all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.any() - True - """ - if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." ) - return self._reduce( - "any", axis=axis, skipna=skipna, level=level, **kwargs, - ) + return super().any(axis, skipna, level, **kwargs) def to_pandas(self, index=True, nullable=False, **kwargs): """ From 8b51959c64af776d99df1bee8c82cc362f53bdfc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Jul 2021 15:29:18 -0700 Subject: [PATCH 13/23] Optimize DataFrame-Series binop. --- python/cudf/cudf/core/dataframe.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f65042baddb..e788aa69d65 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1463,9 +1463,20 @@ def _binaryop( None, ) elif isinstance(rhs, Series): - # Note: This logic will need updating if any of the user-facing + # Note: This branch will need updating if any of the user-facing # binop methods (e.g. DataFrame.add) ever support axis=0/rows. 
- right_dict = dict(zip(rhs.index.values_host, rhs.values_host)) + + # This dict comprehension using get_element is an optimization to + # avoid an expensive call to rhs.values_host. Ultimately both + # result in D2H copies (the Scalar here is eventually converted in + # column.normalize_binop_value) but this approach is still + # significantly faster. Unfortunately there is no easy way to + # eliminate the index.values_host call. + right_dict = { + k: cudf.Scalar(libcudf.copying.get_element(rhs._column, i)) + for i, k in enumerate(rhs.index.values_host) + } + left_cols = lhs._column_names # mypy thinks lhs._column_names is a List rather than a Tuple, so # we have to ignore the type check. From 25f1233585b101ba6b30a38b07d671390cd334a0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Jul 2021 15:39:15 -0700 Subject: [PATCH 14/23] Move sum_of_squares to Frame. --- python/cudf/cudf/core/frame.py | 23 +++++++++++++++++++++++ python/cudf/cudf/core/series.py | 3 --- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d16b52892ea..d2e2d3d4233 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4047,6 +4047,29 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + def sum_of_squares(self, dtype=None): + """Return the sum of squares of values. + + Parameters + ---------- + dtype: data type + Data type to cast the result to. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """ + return self._reduce("sum_of_squares", dtype=dtype) + class SingleColumnFrame(Frame): """A one-dimensional frame. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 1a0de243853..13d6137b799 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4172,9 +4172,6 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): index=self.index, ) - def sum_of_squares(self, dtype=None): - return self._reduce("sum_of_squares", dtype=dtype) - def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): From 57b4d0cc901d342abca5b9e2691f335846cf3656 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Jul 2021 16:09:49 -0700 Subject: [PATCH 15/23] Move median to Frame and add tests of both median and sum_of_squares for DataFrame. --- python/cudf/cudf/core/frame.py | 44 ++++++++++++++++++++++ python/cudf/cudf/core/series.py | 45 ----------------------- python/cudf/cudf/tests/test_dataframe.py | 1 + python/cudf/cudf/tests/test_reductions.py | 7 +++- 4 files changed, 51 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d2e2d3d4233..92551775818 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4070,6 +4070,50 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + + skipna : bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only`. 
+ + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 13d6137b799..653607824bf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4172,51 +4172,6 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): index=self.index, ) - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - return self._reduce( - "median", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - def mode(self, dropna=True): """ Return the mode(s) of the dataset. 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8f42f5179e8..76d24dcd5d2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1847,6 +1847,7 @@ def gdf(pdf): lambda df, **kwargs: df.cumsum(**kwargs), lambda df, **kwargs: df.cumprod(**kwargs), lambda df, **kwargs: df.mean(**kwargs), + lambda df, **kwargs: df.median(**kwargs), lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.max(**kwargs), lambda df, **kwargs: df.std(ddof=1, **kwargs), diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 7cbc56f943c..2a45c75f6da 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -110,20 +110,25 @@ def test_sum_of_squares(dtype, nelem): dtype = np.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) + df = cudf.DataFrame(sr) got = sr.sum_of_squares() - # got = dtype(got) + got_df = df.sum_of_squares() expect = (data ** 2).sum() if np.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: np.testing.assert_array_almost_equal(expect, got) + np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) else: print("overflow, passing") else: np.testing.assert_approx_equal( expect, got, significant=accuracy_for_dtype[dtype] ) + np.testing.assert_approx_equal( + expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype] + ) @pytest.mark.parametrize( From 7ed91c78797f2cf46efb2a0b790787afff02486f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Jul 2021 16:13:54 -0700 Subject: [PATCH 16/23] Revert "Optimize DataFrame-Series binop." This reverts commit 034036fdc371576d8be1d7e49b753525fe1fc943. 
--- python/cudf/cudf/core/dataframe.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e788aa69d65..f65042baddb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1463,20 +1463,9 @@ def _binaryop( None, ) elif isinstance(rhs, Series): - # Note: This branch will need updating if any of the user-facing + # Note: This logic will need updating if any of the user-facing # binop methods (e.g. DataFrame.add) ever support axis=0/rows. - - # This dict comprehension using get_element is an optimization to - # avoid an expensive call to rhs.values_host. Ultimately both - # result in D2H copies (the Scalar here is eventually converted in - # column.normalize_binop_value) but this approach is still - # significantly faster. Unfortunately there is no easy way to - # eliminate the index.values_host call. - right_dict = { - k: cudf.Scalar(libcudf.copying.get_element(rhs._column, i)) - for i, k in enumerate(rhs.index.values_host) - } - + right_dict = dict(zip(rhs.index.values_host, rhs.values_host)) left_cols = lhs._column_names # mypy thinks lhs._column_names is a List rather than a Tuple, so # we have to ignore the type check. From 933f578817601c4d69c167653dbd191f6703e880 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 3 Aug 2021 16:56:10 -0700 Subject: [PATCH 17/23] Fix axis lookup logic. 
--- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 9 --------- python/cudf/cudf/core/frame.py | 17 +++++++++++++++++ python/cudf/cudf/core/series.py | 6 ++++++ 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2e3eb412e4a..57231d36f77 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -936,9 +936,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif np.issubdtype(cast(Any, dtype), np.timedelta64): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f65042baddb..8cdc6eebaee 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6283,15 +6283,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } - @classmethod - def _get_axis_from_axis_arg(cls, axis): - try: - return cls._SUPPORT_AXIS_LOOKUP[axis] - except KeyError: - raise ValueError( - "Invalid axis argument, must be 0, 1, 'index', 'columns'." 
- ) - def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 92551775818..7d407637f8d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3605,6 +3605,23 @@ def __abs__(self): return self._unaryop("abs") # Reductions + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + raise ValueError( + "Invalid axis argument, must be one of {}.".format( + ", ".join( + ( + ax + for ax in cls._SUPPORT_AXIS_LOOKUP.keys() + if ax is not None + ) + ) + ) + ) + def _reduce(self, *args, **kwargs): raise NotImplementedError( f"Reductions are not supported for objects of type {type(self)}." diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 653607824bf..cdd20b6862d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3899,6 +3899,12 @@ def applymap(self, udf, out_dtype=None): # # Stats # + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): From 3cd7903d968e82beb7b5cdbd8ac17273ab0b1221 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 10:45:17 -0700 Subject: [PATCH 18/23] Disable reductions for index types. --- python/cudf/cudf/core/index.py | 77 ++++------------------------ python/cudf/cudf/tests/test_index.py | 10 ---- 2 files changed, 9 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 97ee0948209..64ed4c6cee3 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -518,74 +518,6 @@ def gpu_values(self): """ return self._values.data_array_view - def min(self): - """ - Return the minimum value of the Index. - - Returns - ------- - scalar - Minimum value. 
- - See Also - -------- - cudf.core.index.Index.max : Return the maximum value in an Index. - cudf.core.series.Series.min : Return the minimum value in a Series. - cudf.core.dataframe.DataFrame.min : Return the minimum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.min() - 1 - """ - return self._values.min() - - def max(self): - """ - Return the maximum value of the Index. - - Returns - ------- - scalar - Maximum value. - - See Also - -------- - cudf.core.index.Index.min : Return the minimum value in an Index. - cudf.core.series.Series.max : Return the maximum value in a Series. - cudf.core.dataframe.DataFrame.max : Return the maximum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.max() - 3 - """ - return self._values.max() - - def sum(self): - """ - Return the sum of all values of the Index. - - Returns - ------- - scalar - Sum of all values. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.sum() - 6 - """ - return self._values.sum() - @classmethod def _concat(cls, objs): data = concat_columns([o._values for o in objs]) @@ -1362,6 +1294,15 @@ def from_pandas(cls, index, nan_as_null=None): ind.name = index.name return ind + def _reduce( + # self, op, axis=None, level=None, numeric_only=None, **kwargs, + self, + op, + *args, + **kwargs, + ): + raise AttributeError(f"{type(self)} object has no attribute {op}.") + @property def _copy_construct_defaults(self): return {"data": self._column, "name": self.name} diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f03454c479a..221b3e51914 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -124,16 +124,6 @@ def test_index_comparision(): assert rg[:-1].equals(gi[:-1]) -@pytest.mark.parametrize( - "func", [lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()] -) -def 
test_reductions(func): - x = np.asarray([4, 5, 6, 10]) - idx = Int64Index(np.asarray([4, 5, 6, 10])) - - assert func(x) == func(idx) - - def test_name(): idx = Int64Index(np.asarray([4, 5, 6, 10]), name="foo") assert idx.name == "foo" From 8336fb59a1a43319986238fd3be617b9a7373eee Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 5 Aug 2021 10:03:46 -0700 Subject: [PATCH 19/23] Enable reductions for Index types. --- python/cudf/cudf/core/frame.py | 21 +++++++++++++++++++++ python/cudf/cudf/core/index.py | 9 --------- python/cudf/cudf/core/series.py | 21 --------------------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7d407637f8d..4e182d73894 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4139,6 +4139,27 @@ class SingleColumnFrame(Frame): this class. """ + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 64ed4c6cee3..7d28d91c565 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1294,15 +1294,6 @@ def from_pandas(cls, index, nan_as_null=None): ind.name = index.name return ind - def _reduce( - # self, op, axis=None, level=None, numeric_only=None, **kwargs, - self, - op, - *args, - **kwargs, - ): - raise AttributeError(f"{type(self)} object has no attribute {op}.") - @property def _copy_construct_defaults(self): return {"data": 
self._column, "name": self.name} diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cdd20b6862d..438c6c76e16 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3899,27 +3899,6 @@ def applymap(self, udf, out_dtype=None): # # Stats # - _SUPPORT_AXIS_LOOKUP = { - 0: 0, - None: 0, - "index": 0, - } - - def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, - ): - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - return getattr(self._column, op)(**kwargs) - def count(self, level=None, **kwargs): """ Return number of non-NA/null observations in the Series From 76cdc13a7a7c6398decad07434e490c65c79f520 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 5 Aug 2021 10:08:28 -0700 Subject: [PATCH 20/23] Use f-string for exception. 
--- python/cudf/cudf/core/frame.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4e182d73894..6a976f54c2b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3610,17 +3610,14 @@ def _get_axis_from_axis_arg(cls, axis): try: return cls._SUPPORT_AXIS_LOOKUP[axis] except KeyError: - raise ValueError( - "Invalid axis argument, must be one of {}.".format( - ", ".join( - ( - ax - for ax in cls._SUPPORT_AXIS_LOOKUP.keys() - if ax is not None - ) - ) + valid_axes = ", ".join( + ( + ax + for ax in cls._SUPPORT_AXIS_LOOKUP.keys() + if ax is not None ) ) + raise ValueError(f"Invalid axis, must be one of {valid_axes}.") def _reduce(self, *args, **kwargs): raise NotImplementedError( From dd756a7331fbb363a497bd812eaf89e8666d720a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 5 Aug 2021 10:13:35 -0700 Subject: [PATCH 21/23] Fix assertion. --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 438c6c76e16..9871c8534c1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4416,7 +4416,7 @@ def corr(self, other, method="pearson", min_periods=None): if method not in ("pearson",): raise ValueError(f"Unknown method {method}") - if min_periods not in ("None",): + if min_periods not in (None,): raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: From 8ab25b5970e687a7630752ca29451253d7ef1a48 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 5 Aug 2021 10:37:12 -0700 Subject: [PATCH 22/23] Fix NaN handling for ColumnBase's any and all. 
--- python/cudf/cudf/core/column/column.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 57231d36f77..8aeaf08273f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -175,13 +175,11 @@ def _null_equals(self, other: ColumnBase) -> ColumnBase: def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column # is empty. - if self.null_count == self.size: + result_col = self.nans_to_nulls() if skipna else self + + if result_col.null_count == result_col.size: return True - # We don't want to call _process_for_reduction if skipna is False - # because all is not a reduction where the final output is also - # nullified by any nulls in the the input. - result_col = self._process_for_reduction(True) if skipna else self if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) else: @@ -189,15 +187,12 @@ def all(self, skipna: bool = True) -> bool: def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. - if not skipna and self.has_nulls: + result_col = self.nans_to_nulls() if skipna else self + if not skipna and result_col.has_nulls: return True - elif skipna and self.null_count == self.size: + elif skipna and result_col.null_count == result_col.size: return False - # We don't want to call _process_for_reduction if skipna is False - # because any is not a reduction where the final output is also - # nullified by any nulls in the the input. 
- result_col = self._process_for_reduction(True) if skipna else self if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) else: From 50f92c095efc2a1e71ff7eb3a0d1a83cceed9592 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 6 Aug 2021 15:35:35 -0700 Subject: [PATCH 23/23] Add tests for more index reductions. --- python/cudf/cudf/tests/test_index.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b85b22f97f8..38b924006bf 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -124,6 +124,25 @@ def test_index_comparision(): assert rg[:-1].equals(gi[:-1]) +@pytest.mark.parametrize( + "func", + [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.sum(), + lambda x: x.mean(), + lambda x: x.any(), + lambda x: x.all(), + lambda x: x.prod(), + ], +) +def test_reductions(func): + x = np.asarray([4, 5, 6, 10]) + idx = Int64Index(np.asarray([4, 5, 6, 10])) + + assert func(x) == func(idx) + + def test_name(): idx = Int64Index(np.asarray([4, 5, 6, 10]), name="foo") assert idx.name == "foo"