From 493ed175f74eacd493e8a0cde111bb255cde39da Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Jun 2024 12:30:00 -0700 Subject: [PATCH 1/2] Preserve columns in more operations --- python/cudf/cudf/core/dataframe.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 121 ++++++++++------------- python/cudf/cudf/core/window/rolling.py | 48 +++------ python/cudf/cudf/tests/test_dataframe.py | 12 ++- 4 files changed, 79 insertions(+), 105 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9307267b227..e1b6cc45dd3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: self._data = ColumnAccessor( data=dict(zip(other.names, self._data.columns)), multiindex=other.multiindex, + rangeindex=other.rangeindex, level_names=other.level_names, label_dtype=other.label_dtype, verify=False, @@ -7534,7 +7535,7 @@ def _sample_axis_1( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: abc.Iterable[str], + column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, *, override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ecfcec15337..c7bd32b5bfb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -40,8 +40,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_decimal_dtype, is_dict_like, is_list_like, is_scalar, @@ -346,6 +344,8 @@ def _from_columns_like_self( index.names = index_names else: index.name = index_names[0] + # else: + # index = self.index data = dict(zip(column_names, data_columns)) frame = self.__class__._from_data(data) @@ -372,7 +372,6 @@ def _mimic_inplace( self._index = result.index return super()._mimic_inplace(result, inplace) - # Scans @_cudf_nvtx_annotate def _scan(self, op, axis=None, skipna=True): """ @@ -417,8 +416,8 @@ def _scan(self, op, axis=None, skipna=True): cast_to_int = op in ("cumsum", "cumprod") skipna = True if skipna is None else skipna - results = {} - for name, col in self._data.items(): + results = [] + for col in self._columns: if skipna: try: result_col = col.nans_to_nulls() @@ -432,19 +431,12 @@ def _scan(self, op, axis=None, skipna=True): else: result_col = col - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): + if cast_to_int and result_col.dtype.kind in "uib": # For reductions that accumulate a value (e.g. sum, not max) # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) - results[name] = getattr(result_col, op)() - return self._from_data(results, self.index) + results.append(getattr(result_col, op)()) + return self._from_columns_like_self(results) def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the @@ -883,7 +875,6 @@ def replace( FutureWarning, ) if not (to_replace is None and value is no_default): - copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -893,10 +884,10 @@ def replace( value=value, columns_dtype_map=dict(self._dtypes), ) - + copy_data = [] for name, col in self._data.items(): try: - copy_data[name] = col.find_and_replace( + replaced = col.find_and_replace( to_replace_per_column[name], replacements_per_column[name], all_na_per_column[name], @@ -909,11 +900,11 @@ def replace( # that exists in `copy_data`. # ii. There is an OverflowError while trying to cast # `to_replace_per_column` to `replacements_per_column`. - copy_data[name] = col.copy(deep=True) + replaced = col.copy(deep=True) + copy_data.append(replaced) + result = self._from_columns_like_self(copy_data) else: - copy_data = self._data.copy(deep=True) - - result = self._from_data(copy_data, self.index) + result = self.copy() return self._mimic_inplace(result, inplace=inplace) @@ -1034,12 +1025,11 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ): lower[0], upper[0] = upper[0], lower[0] - data = { - name: col.clip(lower[i], upper[i]) - for i, (name, col) in enumerate(self._data.items()) - } - output = self._from_data(data, self.index) - output._copy_type_metadata(self, include_index=False) + data = ( + col.clip(low, high) + for col, low, high in zip(self._columns, lower, upper) + ) + output = self._from_columns_like_self(data) return self._mimic_inplace(output, inplace=inplace) @_cudf_nvtx_annotate @@ -1919,7 +1909,7 @@ def nans_to_nulls(self): col.nans_to_nulls() if isinstance(col, cudf.core.column.NumericalColumn) else col.copy() - for col in self._data.columns + for col in self._columns ) return self._from_data_like_self( self._data._from_columns_like_self(result) @@ -2031,8 +2021,8 @@ def interpolate( ) interpolator = cudf.core.algorithms.get_column_interpolator(method) - columns = {} - for colname, col in data._data.items(): + columns = [] + for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): warnings.warn( f"{type(self).__name__}.interpolate with object dtype is " @@ -2043,9 +2033,10 @@ def interpolate( col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - columns[colname] = interpolator(col, index=data.index) + columns.append(interpolator(col, index=data.index)) - result = self._from_data(columns, index=data.index) + result = self._from_columns_like_self(columns) + result.index = data.index return ( result @@ -2072,9 +2063,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): data_columns = ( col.shift(periods, fill_value) for col in self._columns ) - return self.__class__._from_data( - zip(self._column_names, data_columns), self.index - ) + return self._from_columns_like_self(data_columns) @_cudf_nvtx_annotate def truncate(self, before=None, after=None, axis=0, copy=True): @@ -3014,8 +3003,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: self._column_names, None if has_range_index or not keep_index else self.index.names, ) - result._data.label_dtype = self._data.label_dtype - result._data.rangeindex = self._data.rangeindex if keep_index and has_range_index: result.index = self.index[start:stop] @@ -3564,11 +3551,6 @@ def sort_values( ), keep_index=not ignore_index, ) - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): - out.columns = self._data.to_pandas_index() return out def _n_largest_or_smallest( @@ -3662,14 +3644,10 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data( - data=result._data, index=result.index - ) - result._data.multiindex = self._data.multiindex - result._data._level_names = self._data._level_names - result.index.names = self.index.names - - return result + out = self._from_columns_like_self(result._columns) + out.index = result.index + out.index.names = self.index.names + return out @_cudf_nvtx_annotate def _reindex( @@ -3901,25 +3879,13 @@ def round(self, decimals=0, how="half_even"): "decimals must be an integer, a dict-like or a Series" ) - cols = { - name: col.round(decimals[name], how=how) - if ( - name in decimals - and _is_non_decimal_numeric_dtype(col.dtype) - and not is_bool_dtype(col.dtype) - ) + cols = ( + col.round(decimals[name], how=how) + if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) for name, col in self._data.items() - } - - return self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=self.index, ) + return self._from_columns_like_self(cols) def resample( self, @@ -6249,6 +6215,8 @@ def rank( f"axis={axis} is not yet supported in rank" ) + num_cols = self._num_columns + dropped_cols = False source = self if numeric_only: if isinstance( @@ -6266,15 +6234,26 @@ def rank( source = self._get_columns_by_label(numeric_cols) if source.empty: return source.astype("float64") + elif source._num_columns != num_cols: + dropped_cols = True result_columns = libcudf.sort.rank_columns( [*source._columns], method_enum, na_option, ascending, pct ) - return self.__class__._from_data( - dict(zip(source._column_names, result_columns)), - index=source.index, - ).astype(np.float64) + if dropped_cols: + result = type(source)._from_data( + ColumnAccessor( + dict(zip(source._column_names, result_columns)), + multiindex=self._data.multiindex, + level_names=self._data.level_names, + label_dtype=self._data.label_dtype, + ), + index=source.index, + ) + else: + result = source._from_columns_like_self(result_columns) + return result.astype(np.float64) def convert_dtypes( self, diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2037b1682db..2208cdd431e 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION -import itertools - import numba import pandas as pd from pandas.api.indexers import BaseIndexer @@ -251,27 +249,11 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg_dataframe(self, df, agg_name): - return cudf.DataFrame._from_data( - { - col_name: self._apply_agg_column(col, agg_name) - for col_name, col in df._data.items() - }, - index=df.index, - ) - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return cudf.Series._from_data( - { - self.obj.name: self._apply_agg_column( - self.obj._column, agg_name - ) - }, - index=self.obj.index, - ) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_columns_like_self(applied) def _reduce( self, @@ -533,17 +515,19 @@ def _window_to_window_sizes(self, window): ) def _apply_agg(self, agg_name): - index = cudf.MultiIndex.from_frame( - cudf.DataFrame( - { - key: value - for key, value in itertools.chain( - self._group_keys._data.items(), - self.obj.index._data.items(), - ) - } - ) + index = cudf.MultiIndex._from_data( + {**self._group_keys._data, **self.obj.index._data} ) + # cudf.DataFrame( + # { + # key: value + # for key, value in itertools.chain( + # self._group_keys._data.items(), + # self.obj.index._data.items(), + # ) + # } + # ) + # ) result = super()._apply_agg(agg_name) result.index = index diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d76d5eb8065..98e9f9881c7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10980,7 +10980,7 @@ def test_squeeze(axis, data): assert_eq(result, expected) -@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)]) +@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) @pytest.mark.parametrize( "operation", [ @@ -10991,6 +10991,16 @@ def test_squeeze(axis, data): lambda df: abs(df), lambda df: -df, lambda df: ~df, + lambda df: df.cumsum(), + lambda df: df.replace(1, 2), + lambda df: df.replace(10, 20), + lambda df: df.clip(0, 10), + lambda df: df.rolling(1).mean(), + lambda df: df.interpolate(), + lambda df: df.shift(), + lambda df: df.sort_values(1), + lambda df: df.round(), + lambda df: df.rank(), ], ) def test_op_preserves_column_metadata(column, operation): From 14374b898adf89de4506e08f5aaca881d17175a3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:38:17 -0700 Subject: [PATCH 2/2] Fix some mistakes --- python/cudf/cudf/core/indexed_frame.py | 36 +++++++++++++++++-------- python/cudf/cudf/core/window/rolling.py | 15 +++-------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c7bd32b5bfb..897c872ae31 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -344,8 +344,6 @@ def _from_columns_like_self( index.names = index_names else: index.name = index_names[0] - # else: - # index = self.index data = dict(zip(column_names, data_columns)) frame = self.__class__._from_data(data) @@ -436,7 +434,9 @@ def _scan(self, op, axis=None, skipna=True): # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) results.append(getattr(result_col, op)()) - return self._from_columns_like_self(results) + return self._from_data_like_self( + self._data._from_columns_like_self(results) + ) def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the @@ -902,7 +902,9 @@ def replace( # `to_replace_per_column` to `replacements_per_column`. replaced = col.copy(deep=True) copy_data.append(replaced) - result = self._from_columns_like_self(copy_data) + result = self._from_data_like_self( + self._data._from_columns_like_self(copy_data) + ) else: result = self.copy() @@ -1029,7 +1031,9 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): col.clip(low, high) for col, low, high in zip(self._columns, lower, upper) ) - output = self._from_columns_like_self(data) + output = self._from_data_like_self( + self._data._from_columns_like_self(data) + ) return self._mimic_inplace(output, inplace=inplace) @_cudf_nvtx_annotate @@ -2035,7 +2039,9 @@ def interpolate( # Interpolation methods may or may not need the index columns.append(interpolator(col, index=data.index)) - result = self._from_columns_like_self(columns) + result = self._from_data_like_self( + self._data._from_columns_like_self(columns) + ) result.index = data.index return ( @@ -2063,7 +2069,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): data_columns = ( col.shift(periods, fill_value) for col in self._columns ) - return self._from_columns_like_self(data_columns) + return self._from_data_like_self( + self._data._from_columns_like_self(data_columns) + ) @_cudf_nvtx_annotate def truncate(self, before=None, after=None, axis=0, copy=True): @@ -3644,7 +3652,9 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - out = self._from_columns_like_self(result._columns) + out = self._from_data( + self._data._from_columns_like_self(result._columns) + ) out.index = result.index out.index.names = self.index.names return out @@ -3885,7 +3895,9 @@ def round(self, decimals=0, how="half_even"): else col.copy(deep=True) for name, col in self._data.items() ) - return self._from_columns_like_self(cols) + return self._from_data_like_self( + self._data._from_columns_like_self(cols) + ) def resample( self, @@ -6249,10 +6261,12 @@ def rank( level_names=self._data.level_names, label_dtype=self._data.label_dtype, ), - index=source.index, ) else: - result = source._from_columns_like_self(result_columns) + result = source._from_data_like_self( + self._data._from_columns_like_self(result_columns) + ) + result.index = source.index return result.astype(np.float64) def convert_dtypes( diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2208cdd431e..7d140a1ffa5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -253,7 +253,9 @@ def _apply_agg(self, agg_name): applied = ( self._apply_agg_column(col, agg_name) for col in self.obj._columns ) - return self.obj._from_columns_like_self(applied) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) def _reduce( self, @@ -518,17 +520,6 @@ def _apply_agg(self, agg_name): index = cudf.MultiIndex._from_data( {**self._group_keys._data, **self.obj.index._data} ) - # cudf.DataFrame( - # { - # key: value - # for key, value in itertools.chain( - # self._group_keys._data.items(), - # self.obj.index._data.items(), - # ) - # } - # ) - # ) - result = super()._apply_agg(agg_name) result.index = index return result