Preserve column type and class information in more DataFrame operations #15949

Merged · 4 commits · Jun 10, 2024
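This PR threads column-label metadata (RangeIndex columns, column label dtype, MultiIndex level names) through more DataFrame operations instead of rebuilding plain labels. In pandas terms, the behaviour being matched looks roughly like this (a minimal sketch, not code from this PR; the typed-label example assumes pandas >= 2.0):

```python
import numpy as np
import pandas as pd

# Column labels carry metadata of their own: a RangeIndex `columns`, or a
# typed label array. Elementwise/reshaping ops are expected to keep it.
df = pd.DataFrame([[1]], columns=range(1, 2))              # RangeIndex columns
print(type(df.cumsum().columns).__name__)                  # RangeIndex

df2 = pd.DataFrame([[1.5]], columns=np.array([1], dtype=np.int8))
print(df2.round().columns.dtype)                           # int8 (pandas >= 2.0)
```

The new `test_op_preserves_column_metadata` cases at the bottom of this diff exercise the same expectation against cudf for cumsum, replace, clip, rolling, interpolate, shift, sort_values, round, and rank.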
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
self._data = ColumnAccessor(
data=dict(zip(other.names, self._data.columns)),
multiindex=other.multiindex,
rangeindex=other.rangeindex,
level_names=other.level_names,
label_dtype=other.label_dtype,
verify=False,
@@ -7534,7 +7535,7 @@ def _sample_axis_1(
def _from_columns_like_self(
self,
columns: List[ColumnBase],
column_names: abc.Iterable[str],
column_names: Optional[abc.Iterable[str]] = None,
index_names: Optional[List[str]] = None,
*,
override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
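The two dataframe.py tweaks above propagate `rangeindex` when copying column labels and let `_from_columns_like_self` default its `column_names`. The pattern the rest of the PR leans on is "new column data, same label metadata"; a toy, self-contained sketch of that idea (deliberately simplified, not cudf's actual `ColumnAccessor`):

```python
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

@dataclass
class ToyColumnAccessor:
    # label -> column values
    data: Dict[Any, List]
    multiindex: bool = False
    rangeindex: bool = False
    level_names: Tuple = (None,)
    label_dtype: Optional[str] = None

    def _from_columns_like_self(self, columns: List[List]) -> "ToyColumnAccessor":
        # New column data, same labels, and the same label metadata as `self`.
        return ToyColumnAccessor(
            data=dict(zip(self.data.keys(), columns)),
            multiindex=self.multiindex,
            rangeindex=self.rangeindex,
            level_names=self.level_names,
            label_dtype=self.label_dtype,
        )

# An elementwise op only has to produce new column data; the metadata survives.
ca = ToyColumnAccessor({1: [1, 2]}, rangeindex=True, label_dtype="int64")
doubled = ca._from_columns_like_self([[2, 4]])
print(doubled.rangeindex, doubled.label_dtype)  # True int64
```

With a helper like this, each operation below only needs to build a list of result columns; the labels, their dtype, and the RangeIndex/MultiIndex flags come along for free.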
131 changes: 62 additions & 69 deletions python/cudf/cudf/core/indexed_frame.py
@@ -40,8 +40,6 @@
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_bool_dtype,
is_decimal_dtype,
is_dict_like,
is_list_like,
is_scalar,
@@ -372,7 +370,6 @@ def _mimic_inplace(
self._index = result.index
return super()._mimic_inplace(result, inplace)

# Scans
@_cudf_nvtx_annotate
def _scan(self, op, axis=None, skipna=True):
"""
@@ -417,8 +414,8 @@ def _scan(self, op, axis=None, skipna=True):
cast_to_int = op in ("cumsum", "cumprod")
skipna = True if skipna is None else skipna

results = {}
for name, col in self._data.items():
results = []
for col in self._columns:
if skipna:
result_col = col.nans_to_nulls()
else:
@@ -429,19 +426,14 @@
else:
result_col = col

if (
cast_to_int
and not is_decimal_dtype(result_col.dtype)
and (
np.issubdtype(result_col.dtype, np.integer)
or np.issubdtype(result_col.dtype, np.bool_)
)
):
if cast_to_int and result_col.dtype.kind in "uib":
# For reductions that accumulate a value (e.g. sum, not max)
# pandas returns an int64 dtype for all int or bool dtypes.
result_col = result_col.astype(np.int64)
results[name] = getattr(result_col, op)()
return self._from_data(results, self.index)
results.append(getattr(result_col, op)())
return self._from_data_like_self(
self._data._from_columns_like_self(results)
)

def _check_data_index_length_match(self) -> None:
# Validate that the number of rows in the data matches the index if the
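The `cast_to_int and result_col.dtype.kind in "uib"` check above is a terser equivalent of the old decimal/issubdtype combination: `kind` is `'u'`, `'i'`, or `'b'` only for unsigned, signed, and boolean NumPy dtypes, and decimal dtypes don't report one of those kinds. A quick illustration of the codes and of the int64 promotion the comment refers to (assuming a 64-bit platform):

```python
import numpy as np
import pandas as pd

# dtype.kind is a one-character code: 'b' bool, 'i' signed int, 'u' unsigned
# int, 'f' float, so `kind in "uib"` selects exactly the integer/bool columns.
for dt in (np.dtype("bool"), np.dtype("uint8"), np.dtype("int32"), np.dtype("float64")):
    print(dt, "->", dt.kind)

# Accumulating ops in pandas promote small int/bool inputs to the platform int
# (int64 on 64-bit systems), which the astype(np.int64) above mirrors.
print(pd.Series([True, False, True]).cumsum().dtype)     # int64
print(pd.Series([1, 2, 3], dtype="int8").cumsum().dtype)  # int64
```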
@@ -880,7 +872,6 @@ def replace(
FutureWarning,
)
if not (to_replace is None and value is no_default):
copy_data = {}
(
all_na_per_column,
to_replace_per_column,
@@ -890,10 +881,10 @@
value=value,
columns_dtype_map=dict(self._dtypes),
)

copy_data = []
for name, col in self._data.items():
try:
copy_data[name] = col.find_and_replace(
replaced = col.find_and_replace(
to_replace_per_column[name],
replacements_per_column[name],
all_na_per_column[name],
@@ -906,11 +897,13 @@
# that exists in `copy_data`.
# ii. There is an OverflowError while trying to cast
# `to_replace_per_column` to `replacements_per_column`.
copy_data[name] = col.copy(deep=True)
replaced = col.copy(deep=True)
copy_data.append(replaced)
result = self._from_data_like_self(
self._data._from_columns_like_self(copy_data)
)
else:
copy_data = self._data.copy(deep=True)

result = self._from_data(copy_data, self.index)
result = self.copy()

return self._mimic_inplace(result, inplace=inplace)

@@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
):
lower[0], upper[0] = upper[0], lower[0]

data = {
name: col.clip(lower[i], upper[i])
for i, (name, col) in enumerate(self._data.items())
}
output = self._from_data(data, self.index)
output._copy_type_metadata(self, include_index=False)
data = (
col.clip(low, high)
for col, low, high in zip(self._columns, lower, upper)
)
output = self._from_data_like_self(
self._data._from_columns_like_self(data)
)
return self._mimic_inplace(output, inplace=inplace)

@_cudf_nvtx_annotate
@@ -1913,7 +1907,7 @@ def nans_to_nulls(self):
2 <NA> <NA>
"""
result = []
for col in self._data.columns:
for col in self._columns:
converted = col.nans_to_nulls()
if converted is col:
converted = converted.copy()
@@ -2028,8 +2022,8 @@ def interpolate(
)

interpolator = cudf.core.algorithms.get_column_interpolator(method)
columns = {}
for colname, col in data._data.items():
columns = []
for col in data._columns:
if isinstance(col, cudf.core.column.StringColumn):
warnings.warn(
f"{type(self).__name__}.interpolate with object dtype is "
@@ -2040,9 +2034,12 @@
col = col.astype("float64").fillna(np.nan)

# Interpolation methods may or may not need the index
columns[colname] = interpolator(col, index=data.index)
columns.append(interpolator(col, index=data.index))

result = self._from_data(columns, index=data.index)
result = self._from_data_like_self(
self._data._from_columns_like_self(columns)
)
result.index = data.index

return (
result
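For reference, the interpolator applied per column behaves like pandas' default linear interpolation (a trivial pandas sketch, not the cudf code path):

```python
import pandas as pd

# Interior nulls are filled from their neighbours; object/string columns hit
# the deprecation warning emitted above and are cast to float64 first.
print(pd.Series([1.0, None, 3.0]).interpolate().tolist())   # [1.0, 2.0, 3.0]
```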
@@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
data_columns = (
col.shift(periods, fill_value) for col in self._columns
)
return self.__class__._from_data(
zip(self._column_names, data_columns), self.index
return self._from_data_like_self(
self._data._from_columns_like_self(data_columns)
)

@_cudf_nvtx_annotate
@@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
self._column_names,
None if has_range_index or not keep_index else self.index.names,
)
result._data.label_dtype = self._data.label_dtype
result._data.rangeindex = self._data.rangeindex

if keep_index and has_range_index:
result.index = self.index[start:stop]
@@ -3561,11 +3556,6 @@ def sort_values(
),
keep_index=not ignore_index,
)
if (
isinstance(self, cudf.core.dataframe.DataFrame)
and self._data.multiindex
):
out.columns = self._data.to_pandas_index()
return out

def _n_largest_or_smallest(
@@ -3659,14 +3649,12 @@ def _align_to_index(
result = result.sort_values(sort_col_id)
del result[sort_col_id]

result = self.__class__._from_data(
data=result._data, index=result.index
out = self._from_data(
self._data._from_columns_like_self(result._columns)
)
result._data.multiindex = self._data.multiindex
result._data._level_names = self._data._level_names
result.index.names = self.index.names

return result
out.index = result.index
out.index.names = self.index.names
return out

@_cudf_nvtx_annotate
def _reindex(
@@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"):
"decimals must be an integer, a dict-like or a Series"
)

cols = {
name: col.round(decimals[name], how=how)
if (
name in decimals
and _is_non_decimal_numeric_dtype(col.dtype)
and not is_bool_dtype(col.dtype)
)
cols = (
col.round(decimals[name], how=how)
if name in decimals and col.dtype.kind in "fiu"
else col.copy(deep=True)
for name, col in self._data.items()
}

return self.__class__._from_data(
data=cudf.core.column_accessor.ColumnAccessor(
cols,
multiindex=self._data.multiindex,
level_names=self._data.level_names,
),
index=self.index,
)
return self._from_data_like_self(
self._data._from_columns_like_self(cols)
)

def resample(
@@ -6238,6 +6216,8 @@ def rank(
f"axis={axis} is not yet supported in rank"
)

num_cols = self._num_columns
dropped_cols = False
source = self
if numeric_only:
if isinstance(
@@ -6255,15 +6235,28 @@
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return source.astype("float64")
elif source._num_columns != num_cols:
dropped_cols = True

result_columns = libcudf.sort.rank_columns(
[*source._columns], method_enum, na_option, ascending, pct
)

return self.__class__._from_data(
dict(zip(source._column_names, result_columns)),
index=source.index,
).astype(np.float64)
if dropped_cols:
result = type(source)._from_data(
ColumnAccessor(
dict(zip(source._column_names, result_columns)),
multiindex=self._data.multiindex,
level_names=self._data.level_names,
label_dtype=self._data.label_dtype,
),
)
else:
result = source._from_data_like_self(
self._data._from_columns_like_self(result_columns)
)
result.index = source.index
return result.astype(np.float64)

def convert_dtypes(
self,
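The `dropped_cols` branch in `rank` above exists because `numeric_only=True` can remove columns, in which case the original accessor's metadata can't be reused column-for-column. The pandas-level behaviour it accounts for (a minimal sketch):

```python
import pandas as pd

# With numeric_only=True the string column is dropped before ranking, so the
# result's columns are a strict subset of the input's.
df = pd.DataFrame({"a": [3, 1, 2], "b": list("xyz")})
print(df.rank(numeric_only=True).columns.tolist())   # ['a']
print(df.rank(numeric_only=False).columns.tolist())  # ['a', 'b']
```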
41 changes: 8 additions & 33 deletions python/cudf/cudf/core/window/rolling.py
@@ -1,7 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION

import itertools

import numba
import pandas as pd
from pandas.api.indexers import BaseIndexer
@@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name):
agg_params=self.agg_params,
)

def _apply_agg_dataframe(self, df, agg_name):
return cudf.DataFrame._from_data(
{
col_name: self._apply_agg_column(col, agg_name)
for col_name, col in df._data.items()
},
index=df.index,
)

def _apply_agg(self, agg_name):
if isinstance(self.obj, cudf.Series):
return cudf.Series._from_data(
{
self.obj.name: self._apply_agg_column(
self.obj._column, agg_name
)
},
index=self.obj.index,
)
else:
return self._apply_agg_dataframe(self.obj, agg_name)
applied = (
self._apply_agg_column(col, agg_name) for col in self.obj._columns
)
return self.obj._from_data_like_self(
self.obj._data._from_columns_like_self(applied)
)

def _reduce(
self,
@@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window):
)

def _apply_agg(self, agg_name):
index = cudf.MultiIndex.from_frame(
cudf.DataFrame(
{
key: value
for key, value in itertools.chain(
self._group_keys._data.items(),
self.obj.index._data.items(),
)
}
)
index = cudf.MultiIndex._from_data(
{**self._group_keys._data, **self.obj.index._data}
)

result = super()._apply_agg(agg_name)
result.index = index
return result
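The simplified grouped `_apply_agg` above builds the result index directly from the group-key columns plus the original index, which is the usual grouped-rolling shape (a pandas sketch of the equivalent public behaviour):

```python
import pandas as pd

# Grouped rolling results are indexed by (group keys, original index) —
# the same MultiIndex the diff assembles from _group_keys and obj.index.
df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, 2.0, 3.0]})
out = df.groupby("g").rolling(2).sum()
print(out.index.names)    # ['g', None]
print(out["v"].tolist())  # [nan, 3.0, nan]
```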
12 changes: 11 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
@@ -10980,7 +10980,7 @@ def test_squeeze(axis, data):
assert_eq(result, expected)


@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize(
"operation",
[
@@ -10991,6 +10991,16 @@ def test_squeeze(axis, data):
lambda df: abs(df),
lambda df: -df,
lambda df: ~df,
lambda df: df.cumsum(),
lambda df: df.replace(1, 2),
lambda df: df.replace(10, 20),
lambda df: df.clip(0, 10),
lambda df: df.rolling(1).mean(),
lambda df: df.interpolate(),
lambda df: df.shift(),
lambda df: df.sort_values(1),
lambda df: df.round(),
lambda df: df.rank(),
],
)
def test_op_preserves_column_metadata(column, operation):