Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Series.drop api #7304

Merged
merged 22 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 28 additions & 43 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from __future__ import division
from __future__ import annotations, division

import inspect
import itertools
Expand All @@ -10,7 +10,7 @@
import warnings
from collections import OrderedDict, defaultdict
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Set, TypeVar
from typing import Any, Optional, Set, TypeVar

import cupy
import numpy as np
Expand All @@ -30,7 +30,7 @@
from cudf.core.abc import Serializable
from cudf.core.column import as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
Expand Down Expand Up @@ -495,7 +495,12 @@ def _from_table(cls, table, index=None):
return out

@classmethod
def _from_data(cls, data, index=None, columns=None):
def _from_data(
cls,
data: ColumnAccessor,
index: Optional[Index] = None,
columns: Any = None,
) -> DataFrame:
out = cls.__new__(cls)
out._data = data
if index is None:
Expand Down Expand Up @@ -3306,46 +3311,26 @@ def drop(
)

if inplace:
outdf = self
out = self
else:
outdf = self.copy()
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(outdf, target, errors)
_drop_columns(out, target, errors)
elif axis in (0, "index"):
if not isinstance(target, (cudf.Series, cudf.Index)):
target = column.as_column(target)

if isinstance(self._index, cudf.MultiIndex):
if level is None:
level = 0

levels_index = outdf.index.get_level_values(level)
if errors == "raise" and not target.isin(levels_index).all():
raise KeyError("One or more values not found in axis")

# TODO : Could use anti-join as a future optimization
sliced_df = outdf.take(~levels_index.isin(target))
sliced_df._index.names = self._index.names
else:
if errors == "raise" and not target.isin(outdf.index).all():
raise KeyError("One or more values not found in axis")

sliced_df = outdf.join(
cudf.DataFrame(index=target), how="leftanti"
)
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(sliced_df, columns, errors)
_drop_columns(dropped, columns, errors)

outdf._data = sliced_df._data
outdf._index = sliced_df._index
out._data = dropped._data
out._index = dropped._index

if not inplace:
return outdf
return out

def _drop_column(self, name):
"""Drop a column by *name*
Expand Down Expand Up @@ -7645,17 +7630,6 @@ def _get_union_of_series_names(series_list):
return names_list


def _drop_columns(df, columns, errors):
    """Drop every label in *columns* from *df* via ``df._drop_column``.

    A ``KeyError`` raised for a label is swallowed when ``errors`` equals
    ``"ignore"`` and re-raised otherwise.
    """
    ignore_missing = errors == "ignore"
    for name in columns:
        try:
            df._drop_column(name)
        except KeyError:
            if not ignore_missing:
                raise


def _get_host_unique(array):
if isinstance(
array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
Expand All @@ -7665,3 +7639,14 @@ def _get_host_unique(array):
return [array]
else:
return set(array)


def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
    """Remove each label of *columns* from *df*, one ``_drop_column`` call
    per label.

    When a label triggers a ``KeyError`` the error propagates, unless
    ``errors`` is ``"ignore"``, in which case the label is skipped.
    """
    for label in columns:
        try:
            df._drop_column(label)
        except KeyError:
            if errors == "ignore":
                continue
            raise
87 changes: 85 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import operator
import warnings
from collections import OrderedDict, abc as abc
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload

import cupy
import numpy as np
Expand All @@ -18,6 +18,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import ColumnLike, DataFrameOrSeries
from cudf.core.column import as_column, build_categorical_column, column_empty
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand All @@ -27,7 +28,6 @@
min_scalar_type,
)


T = TypeVar("T", bound="Frame")

if TYPE_CHECKING:
Expand Down Expand Up @@ -3850,3 +3850,86 @@ def _is_series(obj):
instead of checking for isinstance(obj, cudf.Series)
"""
return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None


def _drop_rows_by_labels(
    obj: DataFrameOrSeries,
    labels: Union[ColumnLike, abc.Iterable, str],
    level: Union[int, str, None],
    errors: str,
) -> DataFrameOrSeries:
    """Remove the rows of `obj` whose index label appears in `labels`.

    Parameters
    ----------
    obj : DataFrame or Series
        Object whose rows are filtered.
    labels : column-like, iterable or str
        Index labels to remove. Anything that is not already a
        ``cudf.Series`` or ``cudf.Index`` is converted with ``as_column``.
    level : int, str or None
        Only consulted for the drop itself when ``obj`` has a MultiIndex:
        the level (position or name) whose values are matched against
        `labels`. ``None`` selects level 0.
    errors : str
        With ``"raise"``, a ``KeyError`` is raised when some of `labels`
        do not exist in the index (or in the selected level). Any other
        value skips that check.

    Returns
    -------
    DataFrame or Series
        The rows of `obj` that survive the anti-join against `labels`.

    Raises
    ------
    ValueError
        If `level` is an int greater than or equal to
        ``obj.index.nlevels``.
    KeyError
        If ``errors == "raise"`` and a label is missing.
    """
    if isinstance(level, int) and level >= obj.index.nlevels:
        raise ValueError("Param level out of bounds.")

    if not isinstance(labels, (cudf.Series, cudf.Index)):
        labels = as_column(labels)

    res: DataFrameOrSeries
    if isinstance(obj._index, cudf.MultiIndex):
        if level is None:
            level = 0

        levels_index = obj.index.get_level_values(level)
        if errors == "raise" and not labels.isin(levels_index).all():
            raise KeyError("One or more values not found in axis")

        # Positional form of `level`; `level` itself may be a level name.
        if isinstance(level, int):
            ilevel = level
        else:
            ilevel = obj._index.names.index(level)

        # 1. Merge Index df and data df along column axis:
        #    | id | ._index df | data column(s) |
        # Every column is relabelled with its integer position so that
        # index levels and data columns cannot collide by name.
        idx_nlv = obj._index.nlevels
        # NOTE(review): if `_source_data` hands back the index's own
        # frame rather than a fresh one, the relabelling and column
        # additions below would leak into `obj._index` -- confirm.
        working_df = obj._index._source_data
        working_df.columns = list(range(idx_nlv))
        for i, col in enumerate(obj._data):
            working_df[idx_nlv + i] = obj._data[col]

        # 2. Set the selected level as common index:
        #    | level | ._index df w/o level | data column(s) |
        # The columns were just relabelled to positions, so the level has
        # to be addressed by `ilevel`; a level *name* no longer matches
        # any column label.
        working_df = working_df.set_index(ilevel)

        # 3. Use "leftanti" join to drop
        # TODO: use internal API with "leftanti" and specify left and right
        # join keys to bypass logic check
        to_join = cudf.DataFrame(index=cudf.Index(labels, name=level))
        join_res = working_df.join(to_join, how="leftanti")

        # 4. Reconstruct original layout, and rename
        join_res.insert(
            ilevel, name=join_res._index.name, value=join_res._index
        )
        join_res = join_res.reset_index(drop=True)

        # The first `idx_nlv` columns are the index levels again; the
        # remaining ones are the data columns.
        midx = cudf.MultiIndex.from_frame(
            join_res.iloc[:, 0:idx_nlv], names=obj._index.names
        )

        if isinstance(obj, cudf.Series):
            res = obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name
            )
        else:
            res = obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data,
                index=midx,
                columns=obj.columns,
            )

    else:
        if errors == "raise" and not labels.isin(obj.index).all():
            raise KeyError("One or more values not found in axis")

        key_df = cudf.DataFrame(index=labels)
        if isinstance(obj, cudf.Series):
            # Round-trip through a one-column frame to reuse the frame
            # join, then restore the original name.
            res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"]
            res.name = obj.name
        else:
            res = obj.join(key_df, how="leftanti")

    return res
124 changes: 123 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from cudf.core.column.lists import ListMethods
from cudf.core.column.string import StringMethods
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import SeriesGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer
Expand Down Expand Up @@ -490,6 +490,128 @@ def to_arrow(self):
"""
return self._column.to_arrow()

def drop(
    self,
    labels=None,
    axis=0,
    index=None,
    columns=None,
    level=None,
    inplace=False,
    errors="raise",
):
    """
    Return Series with specified index labels removed.

    Remove elements of a Series based on specifying the index labels.
    When using a multi-index, labels on different levels can be removed by
    specifying the level.

    Parameters
    ----------
    labels : single label or list-like
        Index labels to drop.
    axis : 0, default 0
        Redundant for application on Series. Passing ``1`` or
        ``"columns"`` together with ``labels`` raises ``ValueError``.
    index : single label or list-like
        Redundant for application on Series. But ``index`` can be used
        instead of ``labels``
    columns : single label or list-like
        This parameter is ignored. Use ``index`` or ``labels`` to specify.
    level : int or level name, optional
        For MultiIndex, level from which the labels will be removed.
    inplace : bool, default False
        If False, return a copy. Otherwise, do operation
        inplace and return None.
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and only existing labels are
        dropped.

    Returns
    -------
    Series or None
        Series with specified index labels removed or None if
        ``inplace=True``

    Raises
    ------
    KeyError
        If any of the labels is not found in the selected axis and
        ``errors='raise'``
    ValueError
        If ``labels`` is combined with ``index``/``columns``, if
        ``labels`` is given with the column axis, or if none of
        ``labels``, ``index`` and ``columns`` is specified.

    See Also
    --------
    Series.reindex
        Return only specified index labels of Series
    Series.dropna
        Return series without null values
    Series.drop_duplicates
        Return series with duplicate values removed
    cudf.core.dataframe.DataFrame.drop
        Drop specified labels from rows or columns in dataframe

    Examples
    --------
    >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z'])
    >>> s
    x    1
    y    2
    z    3
    dtype: int64

    Drop labels x and z

    >>> s.drop(labels=['x', 'z'])
    y    2
    dtype: int64

    Drop a label from the second level in MultiIndex Series.

    >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']])
    >>> s = cudf.Series(range(6), index=midx)
    >>> s
    0  x    0
       y    1
    1  x    2
       y    3
    2  x    4
       y    5
    dtype: int64
    >>> s.drop(labels='y', level=1)
    0  x    0
    1  x    2
    2  x    4
    dtype: int64
    """
    if labels is not None:
        if index is not None or columns is not None:
            raise ValueError(
                "Cannot specify both 'labels' and 'index'/'columns'"
            )
        # A Series has no column axis, whether spelled 1 or "columns".
        if axis in (1, "columns"):
            raise ValueError(f"No axis named {axis} for object type Series")
        target = labels
    elif index is not None:
        target = index
    elif columns is not None:
        target = []  # Ignore parameter columns
    else:
        raise ValueError(
            "Need to specify at least one of 'labels', "
            "'index' or 'columns'"
        )

    if inplace:
        out = self
    else:
        out = self.copy()

    dropped = _drop_rows_by_labels(out, target, level, errors)

    # Adopt the filtered data and index on the object being returned
    # (or on ``self`` for the in-place case).
    out._data = dropped._data
    out._index = dropped._index

    if not inplace:
        return out

def __copy__(self, deep=True):
return self.copy(deep)

Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,15 @@ def test_dataframe_drop_raises():
expected_error_message="One or more values not found in axis",
)

# label dtype mismatch
assert_exceptions_equal(
lfunc=pdf.drop,
rfunc=df.drop,
lfunc_args_and_kwargs=([3],),
rfunc_args_and_kwargs=([3],),
expected_error_message="One or more values not found in axis",
)

expect = pdf.drop("p", errors="ignore")
actual = df.drop("p", errors="ignore")

Expand Down
Loading