From d3be46ff7829d2a02d6c6101f1ead764cfecbf6d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 3 Feb 2021 13:51:37 -0800 Subject: [PATCH 01/18] Moving logics into Frame.py --- python/cudf/cudf/core/dataframe.py | 90 +++------------------------- python/cudf/cudf/core/frame.py | 96 ++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 81 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc68eaa71a6..2aa1eda0a5c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3250,65 +3250,15 @@ def drop( weight 1.0 0.8 """ - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - target = labels - elif index is not None: - target = index - axis = 0 - elif columns is not None: - target = columns - axis = 1 - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - outdf = self - else: - outdf = self.copy() - - if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(outdf, target, errors) - elif axis in (0, "index"): - if not isinstance(target, (cudf.Series, cudf.Index)): - target = column.as_column(target) - - if isinstance(self._index, cudf.MultiIndex): - if level is None: - level = 0 - - levels_index = outdf.index.get_level_values(level) - if errors == "raise" and not target.isin(levels_index).all(): - raise KeyError("One or more values not found in axis") - - # TODO : Could use anti-join as a future optimization - sliced_df = outdf.take(~levels_index.isin(target)) - sliced_df._index.names = self._index.names - else: - if errors == "raise" and not target.isin(outdf.index).all(): - raise KeyError("One or more values not found in axis") - - sliced_df = outdf.join( - cudf.DataFrame(index=target), how="leftanti" - ) - - if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(sliced_df, columns, errors) - - outdf._data = sliced_df._data - outdf._index = sliced_df._index - - if not inplace: - return outdf + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) def _drop_column(self, name): """Drop a column by *name* @@ -7606,25 +7556,3 @@ def _get_union_of_series_names(series_list): names_list = [*range(len(series_list))] return names_list - - -def _drop_columns(df, columns, errors): - for c in columns: - try: - df._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e - - -def _get_host_unique(array): - if isinstance( - array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) - ): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] - else: - return set(array) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 43d6416506d..ab5f32ea01f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,6 +3,7 @@ import copy import functools +import numbers import operator import warnings from collections import OrderedDict, abc as abc @@ -1193,6 +1194,79 @@ def scatter_by_map( return tables + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """ + """ + + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + target = labels + elif index is not None: + target = index + axis = 0 + elif columns is not None: + target = columns + axis = 1 + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) + + if inplace: + outdf = self + else: + outdf = self.copy() + + if axis in (1, "columns"): + target = _get_host_unique(target) + + _drop_columns(outdf, target, errors) + elif axis in (0, "index"): + if not isinstance(target, (cudf.Series, cudf.Index)): + target = as_column(target) + + if isinstance(self._index, cudf.MultiIndex): + if level is None: + level = 0 + + levels_index = outdf.index.get_level_values(level) + if errors == "raise" and not target.isin(levels_index).all(): + raise KeyError("One or more values not found in axis") + + # TODO : Could use anti-join as a future optimization + sliced_df = outdf.take(~levels_index.isin(target)) + sliced_df._index.names = self._index.names + else: + if errors == "raise" and not target.isin(outdf.index).all(): + raise KeyError("One or more values not found in axis") + + sliced_df = outdf.join( + cudf.DataFrame(index=target), how="leftanti" + ) + + if columns is not None: + columns = _get_host_unique(columns) + _drop_columns(sliced_df, columns, errors) + + outdf._data = sliced_df._data + outdf._index = sliced_df._index + + if not inplace: + return outdf + def dropna( self, axis=0, how="any", thresh=None, subset=None, inplace=False ): @@ -3856,3 +3930,25 @@ def _is_series(obj): instead of checking for isinstance(obj, cudf.Series) """ return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None + + +def _get_host_unique(array): + if isinstance( + array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) + ): + return array.unique.to_pandas() + elif isinstance(array, (str, numbers.Number)): + return [array] + else: + return set(array) + + +def _drop_columns(df, columns, errors): + for c in columns: + try: + df._drop_column(c) + except KeyError as e: + if errors == "ignore": + pass + else: + raise e From 85bf4f8632079b6dbb5d79e2b32b00b0b087ad8b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 3 Feb 2021 16:27:03 -0800 Subject: [PATCH 02/18] Adding series.drop tests except errors param test --- python/cudf/cudf/core/dataframe.py | 10 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/series.py | 34 +++++ python/cudf/cudf/tests/test_series.py | 190 ++++++++++++++++++++++++++ 4 files changed, 238 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa1eda0a5c..d8b75a858cc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3250,7 +3250,7 @@ def drop( weight 1.0 0.8 """ - return super().drop( + return super()._drop( labels=labels, axis=axis, index=index, @@ -7312,6 +7312,14 @@ def equals(self, other): return False return super().equals(other) + def _drop_rows_by_labels(self, labels): + """Delete rows specified by `label` parameter. In `DataFrame`, this can + be achieved efficiently by a left-anti join operation + + labels: a list of labels specifying the rows to drop + """ + return self.join(cudf.DataFrame(index=labels), how="leftanti") + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ab5f32ea01f..67bb3ec5c96 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1194,7 +1194,7 @@ def scatter_by_map( return tables - def drop( + def _drop( self, labels=None, axis=0, @@ -1253,9 +1253,7 @@ def drop( if errors == "raise" and not target.isin(outdf.index).all(): raise KeyError("One or more values not found in axis") - sliced_df = outdf.join( - cudf.DataFrame(index=target), how="leftanti" - ) + sliced_df = self._drop_rows_by_labels(target) if columns is not None: columns = _get_host_unique(columns) @@ -3660,6 +3658,9 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + def _drop_rows_by_labels(self, labels): + raise NotImplementedError + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6f2fdc28101..257502dc239 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -554,6 +554,28 @@ def copy(self, deep=True): result.index = self.index.copy(deep=deep) return result + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """ + """ + return super()._drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + def __copy__(self, deep=True): return self.copy(deep) @@ -4567,6 +4589,18 @@ def keys(self): """ return self.index + def _drop_rows_by_labels(self, labels): + """Delete rows specified by `label` parameter. Resort to the efficient + implementation in `cudf.DataFrame` + + labels: a list of labels specifying the rows to drop + """ + df = self.to_frame(name="tmp") + dropped = df._drop_rows_by_labels(labels)["tmp"] + dropped.name = self.name + + return dropped + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a19b88caf4c..8c85870b4f5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -934,3 +934,193 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): actual = gs.fillna(fill_value) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series(["a"] * 20, index=range(0, 20)), + pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), + ], +) +@pytest.mark.parametrize( + "labels", + [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_labels(ps, labels, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(labels=labels, axis=0, inplace=inplace) + actual = gs.drop(labels=labels, axis=0, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series(["a"] * 20, index=range(0, 20)), + pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), + ], +) +@pytest.mark.parametrize( + "index", + [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_index(ps, index, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace) + actual = gs.drop(index=index, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series( + ["a" if i % 2 == 0 else "b" for i in range(0, 10)], + index=pd.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], + [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], + ], + ), + name="abc", + ) + ], +) +@pytest.mark.parametrize( + "index,level", + [ + ("cow", 0), + ("lama", 0), + ("falcon", 0), + ("speed", 1), + ("weight", 1), + ("length", 1), + pytest.param( + "cow", + None, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + pytest.param( + "lama", + None, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + pytest.param( + "falcon", + None, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_multiindex(ps, index, level, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace, level=level) + actual = gs.drop(index=index, inplace=inplace, level=level) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +# def test_series_drop_error(): +# gs = cudf.Series([42], name="a") +# ps = gs.to_pandas() + +# assert_exceptions_equal( +# lfunc=ps.drop, +# rfunc=df.drop, +# lfunc_args_and_kwargs=([], {"columns": "d"}), +# rfunc_args_and_kwargs=([], {"columns": "d"}), +# expected_error_message="column 'd' does not exist", +# ) + +# assert_exceptions_equal( +# lfunc=pdf.drop, +# rfunc=df.drop, +# lfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), +# rfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), +# expected_error_message="column 'd' does not exist", +# ) + +# assert_exceptions_equal( +# lfunc=pdf.drop, +# rfunc=df.drop, +# lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), +# rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), +# expected_error_message="Cannot specify both", +# ) + +# def test_series_drop_raises(): +# gs = cudf.DataFrame([10, 20, 30], index=["x", "y", "z"], name="c") +# ps = gs.to_pandas() + +# assert_exceptions_equal( +# lfunc=ps.drop, +# rfunc=gs.drop, +# lfunc_args_and_kwargs=(["p"],), +# rfunc_args_and_kwargs=(["p"],), +# expected_error_message="One or more values not found in axis", +# ) + +# expect = ps.drop("p", errors="ignore") +# actual = gs.drop("p", errors="ignore") + +# assert_eq(actual, expect) + +# assert_exceptions_equal( +# lfunc=ps.drop, +# rfunc=gs.drop, +# lfunc_args_and_kwargs=([], {"columns": "p"}), +# rfunc_args_and_kwargs=([], {"columns": "p"}), +# expected_error_message="column 'p' does not exist", +# ) + +# expect = ps.drop(columns="p", errors="ignore") +# actual = gs.drop(columns="p", errors="ignore") + +# assert_eq(actual, expect) + +# assert_exceptions_equal( +# lfunc=ps.drop, +# rfunc=gs.drop, +# lfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), +# rfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), +# expected_error_message="column 'p' does not exist", +# ) + +# expect = ps.drop(labels="p", axis=1, errors="ignore") +# actual = gs.drop(labels="p", axis=1, errors="ignore") + +# assert_eq(actual, expect) From b301cd8c323157a2ecd90d00fe977435f07b00a4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 3 Feb 2021 16:57:04 -0800 Subject: [PATCH 03/18] Add errors test --- python/cudf/cudf/core/series.py | 2 + python/cudf/cudf/tests/test_series.py | 111 ++++++++++---------------- 2 files changed, 43 insertions(+), 70 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 257502dc239..88ea3754881 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -566,6 +566,8 @@ def drop( ): """ """ + columns = [] if columns is not None else columns + return super()._drop( labels=labels, axis=axis, diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8c85870b4f5..c2f0ace0327 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1054,73 +1054,44 @@ def test_series_drop_multiindex(ps, index, level, inplace): assert_eq(expected, actual) -# def test_series_drop_error(): -# gs = cudf.Series([42], name="a") -# ps = gs.to_pandas() - -# assert_exceptions_equal( -# lfunc=ps.drop, -# rfunc=df.drop, -# lfunc_args_and_kwargs=([], {"columns": "d"}), -# rfunc_args_and_kwargs=([], {"columns": "d"}), -# expected_error_message="column 'd' does not exist", -# ) - -# assert_exceptions_equal( -# lfunc=pdf.drop, -# rfunc=df.drop, -# lfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), -# rfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), -# expected_error_message="column 'd' does not exist", -# ) - -# assert_exceptions_equal( -# lfunc=pdf.drop, -# rfunc=df.drop, -# lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), -# rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), -# expected_error_message="Cannot specify both", -# ) - -# def test_series_drop_raises(): -# gs = cudf.DataFrame([10, 20, 30], index=["x", "y", "z"], name="c") -# ps = gs.to_pandas() - -# assert_exceptions_equal( -# lfunc=ps.drop, -# rfunc=gs.drop, -# lfunc_args_and_kwargs=(["p"],), -# rfunc_args_and_kwargs=(["p"],), -# expected_error_message="One or more values not found in axis", -# ) - -# expect = ps.drop("p", errors="ignore") -# actual = gs.drop("p", errors="ignore") - -# assert_eq(actual, expect) - -# assert_exceptions_equal( -# lfunc=ps.drop, -# rfunc=gs.drop, -# lfunc_args_and_kwargs=([], {"columns": "p"}), -# rfunc_args_and_kwargs=([], {"columns": "p"}), -# expected_error_message="column 'p' does not exist", -# ) - -# expect = ps.drop(columns="p", errors="ignore") -# actual = gs.drop(columns="p", errors="ignore") - -# assert_eq(actual, expect) - -# assert_exceptions_equal( -# lfunc=ps.drop, -# rfunc=gs.drop, -# lfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), -# rfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), -# expected_error_message="column 'p' does not exist", -# ) - -# expect = ps.drop(labels="p", axis=1, errors="ignore") -# actual = gs.drop(labels="p", axis=1, errors="ignore") - -# assert_eq(actual, expect) +def test_series_drop_edge_inputs(): + gs = cudf.Series([42], name="a") + ps = gs.to_pandas() + + assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"])) + + assert_eq(ps.drop(columns="b"), gs.drop(columns="b")) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + expected_error_message="Cannot specify both", + ) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([], {}), + rfunc_args_and_kwargs=([], {}), + expected_error_message="Need to specify at least one", + ) + + +def test_series_drop_raises(): + gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") + ps = gs.to_pandas() + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["p"],), + rfunc_args_and_kwargs=(["p"],), + expected_error_message="One or more values not found in axis", + ) + + expect = ps.drop("p", errors="ignore") + actual = gs.drop("p", errors="ignore") + + assert_eq(actual, expect) From 818a8bb3deb73cbfb336f2fabdfea4dcf7dca62d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 3 Feb 2021 20:43:33 -0800 Subject: [PATCH 04/18] Move common code path to helper func --- python/cudf/cudf/core/dataframe.py | 72 +++++++++++++--- python/cudf/cudf/core/frame.py | 115 ++++++-------------------- python/cudf/cudf/core/series.py | 41 ++++++--- python/cudf/cudf/tests/test_series.py | 8 ++ 4 files changed, 123 insertions(+), 113 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d8b75a858cc..ab4b6cf2751 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -30,7 +30,7 @@ from cudf.core.abc import Serializable from cudf.core.column import as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer @@ -3250,15 +3250,45 @@ def drop( weight 1.0 0.8 """ - return super()._drop( - labels=labels, - axis=axis, - index=index, - columns=columns, - level=level, - inplace=inplace, - errors=errors, - ) + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + target = labels + elif index is not None: + target = index + axis = 0 + elif columns is not None: + target = columns + axis = 1 + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) + + if inplace: + out = self + else: + out = self.copy() + + if axis in (1, "columns"): + target = _get_host_unique(target) + + _drop_columns(out, target, errors) + elif axis in (0, "index"): + dropped = _drop_rows_by_labels(out, target, level, errors) + + if columns is not None: + columns = _get_host_unique(columns) + _drop_columns(dropped, columns, errors) + + out._data = dropped._data + out._index = dropped._index + + if not inplace: + return out def _drop_column(self, name): """Drop a column by *name* @@ -7564,3 +7594,25 @@ def _get_union_of_series_names(series_list): names_list = [*range(len(series_list))] return names_list + + +def _get_host_unique(array): + if isinstance( + array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) + ): + return array.unique.to_pandas() + elif isinstance(array, (str, numbers.Number)): + return [array] + else: + return set(array) + + +def _drop_columns(df, columns, errors): + for c in columns: + try: + df._drop_column(c) + except KeyError as e: + if errors == "ignore": + pass + else: + raise e diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 67bb3ec5c96..78967be4f62 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ import copy import functools -import numbers import operator import warnings from collections import OrderedDict, abc as abc @@ -1194,77 +1193,6 @@ def scatter_by_map( return tables - def _drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - """ - """ - - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - target = labels - elif index is not None: - target = index - axis = 0 - elif columns is not None: - target = columns - axis = 1 - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - outdf = self - else: - outdf = self.copy() - - if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(outdf, target, errors) - elif axis in (0, "index"): - if not isinstance(target, (cudf.Series, cudf.Index)): - target = as_column(target) - - if isinstance(self._index, cudf.MultiIndex): - if level is None: - level = 0 - - levels_index = outdf.index.get_level_values(level) - if errors == "raise" and not target.isin(levels_index).all(): - raise KeyError("One or more values not found in axis") - - # TODO : Could use anti-join as a future optimization - sliced_df = outdf.take(~levels_index.isin(target)) - sliced_df._index.names = self._index.names - else: - if errors == "raise" and not target.isin(outdf.index).all(): - raise KeyError("One or more values not found in axis") - - sliced_df = self._drop_rows_by_labels(target) - - if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(sliced_df, columns, errors) - - outdf._data = sliced_df._data - outdf._index = sliced_df._index - - if not inplace: - return outdf - def dropna( self, axis=0, how="any", thresh=None, subset=None, inplace=False ): @@ -3658,9 +3586,6 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) - def _drop_rows_by_labels(self, labels): - raise NotImplementedError - def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] @@ -3933,23 +3858,29 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None -def _get_host_unique(array): - if isinstance( - array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) - ): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] +def _drop_rows_by_labels(obj, labels, level, errors): + """Remove rows specified by labels + + Parameter `level` is ignored if `obj._index` is not `MultiIndex` + """ + if not isinstance(labels, (cudf.Series, cudf.Index)): + labels = as_column(labels) + + if isinstance(obj._index, cudf.MultiIndex): + if level is None: + level = 0 + + levels_index = obj.index.get_level_values(level) + if errors == "raise" and not labels.isin(levels_index).all(): + raise KeyError("One or more values not found in axis") + + # TODO : Could use anti-join as a future optimization + sliced_df = obj.take(~levels_index.isin(labels)) + sliced_df._index.names = obj._index.names else: - return set(array) + if errors == "raise" and not labels.isin(obj.index).all(): + raise KeyError("One or more values not found in axis") + sliced_df = obj._drop_rows_by_labels(labels) -def _drop_columns(df, columns, errors): - for c in columns: - try: - df._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e + return sliced_df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 88ea3754881..4fc93910d34 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -35,7 +35,7 @@ from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer @@ -566,17 +566,36 @@ def drop( ): """ """ - columns = [] if columns is not None else columns + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + if axis == 1: + raise ValueError("No axis named 1 for object type Series") + target = labels + elif index is not None: + target = index + elif columns is not None: + target = [] # Ignore parameter columns + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) - return super()._drop( - labels=labels, - axis=axis, - index=index, - columns=columns, - level=level, - inplace=inplace, - errors=errors, - ) + if inplace: + out = self + else: + out = self.copy() + + dropped = _drop_rows_by_labels(out, target, level, errors) + + out._data = dropped._data + out._index = dropped._index + + if not inplace: + return out def __copy__(self, deep=True): return self.copy(deep) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c2f0ace0327..c2bb7da1f49 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1078,6 +1078,14 @@ def test_series_drop_edge_inputs(): expected_error_message="Need to specify at least one", ) + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["b"], {"axis": 1}), + rfunc_args_and_kwargs=(["b"], {"axis": 1}), + expected_error_message="No axis named 1", + ) + def test_series_drop_raises(): gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") From bc0f5f2e33d8ac9971a5a9b5e1b4a406f2496db1 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 3 Feb 2021 21:14:54 -0800 Subject: [PATCH 05/18] Add typing --- python/cudf/cudf/core/dataframe.py | 7 +++++-- python/cudf/cudf/core/frame.py | 13 ++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ab4b6cf2751..74c9b504dbf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,6 +26,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.null_mask import MaskState, create_null_mask +from cudf._typing import ColumnLike from cudf.core import column, reshape from cudf.core.abc import Serializable from cudf.core.column import as_column, column_empty @@ -7342,7 +7343,9 @@ def equals(self, other): return False return super().equals(other) - def _drop_rows_by_labels(self, labels): + def _drop_rows_by_labels( + self: cudf.DataFrame, labels: ColumnLike + ) -> cudf.DataFrame: """Delete rows specified by `label` parameter. In `DataFrame`, this can be achieved efficiently by a left-anti join operation @@ -7607,7 +7610,7 @@ def _get_host_unique(array): return set(array) -def _drop_columns(df, columns, errors): +def _drop_columns(df: DataFrame, columns: Iterable, errors: str): for c in columns: try: df._drop_column(c) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 78967be4f62..b8cdd3db485 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import operator import warnings from collections import OrderedDict, abc as abc -from typing import Any, Dict, Tuple, overload +from typing import Any, Dict, Tuple, Union, overload import cupy import numpy as np @@ -18,6 +18,7 @@ import cudf from cudf import _lib as libcudf +from cudf._typing import ColumnLike from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3858,8 +3859,14 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None -def _drop_rows_by_labels(obj, labels, level, errors): - """Remove rows specified by labels +def _drop_rows_by_labels( + obj: Union[cudf.DataFrame, cudf.Series], + labels: Union[ColumnLike, abc.Iterable, str], + level: Union[int, str], + errors: str, +) -> Union[cudf.DataFrame, cudf.Series]: + """Remove rows specified by `labels`. If `errors=True`, an error is raised + if some items in `labels` do not exist in `obj._index`. Parameter `level` is ignored if `obj._index` is not `MultiIndex` """ From 1c38141d5965dc63ccc3dcaf4cf55cbfbae45a98 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 4 Feb 2021 12:08:11 -0800 Subject: [PATCH 06/18] Updated docstring --- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/series.py | 79 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 74c9b504dbf..115ffa6b1ec 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7344,8 +7344,8 @@ def equals(self, other): return super().equals(other) def _drop_rows_by_labels( - self: cudf.DataFrame, labels: ColumnLike - ) -> cudf.DataFrame: + self: "cudf.DataFrame", labels: ColumnLike + ) -> "cudf.DataFrame": """Delete rows specified by `label` parameter. In `DataFrame`, this can be achieved efficiently by a left-anti join operation diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4fc93910d34..fb781925a40 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -565,6 +565,85 @@ def drop( errors="raise", ): """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed by + specifying the level. + + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : 0, default 0 + Redundant for application on Series. + index : single label or list-like + Redundant for application on Series. But ``index`` can be used + instead of ``labels`` + columns : single label or list-like + This parameter is ignored. Use ``index`` or ``labels`` to specify. + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If False, return a copy. Otherwise, do operation + inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + Series or None + Series with specified index labels removed or None if + ``inplace=True`` + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + ``error='raise'`` + + See Also + -------- + Series.reindex + Return only specified index labels of Series + Series.dropna + Return series without null values + Series.drop_duplicates + Return series with duplicate values removed + cudf.core.dataframe.DataFrame.drop + Drop specified labels from rows or columns in dataframe + + Examples + -------- + >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) + >>> s + x 1 + y 2 + z 3 + dtype: int64 + + Drop labels x and z + + >>> s.drop(labels=['x', 'z']) + y 2 + dtype: int64 + + Drop a label from the second level in MultiIndex Series. + + >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) + >>> s = cudf.Series(range(6), index=midx) + >>> s + 0 x 0 + y 1 + 1 x 2 + y 3 + 2 x 4 + y 5 + >>> s.drop(labels='y', level=1) + 0 x 0 + 1 x 2 + 2 x 4 """ if labels is not None: if index is not None or columns is not None: From fd70f2b23cb8facf7141576534549d2fd6083d22 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 5 Feb 2021 11:44:17 -0800 Subject: [PATCH 07/18] Use __future__ --- python/cudf/cudf/core/dataframe.py | 6 ++---- python/cudf/cudf/core/frame.py | 3 +-- python/cudf/cudf/core/series.py | 3 ++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 94e72026f38..3619f917972 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. -from __future__ import division +from __future__ import annotations, division import inspect import itertools @@ -7389,9 +7389,7 @@ def equals(self, other): return False return super().equals(other) - def _drop_rows_by_labels( - self: "cudf.DataFrame", labels: ColumnLike - ) -> "cudf.DataFrame": + def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame": """Delete rows specified by `label` parameter. In `DataFrame`, this can be achieved efficiently by a left-anti join operation diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f8628c19176..4a075eb55cb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import operator import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload +from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload import cupy import numpy as np @@ -28,7 +28,6 @@ min_scalar_type, ) - T = TypeVar("T", bound="Frame") if TYPE_CHECKING: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6211bc356e9..aa7a55256d0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -19,6 +19,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask +from cudf._typing import ColumnLike from cudf.core.abc import Serializable from cudf.core.column import ( ColumnBase, @@ -4625,7 +4626,7 @@ def keys(self): """ return self.index - def _drop_rows_by_labels(self, labels): + def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.Series": """Delete rows specified by `label` parameter. Resort to the efficient implementation in `cudf.DataFrame` From 2765eab81b368ddaee670c00f956545f631d14a8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 5 Feb 2021 11:50:01 -0800 Subject: [PATCH 08/18] Use DataFrameOrSeries --- python/cudf/cudf/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4a075eb55cb..5a14a24401e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -18,7 +18,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnLike +from cudf._typing import ColumnLike, DataFrameOrSeries from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3841,11 +3841,11 @@ def _is_series(obj): def _drop_rows_by_labels( - obj: Union[cudf.DataFrame, cudf.Series], + obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], level: Union[int, str], errors: str, -) -> Union[cudf.DataFrame, cudf.Series]: +) -> DataFrameOrSeries: """Remove rows specified by `labels`. If `errors=True`, an error is raised if some items in `labels` do not exist in `obj._index`. From 5325fccd16e203259fc0de2eae8a5eca38d88783 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 5 Feb 2021 13:11:17 -0800 Subject: [PATCH 09/18] Check levels --- python/cudf/cudf/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5a14a24401e..c897ee3ba39 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3848,9 +3848,11 @@ def _drop_rows_by_labels( ) -> DataFrameOrSeries: """Remove rows specified by `labels`. If `errors=True`, an error is raised if some items in `labels` do not exist in `obj._index`. - - Parameter `level` is ignored if `obj._index` is not `MultiIndex` """ + if isinstance(level, int): + if obj._index is not None and level > obj._index.nlevels: + raise ValueError("Param level out of bounds.") + if not isinstance(labels, (cudf.Series, cudf.Index)): labels = as_column(labels) From a61119f745edd8fd008f84dca6e7218d839d43ce Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 5 Feb 2021 13:48:34 -0800 Subject: [PATCH 10/18] Better doc --- python/cudf/cudf/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c897ee3ba39..435cb1f13fb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3848,9 +3848,11 @@ def _drop_rows_by_labels( ) -> DataFrameOrSeries: """Remove rows specified by `labels`. If `errors=True`, an error is raised if some items in `labels` do not exist in `obj._index`. + + Will raise if level(int) is greater or equal to index nlevels """ if isinstance(level, int): - if obj._index is not None and level > obj._index.nlevels: + if obj._index is not None and level >= obj._index.nlevels: raise ValueError("Param level out of bounds.") if not isinstance(labels, (cudf.Series, cudf.Index)): From e8d494a2493e6d4135f63e4ebbc4542acce44406 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 5 Feb 2021 22:07:34 -0800 Subject: [PATCH 11/18] Initial impl midx improve --- python/cudf/cudf/core/dataframe.py | 40 +++++++++++++++++++++++++++--- python/cudf/cudf/core/frame.py | 10 +++----- python/cudf/cudf/core/series.py | 40 ++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3619f917972..f0fe162f312 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Set, TypeVar +from typing import Any, Set, TypeVar, Union import cupy import numpy as np @@ -7389,13 +7389,47 @@ def equals(self, other): return False return super().equals(other) - def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame": + def _drop_rows_by_labels( + self, labels: ColumnLike, level: Union[int, str] = None + ) -> "cudf.DataFrame": """Delete rows specified by `label` parameter. In `DataFrame`, this can be achieved efficiently by a left-anti join operation labels: a list of labels specifying the rows to drop """ - return self.join(cudf.DataFrame(index=labels), how="leftanti") + + if isinstance(self._index, cudf.MultiIndex): + if isinstance(level, int): + ilevel = level + else: + ilevel = self._index.names.index(level) + + idx_nlv = self._index.nlevels + working_df = self._index._source_data + # TODO: figure out what __unique__ should be + for col in self.columns: + working_df["__unique__" + str(col)] = self[col]._column + working_df = working_df.set_index(level) + + # TODO: replace with Brandon's suggestion + to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + join_res = working_df.join(to_join, how="leftanti") + join_res.insert( + ilevel, name=join_res._index.name, value=join_res._index + ) + join_res = join_res.reset_index(drop=True) + + midx = cudf.MultiIndex.from_frame( + join_res.iloc[:, 0:idx_nlv], names=self._index.names + ) + + dropped = join_res.iloc[:, idx_nlv:] + dropped = dropped.set_index(midx) + dropped.columns = self.columns + else: + dropped = self.join(cudf.DataFrame(index=labels), how="leftanti") + + return dropped _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 435cb1f13fb..6fa3c476be9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3851,9 +3851,8 @@ def _drop_rows_by_labels( Will raise if level(int) is greater or equal to index nlevels """ - if isinstance(level, int): - if obj._index is not None and level >= obj._index.nlevels: - raise ValueError("Param level out of bounds.") + if isinstance(level, int) and level >= obj.index.nlevels: + raise ValueError("Param level out of bounds.") if not isinstance(labels, (cudf.Series, cudf.Index)): labels = as_column(labels) @@ -3866,9 +3865,8 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(levels_index).all(): raise KeyError("One or more values not found in axis") - # TODO : Could use anti-join as a future optimization - sliced_df = obj.take(~levels_index.isin(labels)) - sliced_df._index.names = obj._index.names + sliced_df = obj._drop_rows_by_labels(labels, level) + else: if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aa7a55256d0..756f5c34581 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,7 +6,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional, Set +from typing import Any, Optional, Set, Union from uuid import uuid4 import cupy @@ -4626,16 +4626,46 @@ def keys(self): """ return self.index - def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.Series": + def _drop_rows_by_labels( + self, labels: ColumnLike, level: Union[int, __builtins__.str] = None + ) -> "cudf.Series": """Delete rows specified by `label` parameter. Resort to the efficient implementation in `cudf.DataFrame` labels: a list of labels specifying the rows to drop """ - df = self.to_frame(name="tmp") - dropped = df._drop_rows_by_labels(labels)["tmp"] - dropped.name = self.name + if isinstance(self._index, cudf.MultiIndex): + if isinstance(level, int): + ilevel = level + else: + ilevel = self._index.names.index(level) + + idx_nlv = self._index.nlevels + working_df = self._index._source_data + # TODO: figure out what __unique__ should be + working_df["__unique__"] = self._column + working_df = working_df.set_index(level) + + # TODO: replace with Brandon's suggestion + to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + join_res = working_df.join(to_join, how="leftanti") + join_res.insert( + ilevel, name=join_res._index.name, value=join_res._index + ) + join_res = join_res.reset_index(drop=True) + + midx = cudf.MultiIndex.from_frame( + join_res.iloc[:, 0:idx_nlv], names=self._index.names + ) + + dropped = join_res["__unique__"] + dropped = dropped.set_index(midx) + else: + df = self.to_frame(name="tmp") + dropped = df._drop_rows_by_labels(labels)["tmp"] + + dropped.name = self.name return dropped _accessors = set() # type: Set[Any] From 43acf56525cf2d40cd79d73a199c6da83bfeaa24 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Feb 2021 10:16:20 -0800 Subject: [PATCH 12/18] Use column index to rename --- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/core/series.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f0fe162f312..8cefdf97cf6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7406,9 +7406,9 @@ def _drop_rows_by_labels( idx_nlv = self._index.nlevels working_df = self._index._source_data - # TODO: figure out what __unique__ should be - for col in self.columns: - working_df["__unique__" + str(col)] = self[col]._column + working_df.columns = [i for i in range(idx_nlv)] + for i, col in enumerate(self.columns): + working_df[idx_nlv + i] = self[col]._column working_df = working_df.set_index(level) # TODO: replace with Brandon's suggestion diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 756f5c34581..6d3299a42c1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4643,9 +4643,9 @@ def _drop_rows_by_labels( idx_nlv = self._index.nlevels working_df = self._index._source_data - # TODO: figure out what __unique__ should be - working_df["__unique__"] = self._column - working_df = working_df.set_index(level) + working_df.columns = [i for i in range(idx_nlv)] + working_df[idx_nlv] = self._column + working_df = working_df.set_index(ilevel) # TODO: replace with Brandon's suggestion to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) @@ -4659,7 +4659,7 @@ def _drop_rows_by_labels( join_res.iloc[:, 0:idx_nlv], names=self._index.names ) - dropped = join_res["__unique__"] + dropped = join_res[idx_nlv] dropped = dropped.set_index(midx) else: df = self.to_frame(name="tmp") From b72ac3ea6ed7450f897e16fe88334a2e6cfd2708 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Feb 2021 10:33:25 -0800 Subject: [PATCH 13/18] Better docstrings --- python/cudf/cudf/core/dataframe.py | 7 +++++++ python/cudf/cudf/core/series.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8cefdf97cf6..f0ca4cc1fd3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7404,16 +7404,23 @@ def _drop_rows_by_labels( else: ilevel = self._index.names.index(level) + # 1. Merge Index df and data df along column axis: + # | id | ._index df | original df | idx_nlv = self._index.nlevels working_df = self._index._source_data working_df.columns = [i for i in range(idx_nlv)] for i, col in enumerate(self.columns): working_df[idx_nlv + i] = self[col]._column + # 2. Set `level` as common index: + # | level | ._index df w/o level | original df working_df = working_df.set_index(level) + # 3. Use "leftanti" join to drop # TODO: replace with Brandon's suggestion to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) join_res = working_df.join(to_join, how="leftanti") + + # 4. Reconstruct original layout, and rename join_res.insert( ilevel, name=join_res._index.name, value=join_res._index ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6d3299a42c1..803bc92e2bc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4641,15 +4641,22 @@ def _drop_rows_by_labels( else: ilevel = self._index.names.index(level) + # 1. Merge Index df and data column along column axis: + # | id | ._index df | data column | idx_nlv = self._index.nlevels working_df = self._index._source_data working_df.columns = [i for i in range(idx_nlv)] working_df[idx_nlv] = self._column + # 2. Set `level` as common index: + # | level | ._index df w/o level | data column working_df = working_df.set_index(ilevel) + # 3. Use "leftanti" join to drop # TODO: replace with Brandon's suggestion to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) join_res = working_df.join(to_join, how="leftanti") + + # 4. Reconstruct original layout, and rename join_res.insert( ilevel, name=join_res._index.name, value=join_res._index ) From 3cb9ede2755602f5ff5135facf509f50e1780a57 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Feb 2021 12:06:08 -0800 Subject: [PATCH 14/18] Merge S/DF code paths --- python/cudf/cudf/core/dataframe.py | 51 ++++++------------------------ python/cudf/cudf/core/frame.py | 47 +++++++++++++++++++++++++-- python/cudf/cudf/core/series.py | 46 +++------------------------ 3 files changed, 58 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f0ca4cc1fd3..ed47027afd4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Set, TypeVar, Union +from typing import Any, Optional, Set, TypeVar import cupy import numpy as np @@ -492,7 +492,12 @@ def _from_table(cls, table, index=None): return out @classmethod - def _from_data(cls, data, index=None, columns=None): + def _from_data( + cls, + data: ColumnAccessor, + index: Optional[Index] = None, + columns: Any = None, + ) -> DataFrame: out = cls.__new__(cls) out._data = data if index is None: @@ -7389,52 +7394,14 @@ def equals(self, other): return False return super().equals(other) - def _drop_rows_by_labels( - self, labels: ColumnLike, level: Union[int, str] = None - ) -> "cudf.DataFrame": + def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame": """Delete rows specified by `label` parameter. In `DataFrame`, this can be achieved efficiently by a left-anti join operation labels: a list of labels specifying the rows to drop """ - if isinstance(self._index, cudf.MultiIndex): - if isinstance(level, int): - ilevel = level - else: - ilevel = self._index.names.index(level) - - # 1. Merge Index df and data df along column axis: - # | id | ._index df | original df | - idx_nlv = self._index.nlevels - working_df = self._index._source_data - working_df.columns = [i for i in range(idx_nlv)] - for i, col in enumerate(self.columns): - working_df[idx_nlv + i] = self[col]._column - # 2. Set `level` as common index: - # | level | ._index df w/o level | original df - working_df = working_df.set_index(level) - - # 3. Use "leftanti" join to drop - # TODO: replace with Brandon's suggestion - to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) - join_res = working_df.join(to_join, how="leftanti") - - # 4. Reconstruct original layout, and rename - join_res.insert( - ilevel, name=join_res._index.name, value=join_res._index - ) - join_res = join_res.reset_index(drop=True) - - midx = cudf.MultiIndex.from_frame( - join_res.iloc[:, 0:idx_nlv], names=self._index.names - ) - - dropped = join_res.iloc[:, idx_nlv:] - dropped = dropped.set_index(midx) - dropped.columns = self.columns - else: - dropped = self.join(cudf.DataFrame(index=labels), how="leftanti") + dropped = self.join(cudf.DataFrame(index=labels), how="leftanti") return dropped diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6fa3c476be9..0d995183dc0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3857,6 +3857,7 @@ def _drop_rows_by_labels( if not isinstance(labels, (cudf.Series, cudf.Index)): labels = as_column(labels) + res: DataFrameOrSeries if isinstance(obj._index, cudf.MultiIndex): if level is None: level = 0 @@ -3865,12 +3866,52 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(levels_index).all(): raise KeyError("One or more values not found in axis") - sliced_df = obj._drop_rows_by_labels(labels, level) + if isinstance(level, int): + ilevel = level + else: + ilevel = obj._index.names.index(level) + + # 1. Merge Index df and data df along column axis: + # | id | ._index df | data column(s) | + idx_nlv = obj._index.nlevels + working_df = obj._index._source_data + working_df.columns = [i for i in range(idx_nlv)] + for i, col in enumerate(obj._data): + working_df[idx_nlv + i] = obj._data[col] + # 2. Set `level` as common index: + # | level | ._index df w/o level | data column(s) | + working_df = working_df.set_index(level) + + # 3. Use "leftanti" join to drop + # TODO: replace with Brandon's suggestion + to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + join_res = working_df.join(to_join, how="leftanti") + + # 4. Reconstruct original layout, and rename + join_res.insert( + ilevel, name=join_res._index.name, value=join_res._index + ) + join_res = join_res.reset_index(drop=True) + + midx = cudf.MultiIndex.from_frame( + join_res.iloc[:, 0:idx_nlv], names=obj._index.names + ) + + if isinstance(obj, cudf.Series): + res = obj.__class__._from_data( + join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name + ) + else: + res = obj.__class__._from_data( + join_res.iloc[:, idx_nlv:]._data, + index=midx, + columns=obj.columns, + ) else: if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") - sliced_df = obj._drop_rows_by_labels(labels) + res = obj._drop_rows_by_labels(labels) - return sliced_df + return res diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 803bc92e2bc..ebf7bb1c0ee 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,7 +6,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional, Set, Union +from typing import Any, Optional, Set from uuid import uuid4 import cupy @@ -4626,53 +4626,17 @@ def keys(self): """ return self.index - def _drop_rows_by_labels( - self, labels: ColumnLike, level: Union[int, __builtins__.str] = None - ) -> "cudf.Series": + def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.Series": """Delete rows specified by `label` parameter. Resort to the efficient implementation in `cudf.DataFrame` labels: a list of labels specifying the rows to drop """ - if isinstance(self._index, cudf.MultiIndex): - if isinstance(level, int): - ilevel = level - else: - ilevel = self._index.names.index(level) - - # 1. Merge Index df and data column along column axis: - # | id | ._index df | data column | - idx_nlv = self._index.nlevels - working_df = self._index._source_data - working_df.columns = [i for i in range(idx_nlv)] - working_df[idx_nlv] = self._column - # 2. Set `level` as common index: - # | level | ._index df w/o level | data column - working_df = working_df.set_index(ilevel) - - # 3. Use "leftanti" join to drop - # TODO: replace with Brandon's suggestion - to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) - join_res = working_df.join(to_join, how="leftanti") - - # 4. Reconstruct original layout, and rename - join_res.insert( - ilevel, name=join_res._index.name, value=join_res._index - ) - join_res = join_res.reset_index(drop=True) - - midx = cudf.MultiIndex.from_frame( - join_res.iloc[:, 0:idx_nlv], names=self._index.names - ) - - dropped = join_res[idx_nlv] - dropped = dropped.set_index(midx) - else: - df = self.to_frame(name="tmp") - dropped = df._drop_rows_by_labels(labels)["tmp"] - + df = self.to_frame(name="tmp") + dropped = df._drop_rows_by_labels(labels)["tmp"] dropped.name = self.name + return dropped _accessors = set() # type: Set[Any] From bef984a0ebd60b676f8ce11df3090712b172d5f2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Feb 2021 15:31:06 -0800 Subject: [PATCH 15/18] More tests, update TODO --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ python/cudf/cudf/tests/test_series.py | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0d995183dc0..060ae5a2d92 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3883,7 +3883,7 @@ def _drop_rows_by_labels( working_df = working_df.set_index(level) # 3. Use "leftanti" join to drop - # TODO: replace with Brandon's suggestion + # TODO: update to internal join API to bypass logic checks to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) join_res = working_df.join(to_join, how="leftanti") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 39d2a980e87..5d61f5e4520 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -517,6 +517,15 @@ def test_dataframe_drop_raises(): expected_error_message="One or more values not found in axis", ) + # label dtype mismatch + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + expected_error_message="One or more values not found in axis", + ) + expect = pdf.drop("p", errors="ignore") actual = df.drop("p", errors="ignore") diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c2bb7da1f49..484b1084cd7 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1099,6 +1099,15 @@ def test_series_drop_raises(): expected_error_message="One or more values not found in axis", ) + # dtype specified mismatch + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + expected_error_message="One or more values not found in axis", + ) + expect = ps.drop("p", errors="ignore") actual = gs.drop("p", errors="ignore") From 3145f2b922c8c9838b2ec82729e3ad15986d6ad7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 12 Feb 2021 14:11:13 -0800 Subject: [PATCH 16/18] Add todo notes --- python/cudf/cudf/core/dataframe.py | 2 ++ python/cudf/cudf/core/frame.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ed47027afd4..05736e896aa 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7401,6 +7401,8 @@ def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame": labels: a list of labels specifying the rows to drop """ + # TODO: use internal API with "leftanti" and specify left and right + # join keys to bypass logic check dropped = self.join(cudf.DataFrame(index=labels), how="leftanti") return dropped diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 060ae5a2d92..673bac14562 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3883,7 +3883,8 @@ def _drop_rows_by_labels( working_df = working_df.set_index(level) # 3. Use "leftanti" join to drop - # TODO: update to internal join API to bypass logic checks + # TODO: use internal API with "leftanti" and specify left and right + # join keys to bypass logic check to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) join_res = working_df.join(to_join, how="leftanti") From 1c2e8d0a8f74c2022730c72dfcc0ce090ad8571f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Feb 2021 13:34:10 -0800 Subject: [PATCH 17/18] Consolidated code logic --- python/cudf/cudf/core/dataframe.py | 14 -------------- python/cudf/cudf/core/frame.py | 7 ++++++- python/cudf/cudf/core/series.py | 14 -------------- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8c1c35742e6..fedb0e1e6d8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,7 +26,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.null_mask import MaskState, create_null_mask -from cudf._typing import ColumnLike from cudf.core import column, reshape from cudf.core.abc import Serializable from cudf.core.column import as_column, column_empty @@ -7385,19 +7384,6 @@ def equals(self, other): return False return super().equals(other) - def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame": - """Delete rows specified by `label` parameter. In `DataFrame`, this can - be achieved efficiently by a left-anti join operation - - labels: a list of labels specifying the rows to drop - """ - - # TODO: use internal API with "leftanti" and specify left and right - # join keys to bypass logic check - dropped = self.join(cudf.DataFrame(index=labels), how="leftanti") - - return dropped - _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 378e04b4b86..dc7f8733347 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3925,6 +3925,11 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") - res = obj._drop_rows_by_labels(labels) + key_df = cudf.DataFrame(index=labels) + if isinstance(obj, cudf.Series): + res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] + res.name = obj.name + else: + res = obj.join(key_df, how="leftanti") return res diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ebf7bb1c0ee..809367e104c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask -from cudf._typing import ColumnLike from cudf.core.abc import Serializable from cudf.core.column import ( ColumnBase, @@ -4626,19 +4625,6 @@ def keys(self): """ return self.index - def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.Series": - """Delete rows specified by `label` parameter. Resort to the efficient - implementation in `cudf.DataFrame` - - labels: a list of labels specifying the rows to drop - """ - - df = self.to_frame(name="tmp") - dropped = df._drop_rows_by_labels(labels)["tmp"] - dropped.name = self.name - - return dropped - _accessors = set() # type: Set[Any] From 2b893fa1b393768d66e1812f6735625c3c6b27bb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Feb 2021 13:42:53 -0800 Subject: [PATCH 18/18] removed typed `res` var --- python/cudf/cudf/core/frame.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index dc7f8733347..592d4741948 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3869,7 +3869,6 @@ def _drop_rows_by_labels( if not isinstance(labels, (cudf.Series, cudf.Index)): labels = as_column(labels) - res: DataFrameOrSeries if isinstance(obj._index, cudf.MultiIndex): if level is None: level = 0 @@ -3911,11 +3910,11 @@ def _drop_rows_by_labels( ) if isinstance(obj, cudf.Series): - res = obj.__class__._from_data( + return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name ) else: - res = obj.__class__._from_data( + return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, columns=obj.columns, @@ -3929,7 +3928,6 @@ def _drop_rows_by_labels( if isinstance(obj, cudf.Series): res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] res.name = obj.name + return res else: - res = obj.join(key_df, how="leftanti") - - return res + return obj.join(key_df, how="leftanti")