rapidsai · rapids-bot · Mar 10, 2021 · Feb 3, 2021 · Feb 4, 2021 · Feb 4, 2021
@@ -1,6 +1,6 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
-from __future__ import division
+from __future__ import annotations, division
 
 import inspect
 import itertools
@@ -10,7 +10,7 @@
 import warnings
 from collections import OrderedDict, defaultdict
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Set, TypeVar
+from typing import Any, Set, TypeVar, Union
 
 import cupy
 import numpy as np
@@ -26,11 +26,12 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.null_mask import MaskState, create_null_mask
+from cudf._typing import ColumnLike
 from cudf.core import column, reshape
 from cudf.core.abc import Serializable
 from cudf.core.column import as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
+from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import DataFrameGroupBy
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
@@ -3315,46 +3316,26 @@ def drop(
             )
 
         if inplace:
-            outdf = self
+            out = self
         else:
-            outdf = self.copy()
+            out = self.copy()
 
         if axis in (1, "columns"):
             target = _get_host_unique(target)
 
-            _drop_columns(outdf, target, errors)
+            _drop_columns(out, target, errors)
         elif axis in (0, "index"):
-            if not isinstance(target, (cudf.Series, cudf.Index)):
-                target = column.as_column(target)
-
-            if isinstance(self._index, cudf.MultiIndex):
-                if level is None:
-                    level = 0
-
-                levels_index = outdf.index.get_level_values(level)
-                if errors == "raise" and not target.isin(levels_index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                # TODO : Could use anti-join as a future optimization
-                sliced_df = outdf.take(~levels_index.isin(target))
-                sliced_df._index.names = self._index.names
-            else:
-                if errors == "raise" and not target.isin(outdf.index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                sliced_df = outdf.join(
-                    cudf.DataFrame(index=target), how="leftanti"
-                )
+            dropped = _drop_rows_by_labels(out, target, level, errors)
 
             if columns is not None:
                 columns = _get_host_unique(columns)
-                _drop_columns(sliced_df, columns, errors)
+                _drop_columns(dropped, columns, errors)
 
-            outdf._data = sliced_df._data
-            outdf._index = sliced_df._index
+            out._data = dropped._data
+            out._index = dropped._index
 
         if not inplace:
-            return outdf
+            return out
 
     def _drop_column(self, name):
         """Drop a column by *name*
@@ -7408,6 +7389,48 @@ def equals(self, other):
                 return False
         return super().equals(other)
 
+    def _drop_rows_by_labels(
+        self, labels: ColumnLike, level: Union[int, str] = None
+    ) -> "cudf.DataFrame":
+        """Delete rows specified by the `labels` parameter. In `DataFrame`,
+        this can be achieved efficiently by a left-anti join operation.
+
+        labels: a list of labels specifying the rows to drop
+        """
+
+        if isinstance(self._index, cudf.MultiIndex):
+            if isinstance(level, int):
+                ilevel = level
+            else:
+                ilevel = self._index.names.index(level)
+
+            idx_nlv = self._index.nlevels
+            working_df = self._index._source_data
+            # TODO: figure out what __unique__ should be
+            for col in self.columns:
+                working_df["__unique__" + str(col)] = self[col]._column
+            working_df = working_df.set_index(level)
+
+            # TODO: replace with Brandon's suggestion
+            to_join = cudf.DataFrame(index=cudf.Index(labels, name=level))
+            join_res = working_df.join(to_join, how="leftanti")
+            join_res.insert(
+                ilevel, name=join_res._index.name, value=join_res._index
+            )
+            join_res = join_res.reset_index(drop=True)
+
+            midx = cudf.MultiIndex.from_frame(
+                join_res.iloc[:, 0:idx_nlv], names=self._index.names
+            )
+
+            dropped = join_res.iloc[:, idx_nlv:]
+            dropped = dropped.set_index(midx)
+            dropped.columns = self.columns
+        else:
+            dropped = self.join(cudf.DataFrame(index=labels), how="leftanti")
+
+        return dropped
+
     _accessors = set()  # type: Set[Any]
 
 
@@ -7654,17 +7677,6 @@ def _get_union_of_series_names(series_list):
     return names_list
 
 
-def _drop_columns(df, columns, errors):
-    for c in columns:
-        try:
-            df._drop_column(c)
-        except KeyError as e:
-            if errors == "ignore":
-                pass
-            else:
-                raise e
-
-
 def _get_host_unique(array):
     if isinstance(
         array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
@@ -7674,3 +7686,14 @@ def _get_host_unique(array):
         return [array]
     else:
         return set(array)
+
+
+def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
+    for c in columns:
+        try:
+            df._drop_column(c)
+        except KeyError as e:
+            if errors == "ignore":
+                pass
+            else:
+                raise e
@@ -6,7 +6,7 @@
 import operator
 import warnings
 from collections import OrderedDict, abc as abc
-from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload
 
 import cupy
 import numpy as np
@@ -18,6 +18,7 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._typing import ColumnLike, DataFrameOrSeries
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils.dtypes import (
     is_categorical_dtype,
@@ -27,7 +28,6 @@
     min_scalar_type,
 )
 
-
 T = TypeVar("T", bound="Frame")
 
 if TYPE_CHECKING:
@@ -3838,3 +3838,39 @@ def _is_series(obj):
     instead of checking for isinstance(obj, cudf.Series)
     """
     return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None
+
+
+def _drop_rows_by_labels(
+    obj: DataFrameOrSeries,
+    labels: Union[ColumnLike, abc.Iterable, str],
+    level: Union[int, str],
+    errors: str,
+) -> DataFrameOrSeries:
+    """Remove rows specified by `labels`. If `errors="raise"`, a `KeyError`
+    is raised if some items in `labels` do not exist in `obj._index`.
+
+    Raises `ValueError` if an integer `level` is >= the index's `nlevels`.
+    """
+    if isinstance(level, int) and level >= obj.index.nlevels:
+        raise ValueError("Param level out of bounds.")
+
+    if not isinstance(labels, (cudf.Series, cudf.Index)):
+        labels = as_column(labels)
+
+    if isinstance(obj._index, cudf.MultiIndex):
+        if level is None:
+            level = 0
+
+        levels_index = obj.index.get_level_values(level)
+        if errors == "raise" and not labels.isin(levels_index).all():
+            raise KeyError("One or more values not found in axis")
+
+        sliced_df = obj._drop_rows_by_labels(labels, level)
+
+    else:
+        if errors == "raise" and not labels.isin(obj.index).all():
+            raise KeyError("One or more values not found in axis")
+
+        sliced_df = obj._drop_rows_by_labels(labels)
+
+    return sliced_df