Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use list of column inputs for apply_boolean_mask #9832

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 5 additions & 13 deletions python/cudf/cudf/_lib/stream_compaction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None):
return columns_from_unique_ptr(move(c_result))


def apply_boolean_mask(source_table, Column boolean_mask):
def apply_boolean_mask(columns: list, Column boolean_mask):
"""
Drops the rows which correspond to False in boolean_mask.

Parameters
----------
source_table : source table whose rows are dropped as per boolean_mask
columns : list of columns whose rows are dropped as per boolean_mask
boolean_mask : a boolean column of the same size as the input columns

Returns
-------
Frame obtained from applying mask
columns obtained from applying mask
"""

assert pd.api.types.is_bool_dtype(boolean_mask.dtype)

cdef unique_ptr[table] c_result
cdef table_view source_table_view = table_view_from_table(source_table)
cdef table_view source_table_view = table_view_from_columns(columns)
cdef column_view boolean_mask_view = boolean_mask.view()

with nogil:
Expand All @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask):
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
None if source_table._index
is None else source_table._index_names)
)
return columns_from_unique_ptr(move(c_result))


def drop_duplicates(columns: list,
Expand Down
25 changes: 24 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,15 @@
import pandas as pd

import cudf
from cudf._lib.stream_compaction import apply_boolean_mask
from cudf._typing import DtypeObj
from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar
from cudf.api.types import (
is_bool_dtype,
is_dtype_equal,
is_integer,
is_list_like,
is_scalar,
)
from cudf.core.abc import Serializable
from cudf.core.column import ColumnBase, column
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None):
def _constructor_expanddim(self):
return cudf.MultiIndex

def _apply_boolean_mask(self, boolean_mask):
"""Apply boolean mask to each row of `self`.

Rows corresponding to `False` are dropped.
"""
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

result = self.__class__._from_columns(
vyasr marked this conversation as resolved.
Show resolved Hide resolved
apply_boolean_mask(list(self._columns), boolean_mask),
column_names=self._column_names,
)
result._copy_type_metadata(self)
return result


def _get_result_name(left_name, right_name):
if left_name == right_name:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import numpy as np

from cudf.core.column import as_column
from cudf.core.frame import Frame
from cudf.core.index import Index, RangeIndex
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.series import Series


Expand Down Expand Up @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None):
if num_nan == 0 or num_nan == len(column):
return column

to_interp = Frame(data={None: column}, index=index)
to_interp = IndexedFrame(data={None: column}, index=index)
known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))

known_x = known_x_and_y._index._column.values
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from cudf._lib.scalar import as_device_scalar
from cudf._lib.stream_compaction import (
apply_boolean_mask,
distinct_count as cpp_distinct_count,
drop_duplicates,
drop_nulls,
Expand Down Expand Up @@ -997,9 +998,12 @@ def as_decimal32_column(
raise NotImplementedError

def apply_boolean_mask(self, mask) -> ColumnBase:
mask = as_column(mask, dtype="bool")
return (
self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column()
mask = as_column(mask)
if not is_bool_dtype(mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

return apply_boolean_mask([self], mask)[0]._with_type_metadata(
self.dtype
)

def argsort(
Expand Down
15 changes: 0 additions & 15 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1494,21 +1494,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):

return self[out_cols]

def _apply_boolean_mask(self, boolean_mask):
"""
Applies boolean mask to each row of `self`;
rows corresponding to `False` are dropped.
"""
boolean_mask = as_column(boolean_mask)

result = self.__class__._from_data(
*libcudf.stream_compaction.apply_boolean_mask(
self, as_column(boolean_mask)
)
)
result._copy_type_metadata(self)
return result

def interpolate(
self,
method="linear",
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from cudf._typing import ColumnLike
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_bool_dtype,
is_categorical_dtype,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -1107,6 +1108,25 @@ def resample(
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _apply_boolean_mask(self, boolean_mask):
"""Apply boolean mask to each row of `self`.

Rows corresponding to `False` are dropped.
"""
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

result = self.__class__._from_columns(
libcudf.stream_compaction.apply_boolean_mask(
list(self._index._columns + self._columns), boolean_mask
),
column_names=self._column_names,
index_names=self._index.names,
)
result._copy_type_metadata(self)
return result

def _first_or_last(
self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
Expand Down