Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements Series.filter #1511

Merged
merged 5 commits into from
May 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 0 additions & 139 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8870,145 +8870,6 @@ def rank(self, method="average", ascending=True):
"""
return self._apply_series_op(lambda kser: kser.rank(method=method, ascending=ascending))

def filter(self, items=None, like=None, regex=None, axis=None):
    """
    Select rows or columns of the dataframe by their labels.

    Only the labels of the chosen axis are inspected; the values stored
    in the dataframe never take part in the selection.

    Parameters
    ----------
    items : list-like
        Labels to keep on the chosen axis.
    like : string
        Keep the labels for which "like in label == True".
    regex : string (regular expression)
        Keep the labels for which re.search(regex, label) == True.
    axis : int or string axis name
        The axis whose labels are filtered. When omitted, the info axis
        is used: 'index' for Series, 'columns' for DataFrame.

    Returns
    -------
    same type as input object

    See Also
    --------
    DataFrame.loc

    Notes
    -----
    ``items``, ``like`` and ``regex`` are mutually exclusive; exactly one
    of them has to be supplied.

    ``axis`` defaults to the info axis that is used when indexing
    with ``[]``.

    Examples
    --------
    >>> df = ks.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
    ...                   index=['mouse', 'rabbit'],
    ...                   columns=['one', 'two', 'three'])

    >>> # select columns by name
    >>> df.filter(items=['one', 'three'])
            one  three
    mouse     1      3
    rabbit    4      6

    >>> # select columns by regular expression
    >>> df.filter(regex='e$', axis=1)
            one  three
    mouse     1      3
    rabbit    4      6

    >>> # select rows containing 'bbi'
    >>> df.filter(like='bbi', axis=0)
            one  two  three
    rabbit    4    5      6
    """
    supplied = [arg for arg in (items, like, regex) if arg is not None]
    if len(supplied) > 1:
        raise TypeError("Keyword arguments `items`, `like`, or `regex` are mutually exclusive")

    axis = validate_axis(axis, none_axis=1)
    index_columns = self._internal.index_spark_columns

    def either(conditions):
        # OR the conditions together from left to right; None when there are none.
        merged = None
        for condition in conditions:
            merged = condition if merged is None else merged | condition
        return merged

    def match_key(key):
        # AND together one equality test per element of a multi-index key.
        if not isinstance(key, tuple):
            raise TypeError("Unsupported type {}".format(type(key)))
        if not key:
            raise ValueError("The item should not be empty.")
        level_match = None
        for position, part in enumerate(key):
            test = index_columns[position] == F.lit(part)
            level_match = test if level_match is None else level_match & test
        return level_match

    if items is not None:
        if not is_list_like(items):
            raise ValueError("items should be a list-like object.")
        items = list(items)
        if axis == 0:
            level_count = len(index_columns)
            if level_count == 1:
                only_level = index_columns[0]
                row_mask = either(only_level == F.lit(label) for label in items)
            elif level_count > 1:
                # Keys are validated lazily, in order, while the mask is built.
                row_mask = either(match_key(key) for key in items)
            else:
                raise ValueError("Single or multi index must be specified.")
            return DataFrame(self._internal.with_filter(row_mask))
        elif axis == 1:
            return self[items]
    elif like is not None:
        if axis == 0:
            row_mask = either(level.contains(like) for level in index_columns)
            return DataFrame(self._internal.with_filter(row_mask))
        elif axis == 1:
            kept_labels = [
                label
                for label in self._internal.column_labels
                if any(like in part for part in label)
            ]
            return self[kept_labels]
    elif regex is not None:
        if axis == 0:
            row_mask = either(level.rlike(regex) for level in index_columns)
            return DataFrame(self._internal.with_filter(row_mask))
        elif axis == 1:
            pattern = re.compile(regex)
            kept_labels = [
                label
                for label in self._internal.column_labels
                if any(pattern.search(part) is not None for part in label)
            ]
            return self[kept_labels]
    else:
        raise TypeError("Must pass either `items`, `like`, or `regex`")

def rename(
self,
mapper=None,
Expand Down
147 changes: 147 additions & 0 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""
A base class to be monkey-patched to DataFrame/Column to behave similar to pandas DataFrame/Series.
"""
import re
import warnings
from collections import Counter
from collections.abc import Iterable
Expand All @@ -26,6 +27,7 @@

import numpy as np
import pandas as pd
from pandas.api.types import is_list_like

from pyspark import sql as spark
from pyspark.sql import functions as F
Expand Down Expand Up @@ -1634,6 +1636,151 @@ def first_valid_index(self):

return first_valid_idx

def filter(self, items=None, like=None, regex=None, axis=None):
    """
    Subset rows or columns of dataframe according to labels in
    the specified index.

    Note that this routine does not filter a dataframe on its
    contents. The filter is applied to the labels of the index.

    Parameters
    ----------
    items : list-like
        Keep labels from axis which are in items.
    like : string
        Keep labels from axis for which "like in label == True".
    regex : string (regular expression)
        Keep labels from axis for which re.search(regex, label) == True.
    axis : int or string axis name
        The axis to filter on. By default this is the info axis,
        'index' for Series, 'columns' for DataFrame.

    Returns
    -------
    same type as input object

    Raises
    ------
    TypeError
        If more than one of ``items``, ``like`` and ``regex`` is given,
        if none of them is given, or if an element of ``items`` is not a
        tuple while filtering the rows of a multi-index.
    ValueError
        If ``items`` is not list-like, if an element of ``items`` is an
        empty tuple while filtering the rows of a multi-index, or if the
        columns axis is requested for a Series.

    See Also
    --------
    DataFrame.loc

    Notes
    -----
    The ``items``, ``like``, and ``regex`` parameters are
    enforced to be mutually exclusive.

    ``axis`` defaults to the info axis that is used when indexing
    with ``[]``.

    Examples
    --------
    >>> df = ks.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
    ...                   index=['mouse', 'rabbit'],
    ...                   columns=['one', 'two', 'three'])

    >>> # select columns by name
    >>> df.filter(items=['one', 'three'])
            one  three
    mouse     1      3
    rabbit    4      6

    >>> # select columns by regular expression
    >>> df.filter(regex='e$', axis=1)
            one  three
    mouse     1      3
    rabbit    4      6

    >>> # select rows containing 'bbi'
    >>> df.filter(like='bbi', axis=0)
            one  two  three
    rabbit    4    5      6
    """
    from databricks.koalas.series import first_series

    if sum(x is not None for x in (items, like, regex)) > 1:
        raise TypeError("Keyword arguments `items`, `like`, or `regex` are mutually exclusive")

    def any_of(conditions):
        # OR the Spark conditions together; None when there is nothing to combine.
        combined = None
        for condition in conditions:
            combined = condition if combined is None else combined | condition
        return combined

    is_series = isinstance(self, ks.Series)

    if is_series:
        # A Series is filtered through its DataFrame form and converted back
        # with `first_series` at the end; it only has a row axis.
        kdf = self.to_frame()
        axis = validate_axis(axis)
        if axis == 1:
            raise ValueError("Series does not support columns axis.")
    else:
        kdf = self
        # A DataFrame defaults to the columns axis when `axis` is None.
        axis = validate_axis(axis, none_axis=1)
    assert isinstance(kdf, ks.DataFrame)

    # Every row predicate below is built from, and applied to, `kdf` so that the
    # Series and DataFrame paths go through the same internal frame.
    index_scols = kdf._internal.index_spark_columns

    if items is not None:
        if not is_list_like(items):
            raise ValueError("items should be a list-like object.")
        items = list(items)
        if axis == 0:
            if len(index_scols) == 1:
                col = any_of(index_scols[0] == F.lit(item) for item in items)
            elif len(index_scols) > 1:
                # for multi-index: each item is a tuple compared level by level,
                # so a tuple shorter than the index only constrains the leading levels.
                midx_cols = []
                for item in items:
                    if not isinstance(item, tuple):
                        raise TypeError("Unsupported type {}".format(type(item)))
                    if not item:
                        raise ValueError("The item should not be empty.")
                    midx_col = None
                    for i, element in enumerate(item):
                        level_cond = index_scols[i] == F.lit(element)
                        midx_col = level_cond if midx_col is None else midx_col & level_cond
                    midx_cols.append(midx_col)
                col = any_of(midx_cols)
            else:
                raise ValueError("Single or multi index must be specified.")
            if col is None:
                # An empty `items` matches no label: return an empty result instead
                # of handing `None` to `with_filter`.
                col = F.lit(False)
            filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
        elif axis == 1:
            filtered_df = kdf[items]
    elif like is not None:
        if axis == 0:
            # A row is kept when any index level contains the substring.
            col = any_of(index_scol.contains(like) for index_scol in index_scols)
            filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
        elif axis == 1:
            column_labels = kdf._internal.column_labels
            output_labels = [label for label in column_labels if any(like in i for i in label)]
            filtered_df = kdf[output_labels]
    elif regex is not None:
        if axis == 0:
            # A row is kept when any index level matches the pattern.
            col = any_of(index_scol.rlike(regex) for index_scol in index_scols)
            filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
        elif axis == 1:
            column_labels = kdf._internal.column_labels
            matcher = re.compile(regex)
            output_labels = [
                label
                for label in column_labels
                if any(matcher.search(i) is not None for i in label)
            ]
            filtered_df = kdf[output_labels]
    else:
        raise TypeError("Must pass either `items`, `like`, or `regex`")

    if is_series:
        return first_series(filtered_df)
    return filtered_df

def median(self, axis=None, numeric_only=True, accuracy=10000):
"""
Return the median of the values for the requested axis.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ class MissingPandasLikeSeries(object):
ewm = _unsupported_function("ewm")
factorize = _unsupported_function("factorize")
ffill = _unsupported_function("ffill")
filter = _unsupported_function("filter")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
Expand Down
27 changes: 27 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1631,3 +1631,30 @@ def test_unstack(self):
def test_item(self):
    # `item()` is expected to fail for a Series holding more than one element.
    two_values = ks.Series([10, 20])
    with self.assertRaises(ValueError):
        two_values.item()

def test_filter(self):
    # Single-level index: each keyword must give the same result as pandas.
    kser = ks.Series([0, 1, 2], index=["one", "two", "three"])
    pser = kser.to_pandas()

    for kwargs in ({"items": ["one", "three"]}, {"regex": "e$"}, {"like": "hre"}):
        self.assert_eq(pser.filter(**kwargs), kser.filter(**kwargs))

    # A Series has no columns axis to filter on.
    with self.assertRaisesRegex(ValueError, "Series does not support columns axis."):
        kser.filter(like="hre", axis=1)

    # for MultiIndex
    midx = pd.MultiIndex.from_tuples([("one", "x"), ("two", "y"), ("three", "z")])
    kser = ks.Series([0, 1, 2], index=midx)
    pser = kser.to_pandas()

    wanted = [("one", "x"), ("three", "z")]
    self.assert_eq(pser.filter(items=wanted), kser.filter(items=wanted))

    # Multi-index items must be tuples ...
    with self.assertRaisesRegex(TypeError, "Unsupported type <class 'list'>"):
        kser.filter(items=[["one", "x"], ("three", "z")])

    # ... and must not be empty.
    with self.assertRaisesRegex(ValueError, "The item should not be empty."):
        kser.filter(items=[(), ("three", "z")])
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ Computations / Descriptive Stats
Series.cumsum
Series.cumprod
Series.describe
Series.filter
Series.kurt
Series.mad
Series.max
Expand Down