databricks · HyukjinKwon · May 28, 2020 · May 18, 2020 · May 28, 2020 · May 28, 2020
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -8870,145 +8870,6 @@ def rank(self, method="average", ascending=True):
         """
         return self._apply_series_op(lambda kser: kser.rank(method=method, ascending=ascending))
 
-    def filter(self, items=None, like=None, regex=None, axis=None):
-        """
-        Subset rows or columns of dataframe according to labels in
-        the specified index.
-
-        Note that this routine does not filter a dataframe on its
-        contents. The filter is applied to the labels of the index.
-
-        Parameters
-        ----------
-        items : list-like
-            Keep labels from axis which are in items.
-        like : string
-            Keep labels from axis for which "like in label == True".
-        regex : string (regular expression)
-            Keep labels from axis for which re.search(regex, label) == True.
-        axis : int or string axis name
-            The axis to filter on.  By default this is the info axis,
-            'index' for Series, 'columns' for DataFrame.
-
-        Returns
-        -------
-        same type as input object
-
-        See Also
-        --------
-        DataFrame.loc
-
-        Notes
-        -----
-        The ``items``, ``like``, and ``regex`` parameters are
-        enforced to be mutually exclusive.
-
-        ``axis`` defaults to the info axis that is used when indexing
-        with ``[]``.
-
-        Examples
-        --------
-        >>> df = ks.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
-        ...                   index=['mouse', 'rabbit'],
-        ...                   columns=['one', 'two', 'three'])
-
-        >>> # select columns by name
-        >>> df.filter(items=['one', 'three'])
-                one  three
-        mouse     1      3
-        rabbit    4      6
-
-        >>> # select columns by regular expression
-        >>> df.filter(regex='e$', axis=1)
-                one  three
-        mouse     1      3
-        rabbit    4      6
-
-        >>> # select rows containing 'bbi'
-        >>> df.filter(like='bbi', axis=0)
-                one  two  three
-        rabbit    4    5      6
-        """
-
-        if sum(x is not None for x in (items, like, regex)) > 1:
-            raise TypeError(
-                "Keyword arguments `items`, `like`, or `regex` " "are mutually exclusive"
-            )
-
-        axis = validate_axis(axis, none_axis=1)
-
-        index_scols = self._internal.index_spark_columns
-
-        if items is not None:
-            if is_list_like(items):
-                items = list(items)
-            else:
-                raise ValueError("items should be a list-like object.")
-            if axis == 0:
-                if len(index_scols) == 1:
-                    col = None
-                    for item in items:
-                        if col is None:
-                            col = index_scols[0] == F.lit(item)
-                        else:
-                            col = col | (index_scols[0] == F.lit(item))
-                elif len(index_scols) > 1:
-                    # for multi-index
-                    col = None
-                    for item in items:
-                        if not isinstance(item, (tuple)):
-                            raise TypeError("Unsupported type {}".format(type(item)))
-                        if not item:
-                            raise ValueError("The item should not be empty.")
-                        midx_col = None
-                        for i, element in enumerate(item):
-                            if midx_col is None:
-                                midx_col = index_scols[i] == F.lit(element)
-                            else:
-                                midx_col = midx_col & (index_scols[i] == F.lit(element))
-                        if col is None:
-                            col = midx_col
-                        else:
-                            col = col | midx_col
-                else:
-                    raise ValueError("Single or multi index must be specified.")
-                return DataFrame(self._internal.with_filter(col))
-            elif axis == 1:
-                return self[items]
-        elif like is not None:
-            if axis == 0:
-                col = None
-                for index_scol in index_scols:
-                    if col is None:
-                        col = index_scol.contains(like)
-                    else:
-                        col = col | index_scol.contains(like)
-                return DataFrame(self._internal.with_filter(col))
-            elif axis == 1:
-                column_labels = self._internal.column_labels
-                output_labels = [label for label in column_labels if any(like in i for i in label)]
-                return self[output_labels]
-        elif regex is not None:
-            if axis == 0:
-                col = None
-                for index_scol in index_scols:
-                    if col is None:
-                        col = index_scol.rlike(regex)
-                    else:
-                        col = col | index_scol.rlike(regex)
-                return DataFrame(self._internal.with_filter(col))
-            elif axis == 1:
-                column_labels = self._internal.column_labels
-                matcher = re.compile(regex)
-                output_labels = [
-                    label
-                    for label in column_labels
-                    if any(matcher.search(i) is not None for i in label)
-                ]
-                return self[output_labels]
-        else:
-            raise TypeError("Must pass either `items`, `like`, or `regex`")
-
     def rename(
         self,
         mapper=None,

diff --git a/databricks/koalas/generic.py b/databricks/koalas/generic.py
@@ -17,6 +17,7 @@
 """
 A base class to be monkey-patched to DataFrame/Column to behave similar to pandas DataFrame/Series.
 """
+import re
 import warnings
 from collections import Counter
 from collections.abc import Iterable
@@ -26,6 +27,7 @@
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_list_like
 
 from pyspark import sql as spark
 from pyspark.sql import functions as F
@@ -1634,6 +1636,151 @@ def first_valid_index(self):
 
         return first_valid_idx
 
+    def filter(self, items=None, like=None, regex=None, axis=None):
+        """
+        Subset rows or columns of dataframe according to labels in
+        the specified index.
+
+        Note that this routine does not filter a dataframe on its
+        contents. The filter is applied to the labels of the index.
+
+        Parameters
+        ----------
+        items : list-like
+            Keep labels from axis which are in items.
+        like : string
+            Keep labels from axis for which "like in label == True".
+        regex : string (regular expression)
+            Keep labels from axis for which re.search(regex, label) == True.
+        axis : int or string axis name
+            The axis to filter on.  By default this is the info axis,
+            'index' for Series, 'columns' for DataFrame.
+
+        Returns
+        -------
+        same type as input object
+
+        Raises
+        ------
+        TypeError
+            If none or more than one of ``items``, ``like`` and ``regex`` is passed.
+        ValueError
+            If ``items`` is not list-like, or a Series is filtered on the columns axis.
+
+        See Also
+        --------
+        DataFrame.loc
+
+        Notes
+        -----
+        The ``items``, ``like``, and ``regex`` parameters are
+        enforced to be mutually exclusive.
+
+        ``axis`` defaults to the info axis that is used when indexing
+        with ``[]``.
+
+        Examples
+        --------
+        >>> df = ks.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
+        ...                   index=['mouse', 'rabbit'],
+        ...                   columns=['one', 'two', 'three'])
+
+        >>> # select columns by name
+        >>> df.filter(items=['one', 'three'])
+                one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select columns by regular expression
+        >>> df.filter(regex='e$', axis=1)
+                one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select rows containing 'bbi'
+        >>> df.filter(like='bbi', axis=0)
+                one  two  three
+        rabbit    4    5      6
+        """
+        from databricks.koalas.series import first_series
+
+        if sum(x is not None for x in (items, like, regex)) > 1:
+            raise TypeError("Keyword arguments `items`, `like`, or `regex` are mutually exclusive")
+
+        is_series = isinstance(self, ks.Series)
+        # A Series is filtered via ``self.to_frame()``; everything below operates on ``kdf``.
+        kdf = self
+        if is_series:
+            kdf = self.to_frame()
+            axis = validate_axis(axis)
+            if axis == 1:
+                raise ValueError("Series does not support columns axis.")
+        else:
+            axis = validate_axis(axis, none_axis=1)
+        assert isinstance(kdf, ks.DataFrame)
+
+        index_scols = kdf._internal.index_spark_columns
+
+        if items is not None:
+            if not is_list_like(items):
+                raise ValueError("items should be a list-like object.")
+            items = list(items)
+            if axis == 0:
+                if len(index_scols) == 1:
+                    col = None
+                    for item in items:
+                        cond = index_scols[0] == F.lit(item)
+                        col = cond if col is None else col | cond
+                elif len(index_scols) > 1:
+                    # for multi-index
+                    col = None
+                    for item in items:
+                        if not isinstance(item, (tuple)):
+                            raise TypeError("Unsupported type {}".format(type(item)))
+                        if not item:
+                            raise ValueError("The item should not be empty.")
+                        midx_col = None
+                        for i, element in enumerate(item):
+                            cond = index_scols[i] == F.lit(element)
+                            midx_col = cond if midx_col is None else midx_col & cond
+                        col = midx_col if col is None else col | midx_col
+                else:
+                    raise ValueError("Single or multi index must be specified.")
+                filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
+            elif axis == 1:
+                filtered_df = kdf[items]
+        elif like is not None:
+            if axis == 0:
+                col = None
+                for index_scol in index_scols:
+                    cond = index_scol.contains(like)
+                    col = cond if col is None else col | cond
+                filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
+            elif axis == 1:
+                column_labels = kdf._internal.column_labels
+                output_labels = [label for label in column_labels if any(like in i for i in label)]
+                filtered_df = kdf[output_labels]
+        elif regex is not None:
+            if axis == 0:
+                col = None
+                for index_scol in index_scols:
+                    cond = index_scol.rlike(regex)
+                    col = cond if col is None else col | cond
+                filtered_df = ks.DataFrame(kdf._internal.with_filter(col))
+            elif axis == 1:
+                column_labels = kdf._internal.column_labels
+                matcher = re.compile(regex)
+                output_labels = [
+                    label
+                    for label in column_labels
+                    if any(matcher.search(i) is not None for i in label)
+                ]
+                filtered_df = kdf[output_labels]
+        else:
+            raise TypeError("Must pass either `items`, `like`, or `regex`")
+
+        return first_series(filtered_df) if is_series else filtered_df
+
     def median(self, axis=None, numeric_only=True, accuracy=10000):
         """
         Return the median of the values for the requested axis.

diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -48,7 +48,6 @@ class MissingPandasLikeSeries(object):
     ewm = _unsupported_function("ewm")
     factorize = _unsupported_function("factorize")
     ffill = _unsupported_function("ffill")
-    filter = _unsupported_function("filter")
     first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -1631,3 +1631,30 @@ def test_unstack(self):
     def test_item(self):
         kser = ks.Series([10, 20])
         self.assertRaises(ValueError, lambda: kser.item())
+
+    def test_filter(self):
+        """Check ``Series.filter`` against pandas and its argument validation."""
+        kser = ks.Series([0, 1, 2], index=["one", "two", "three"])
+        pser = kser.to_pandas()
+
+        # Label selection by explicit items, by regex, and by substring.
+        for kwargs in ({"items": ["one", "three"]}, {"regex": "e$"}, {"like": "hre"}):
+            self.assert_eq(pser.filter(**kwargs), kser.filter(**kwargs))
+
+        # Filtering on the columns axis must be rejected for a Series.
+        with self.assertRaisesRegex(ValueError, "Series does not support columns axis."):
+            kser.filter(like="hre", axis=1)
+
+        # MultiIndex: items are passed as tuples of level values.
+        midx = pd.MultiIndex.from_tuples([("one", "x"), ("two", "y"), ("three", "z")])
+        kser = ks.Series([0, 1, 2], index=midx)
+        pser = kser.to_pandas()
+        selected = [("one", "x"), ("three", "z")]
+        self.assert_eq(pser.filter(items=selected), kser.filter(items=selected))
+
+        # A non-tuple item is a type error; an empty tuple is a value error.
+        with self.assertRaisesRegex(TypeError, "Unsupported type <class 'list'>"):
+            kser.filter(items=[["one", "x"], ("three", "z")])
+
+        with self.assertRaisesRegex(ValueError, "The item should not be empty."):
+            kser.filter(items=[(), ("three", "z")])
diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -123,6 +123,7 @@ Computations / Descriptive Stats
    Series.cumsum
    Series.cumprod
    Series.describe
+   Series.filter
    Series.kurt
    Series.mad
    Series.max