databricks · HyukjinKwon · Nov 7, 2019 · Oct 31, 2019 · Oct 31, 2019 · Nov 1, 2019
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -7451,6 +7451,127 @@ def keys(self):
         """
         return self.columns
 
+    # TODO: fix parameter 'axis' and 'numeric_only' to work same as pandas'
+    def quantile(self, q=0.5, axis=0, numeric_only=True, accuracy=10000):
+        """
+        Return value at the given quantile.
+
+        .. note:: Unlike pandas', the quantile in Koalas is an approximated quantile based upon
+            approximate percentile computation because computing quantile across a large dataset
+            is extremely expensive.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            0 <= q <= 1, the quantile(s) to compute. A scalar may also be an int (0 or 1).
+        axis : int or str, default 0
+            Can only be set to 0 (or 'index') at the moment.
+        numeric_only : bool, default True
+            If False, the quantile of datetime and timedelta data will be computed as well.
+            Can only be set to True at the moment.
+        accuracy : int, optional
+            Default accuracy of approximation. Larger value means better accuracy.
+            The relative error can be deduced by 1.0 / accuracy.
+
+        Returns
+        -------
+        Series or DataFrame
+            If q is an array, a DataFrame will be returned where the
+            index is q, the columns are the columns of self, and the values are the quantiles.
+            If q is a float, a Series will be returned where the
+            index is the columns of self and the values are the quantiles.
+
+        Raises
+        ------
+        ValueError
+            If axis or numeric_only is unsupported, or if a quantile is outside [0, 1].
+
+        Examples
+        --------
+        >>> kdf = ks.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]})
+        >>> kdf
+           a  b
+        0  1  6
+        1  2  7
+        2  3  8
+        3  4  9
+        4  5  0
+
+        >>> kdf.quantile(.5)
+        a    3
+        b    7
+        Name: 0.5, dtype: int64
+
+        >>> kdf.quantile([.25, .5, .75])
+              a  b
+        0.25  2  6
+        0.5   3  7
+        0.75  4  8
+        """
+        if axis not in [0, 'index']:
+            raise ValueError('axis should be either 0 or "index" currently.')
+        if numeric_only is not True:
+            raise ValueError("quantile currently doesn't supports numeric_only")
+        # Any real scalar (e.g. ``1`` or ``0.5``) selects the Series result, not only floats.
+        result_as_series = isinstance(q, (int, float)) and not isinstance(q, bool)
+        # Normalize to floats so SQL arguments and index labels agree ("1.0", not "1").
+        quantiles = [float(q)] if result_as_series else [float(v) for v in q]
+        if not all(0 <= v <= 1 for v in quantiles):
+            raise ValueError("quantiles should all be in the interval [0, 1]")
+
+        # First calculate the percentiles from all columns and map it to each `quantiles`
+        # by creating each entry as a struct. So, it becomes an array of structs as below:
+        #
+        # +-----------------------------------------+
+        # |                                   arrays|
+        # +-----------------------------------------+
+        # |[[0.25, 2, 6], [0.5, 3, 7], [0.75, 4, 8]]|
+        # +-----------------------------------------+
+        # NOTE: `column` and `accuracy` are interpolated verbatim into the SQL expression below,
+        # so a column name containing a backtick would break the identifier quoting.
+        args = ", ".join(map(str, quantiles))
+        sdf = self._sdf.select([
+            F.expr("approx_percentile(`%s`, array(%s), %s)" % (column, args, accuracy))
+            .alias(column) for column in self._internal.data_columns])
+        # Here, after selecting the percentile columns, the sdf looks like below:
+        # +---------+---------+
+        # |        a|        b|
+        # +---------+---------+
+        # |[2, 3, 4]|[6, 7, 8]|
+        # +---------+---------+
+
+        cols_dict = OrderedDict()
+        for column in self._internal.data_columns:
+            cols_dict[column] = [scol_for(sdf, column).getItem(i).alias(column)
+                                 for i in range(len(quantiles))]
+
+        internal_index_column = SPARK_INDEX_NAME_FORMAT(0)
+        cols = [F.struct(F.lit("%s" % quantiles[i]).alias(internal_index_column), *col)
+                for i, col in enumerate(zip(*cols_dict.values()))]
+        sdf = sdf.select(F.array(*cols).alias("arrays"))
+
+        # And then, explode it and manually set the index.
+        # +-----------------+---+---+
+        # |__index_level_0__|  a|  b|
+        # +-----------------+---+---+
+        # |             0.25|  2|  6|
+        # |              0.5|  3|  7|
+        # |             0.75|  4|  8|
+        # +-----------------+---+---+
+        sdf = sdf.select(F.explode(F.col("arrays"))).selectExpr("col.*")
+
+        internal = self._internal.copy(
+            sdf=sdf,
+            data_columns=self._internal.data_columns,
+            index_map=[(internal_index_column, None)],
+            column_index=self._internal.column_index,
+            column_index_names=None)
+
+        if result_as_series:
+            # The single row's index label is the string form of the scalar quantile.
+            return DataFrame(internal).T[str(quantiles[0])]
+        return DataFrame(internal)
+
     def _get_from_multiindex_column(self, key):
         """ Select columns from multi-index columns.
 

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -74,7 +74,6 @@ class _MissingPandasLikeDataFrame(object):
     pct_change = unsupported_function('pct_change')
     prod = unsupported_function('prod')
     product = unsupported_function('product')
-    quantile = unsupported_function('quantile')
     query = unsupported_function('query')
     reindex_like = unsupported_function('reindex_like')
     rename_axis = unsupported_function('rename_axis')

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -2177,3 +2177,12 @@ def test_keys(self):
         pdf = kdf.to_pandas()
 
         self.assert_eq(kdf.keys(), pdf.keys())
+
+    def test_quantile(self):
+        # Each unsupported keyword argument must be rejected with its specific ValueError.
+        kdf = ks.from_pandas(self.pdf)
+        cases = [(dict(axis=1), 'axis should be either 0 or "index" currently.'),
+                 (dict(numeric_only=False), "quantile currently doesn't supports numeric_only")]
+        for kwargs, message in cases:
+            with self.assertRaisesRegex(ValueError, message):
+                kdf.quantile(.5, **kwargs)
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -119,6 +119,7 @@ Computations / Descriptive Stats
    DataFrame.mean
    DataFrame.min
    DataFrame.median
+   DataFrame.quantile
    DataFrame.nunique
    DataFrame.skew
    DataFrame.sum