databricks · ueshin · May 29, 2019 · May 23, 2019 · May 23, 2019 · May 23, 2019
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -2541,7 +2541,7 @@ def astype(self, dtype) -> 'DataFrame':
         return DataFrame(sdf, self._metadata.copy())
 
     # TODO: percentiles, include, and exclude should be implemented.
-    def describe(self) -> 'DataFrame':
+    def describe(self, percentiles=[0.25, 0.5, 0.75]) -> 'DataFrame':
         """
         Generate descriptive statistics that summarize the central tendency,
         dispersion and shape of a dataset's distribution, excluding
@@ -2552,6 +2552,12 @@ def describe(self) -> 'DataFrame':
         will vary depending on what is provided. Refer to the notes
         below for more detail.
 
+        Parameters
+        ----------
+        percentiles : list of ``float`` in range (0.0, 1.0), default [0.25, 0.5, 0.75]
+            A list of percentiles to be computed; each is labelled as a whole percent.
+            Use an empty list if no percentiles should be computed.
+
         Returns
         -------
         Series or DataFrame
@@ -2568,7 +2574,7 @@ def describe(self) -> 'DataFrame':
         Notes
         -----
         For numeric data, the result's index will include ``count``,
-        ``mean``, ``stddev``, ``min``, ``max``.
+        ``mean``, ``stddev``, ``min``, ``25%``, ``50%``, ``75%`` (by default), ``max``.
 
         Currently only numeric data is supported.
 
@@ -2582,6 +2588,9 @@ def describe(self) -> 'DataFrame':
         mean      2.0
         stddev    1.0
         min       1.0
+        25%       1.0
+        50%       2.0
+        75%       3.0
         max       3.0
         Name: 0, dtype: float64
 
@@ -2598,6 +2607,25 @@ def describe(self) -> 'DataFrame':
         mean         2.0       5.0
         stddev       1.0       1.0
         min          1.0       4.0
+        25%          1.0       4.0
+        50%          2.0       5.0
+        75%          3.0       6.0
+        max          3.0       6.0
+
+        Describing a ``DataFrame`` and selecting custom percentiles.
+
+        >>> df = ks.DataFrame({'numeric1': [1, 2, 3],
+        ...                    'numeric2': [4.0, 5.0, 6.0]
+        ...                   },
+        ...                   columns=['numeric1', 'numeric2', 'object'])
+        >>> df.describe(percentiles=[0.15, 0.85])
+                numeric1  numeric2
+        count        3.0       3.0
+        mean         2.0       5.0
+        stddev       1.0       1.0
+        min          1.0       4.0
+        15%          1.0       4.0
+        85%          3.0       6.0
         max          3.0       6.0
 
         Describing a column from a ``DataFrame`` by accessing it as
@@ -2608,6 +2636,9 @@ def describe(self) -> 'DataFrame':
         mean      2.0
         stddev    1.0
         min       1.0
+        25%       1.0
+        50%       2.0
+        75%       3.0
         max       3.0
         Name: numeric1, dtype: float64
         """
@@ -2626,7 +2657,11 @@ def describe(self) -> 'DataFrame':
         if len(exprs) == 0:
             raise ValueError("Cannot describe a DataFrame without columns")
 
-        sdf = self._sdf.select(*exprs).describe()
+        formatted_perc =  ["{:.0%}".format(p) for p in percentiles]
+        stats = ["count", "mean", "stddev", "min", *formatted_perc, "max"]
+
+        sdf = self._sdf.select(*exprs).summary(stats)
+
         return DataFrame(sdf, index=Metadata(data_columns=data_columns,
                                              index_map=[('summary', None)])).astype('float64')