Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame.quantile #984

Merged
merged 10 commits into from
Nov 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7451,6 +7451,127 @@ def keys(self):
"""
return self.columns

# TODO: make the 'axis' and 'numeric_only' parameters behave the same as in pandas.
def quantile(self, q=0.5, axis=0, numeric_only=True, accuracy=10000):
    """
    Return value at the given quantile.

    .. note:: Unlike pandas', the quantile in Koalas is an approximated quantile based upon
        approximate percentile computation because computing quantile across a large dataset
        is extremely expensive.

    Parameters
    ----------
    q : float or array-like, default 0.5 (50% quantile)
        0 <= q <= 1, the quantile(s) to compute. A bare number (float or int) requests a
        single quantile; any other iterable requests one quantile per element.
    axis : int, default 0 or 'index'
        Can only be set to 0 at the moment.
    numeric_only : bool, default True
        If False, the quantile of datetime and timedelta data will be computed as well.
        Can only be set to True at the moment.
    accuracy : int, optional
        Default accuracy of approximation. Larger value means better accuracy.
        The relative error can be deduced by 1.0 / accuracy.

    Returns
    -------
    Series or DataFrame
        If q is an array, a DataFrame will be returned where the
        index is q, the columns are the columns of self, and the values are the quantiles.
        If q is a float, a Series will be returned where the
        index is the columns of self and the values are the quantiles.

    Raises
    ------
    ValueError
        If `axis` is neither 0 nor 'index', if `numeric_only` is not True, or if any
        requested quantile lies outside the interval [0, 1].

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]})
    >>> kdf
       a  b
    0  1  6
    1  2  7
    2  3  8
    3  4  9
    4  5  0

    >>> kdf.quantile(.5)
    a    3
    b    7
    Name: 0.5, dtype: int64

    >>> kdf.quantile([.25, .5, .75])
          a  b
    0.25  2  6
    0.5   3  7
    0.75  4  8
    """
    if axis not in [0, 'index']:
        raise ValueError('axis should be either 0 or "index" currently.')
    if numeric_only is not True:
        raise ValueError("quantile currently doesn't supports numeric_only")

    # A bare number asks for a single quantile and yields a Series. Integers are accepted
    # as well so that the boundary quantiles 0 and 1 do not have to be spelled as floats.
    result_as_series = isinstance(q, (int, float))
    if result_as_series:
        key = str(q)
        quantiles = [q]
    else:
        # Materialize once: the values are both formatted into SQL and indexed below.
        quantiles = list(q)

    # Validate before anything is interpolated into the SQL expression below.
    for value in quantiles:
        if not 0 <= value <= 1:
            raise ValueError(
                "q should be in the interval [0, 1]; however, got [%s]" % value)

    # Compute every requested percentile of every data column in a single pass.
    args = ", ".join(map(str, quantiles))
    data_columns = self._internal.data_columns
    percentile_cols = [
        F.expr("approx_percentile(`%s`, array(%s), %s)" % (column, args, accuracy))
        .alias(column)
        for column in data_columns]
    sdf = self._sdf.select(percentile_cols)
    # Here, after selecting the percentile cols, the sdf looks like below:
    # +---------+---------+
    # |        a|        b|
    # +---------+---------+
    # |[2, 3, 4]|[6, 7, 8]|
    # +---------+---------+

    # Then build one struct per quantile holding the quantile label followed by the i-th
    # element of every column's percentile array. So, it becomes an array of structs as below:
    #
    # +-----------------------------------------+
    # |                                   arrays|
    # +-----------------------------------------+
    # |[[0.25, 2, 6], [0.5, 3, 7], [0.75, 4, 8]]|
    # +-----------------------------------------+
    internal_index_column = SPARK_INDEX_NAME_FORMAT(0)
    cols = []
    for i, value in enumerate(quantiles):
        fields = [scol_for(sdf, column).getItem(i).alias(column) for column in data_columns]
        cols.append(F.struct(F.lit("%s" % value).alias(internal_index_column), *fields))
    sdf = sdf.select(F.array(*cols).alias("arrays"))

    # And then, explode it and manually set the index.
    # +-----------------+---+---+
    # |__index_level_0__|  a|  b|
    # +-----------------+---+---+
    # |             0.25|  2|  6|
    # |              0.5|  3|  7|
    # |             0.75|  4|  8|
    # +-----------------+---+---+
    sdf = sdf.select(F.explode(F.col("arrays"))).selectExpr("col.*")

    internal = self._internal.copy(
        sdf=sdf,
        data_columns=data_columns,
        index_map=[(internal_index_column, None)],
        column_index=self._internal.column_index,
        column_index_names=None)

    kdf = DataFrame(internal)
    # For a scalar q, the single row (labelled by str(q)) becomes the resulting Series.
    return kdf.T[key] if result_as_series else kdf

def _get_from_multiindex_column(self, key):
""" Select columns from multi-index columns.

Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ class _MissingPandasLikeDataFrame(object):
pct_change = unsupported_function('pct_change')
prod = unsupported_function('prod')
product = unsupported_function('product')
quantile = unsupported_function('quantile')
query = unsupported_function('query')
reindex_like = unsupported_function('reindex_like')
rename_axis = unsupported_function('rename_axis')
Expand Down
9 changes: 9 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2177,3 +2177,12 @@ def test_keys(self):
pdf = kdf.to_pandas()

self.assert_eq(kdf.keys(), pdf.keys())

def test_quantile(self):
    """Check that quantile rejects the argument values it does not support yet."""
    kdf = ks.from_pandas(self.pdf)

    # Any axis other than 0 / 'index' is refused.
    axis_message = 'axis should be either 0 or "index" currently.'
    with self.assertRaisesRegex(ValueError, axis_message):
        kdf.quantile(.5, axis=1)

    # Any numeric_only value other than True is refused.
    numeric_only_message = "quantile currently doesn't supports numeric_only"
    with self.assertRaisesRegex(ValueError, numeric_only_message):
        kdf.quantile(.5, numeric_only=False)
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Computations / Descriptive Stats
DataFrame.mean
DataFrame.min
DataFrame.median
DataFrame.quantile
DataFrame.nunique
DataFrame.skew
DataFrame.sum
Expand Down