Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(api): move analytic window functions to top-level #7327

Merged
merged 5 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,21 @@ quartodoc:
- name: row_number
dynamic: true
signature_name: full
- name: rank
dynamic: true
signature_name: full
- name: dense_rank
dynamic: true
signature_name: full
- name: percent_rank
dynamic: true
signature_name: full
- name: cume_dist
dynamic: true
signature_name: full
- name: ntile
dynamic: true
signature_name: full
- name: window
dynamic: true
signature_name: full
Expand Down
8 changes: 4 additions & 4 deletions ibis/backends/base/sql/alchemy/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,10 +675,10 @@ class array_filter(FunctionElement):
ops.FirstValue: unary(sa.func.first_value),
ops.LastValue: unary(sa.func.last_value),
ops.RowNumber: fixed_arity(sa.func.row_number, 0),
ops.DenseRank: unary(lambda _: sa.func.dense_rank()),
ops.MinRank: unary(lambda _: sa.func.rank()),
ops.PercentRank: unary(lambda _: sa.func.percent_rank()),
ops.CumeDist: unary(lambda _: sa.func.cume_dist()),
ops.DenseRank: fixed_arity(sa.func.dense_rank, 0),
ops.MinRank: fixed_arity(sa.func.rank, 0),
ops.PercentRank: fixed_arity(sa.func.percent_rank, 0),
ops.CumeDist: fixed_arity(sa.func.cume_dist, 0),
ops.NthValue: _nth_value,
ops.WindowFunction: _window_function,
}
Expand Down
4 changes: 2 additions & 2 deletions ibis/backends/dask/tests/execution/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,8 @@ def test_batting_avg_change_in_games_per_year(players, players_df):


@pytest.mark.xfail(
raises=NotImplementedError,
reason="Grouped and order windows not supported yet",
raises=AssertionError,
reason="Dask doesn't support the `rank` method on SeriesGroupBy",
)
def test_batting_most_hits(players, players_df):
expr = players.mutate(
Expand Down
2 changes: 0 additions & 2 deletions ibis/backends/duckdb/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,6 @@ def _to_json_collection(t, op):


# Operations this backend does not implement; presumably compilation is
# rejected for anything listed here — confirm against the registry's callers.
_invalid_operations = {
    # ibis.expr.operations.strings
    ops.Translate,
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ntile(3)
ntile(3) OVER (ORDER BY `double_col` ASC)
Original file line number Diff line number Diff line change
@@ -1 +1 @@
percent_rank()
percent_rank() OVER (ORDER BY `double_col` ASC)
51 changes: 30 additions & 21 deletions ibis/backends/pandas/execution/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,10 @@ def trim_window_result(data: pd.Series | pd.DataFrame, timecontext: TimeContext
return indexed_subset[name]


@execute_node.register(ops.WindowFunction, pd.Series)
@execute_node.register(ops.WindowFunction, [pd.Series])
def execute_window_op(
op,
data,
*data,
scope: Scope | None = None,
timecontext: TimeContext | None = None,
aggcontext=None,
Expand Down Expand Up @@ -485,33 +485,42 @@ def execute_series_group_by_last_value(op, data, aggcontext=None, **kwargs):
return aggcontext.agg(data, lambda x: _getter(x, -1))


@execute_node.register(ops.MinRank, (pd.Series, SeriesGroupBy))
def execute_series_min_rank(op, data, **kwargs):
# TODO(phillipc): Handle ORDER BY
@execute_node.register(ops.MinRank)
def execute_series_min_rank(op, aggcontext=None, **kwargs):
    """Execute a MinRank over the window's ORDER BY column.

    Ranks ascending with ``method="min"`` (ties share the lowest rank)
    and shifts pandas' 1-based float ranks to 0-based int64 ranks.
    """
    # The single ordering key comes from the window's ORDER BY clause.
    (order_key,) = aggcontext.order_by
    ranked = aggcontext.parent[order_key].rank(method="min", ascending=True)
    return ranked.astype("int64") - 1


@execute_node.register(ops.DenseRank, (pd.Series, SeriesGroupBy))
def execute_series_dense_rank(op, data, **kwargs):
# TODO(phillipc): Handle ORDER BY
@execute_node.register(ops.DenseRank)
def execute_series_dense_rank(op, aggcontext=None, **kwargs):
    """Execute a DenseRank over the window's ORDER BY column.

    ``method="dense"`` makes tied values share a rank with no gaps; the
    result is shifted from pandas' 1-based ranks to 0-based int64 ranks.
    """
    (order_key,) = aggcontext.order_by
    column = aggcontext.parent[order_key]
    return column.rank(method="dense", ascending=True).astype("int64") - 1


@execute_node.register(ops.PercentRank, SeriesGroupBy)
def execute_series_group_by_percent_rank(op, data, **kwargs):
return (
data.rank(method="min", ascending=True)
.sub(1)
.div(data.transform("count").sub(1))
)
@execute_node.register(ops.PercentRank)
def execute_series_group_by_percent_rank(op, aggcontext=None, **kwargs):
    """Execute a PercentRank over the window's ORDER BY column.

    Computes ``(min_rank - 1) / (nrows - 1)``: the 0-based minimum rank
    divided by one less than the number of rows in the window.
    """
    (key,) = aggcontext.order_by
    df = aggcontext.parent
    data = df[key]

    # 0-based minimum rank of each row within its window.
    result = data.rank(method="min", ascending=True) - 1

    # For a grouped window each group contributes its own row count;
    # otherwise the whole column's length is the window size.
    if isinstance(data, SeriesGroupBy):
        nrows = data.transform("count")
    else:
        nrows = len(data)

    result /= nrows - 1
    return result


@execute_node.register(ops.CumeDist, (pd.Series, SeriesGroupBy))
def execute_series_group_by_cume_dist(op, data, **kwargs):
@execute_node.register(ops.CumeDist)
def execute_series_group_by_cume_dist(op, aggcontext=None, **kwargs):
    """Execute a CumeDist over the window's ORDER BY column.

    ``pct=True`` divides each min-rank by the window's row count,
    yielding a value in (0, 1].
    """
    (order_key,) = aggcontext.order_by
    column = aggcontext.parent[order_key]
    return column.rank(method="min", ascending=True, pct=True)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
SELECT
RANK() OVER (ORDER BY t0.double_col ASC) - 1 AS rank,
DENSE_RANK() OVER (ORDER BY t0.double_col ASC) - 1 AS dense_rank,
CUME_DIST() OVER (ORDER BY t0.double_col ASC) AS cume_dist,
NTILE(7) OVER (ORDER BY t0.double_col ASC) - 1 AS ntile,
PERCENT_RANK() OVER (ORDER BY t0.double_col ASC) AS percent_rank
FROM functional_alltypes AS t0
41 changes: 8 additions & 33 deletions ibis/backends/postgres/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,40 +1110,15 @@ def test_identical_to(con, df):
tm.assert_series_equal(result, expected)


def test_rank(con):
t = con.table("functional_alltypes")
expr = t.double_col.rank().name("rank")
sqla_expr = expr.compile()
result = str(sqla_expr.compile(compile_kwargs={"literal_binds": True}))
expected = (
"SELECT rank() OVER (ORDER BY t0.double_col) - 1 AS rank \n"
"FROM functional_alltypes AS t0"
)
assert result == expected


def test_percent_rank(con):
t = con.table("functional_alltypes")
expr = t.double_col.percent_rank().name("percent_rank")
sqla_expr = expr.compile()
result = str(sqla_expr.compile(compile_kwargs={"literal_binds": True}))
expected = (
"SELECT percent_rank() OVER (ORDER BY t0.double_col) AS "
"percent_rank \nFROM functional_alltypes AS t0"
def test_analytic_functions(alltypes, snapshot):
    """Snapshot-test the SQL emitted for the top-level analytic functions.

    Covers rank, dense_rank, cume_dist, ntile and percent_rank on a single
    ordering column; the expected SQL lives in the ``out.sql`` snapshot.
    """
    expr = alltypes.select(
        rank=alltypes.double_col.rank(),
        dense_rank=alltypes.double_col.dense_rank(),
        cume_dist=alltypes.double_col.cume_dist(),
        ntile=alltypes.double_col.ntile(7),
        percent_rank=alltypes.double_col.percent_rank(),
    )
    snapshot.assert_match(str(ibis.to_sql(expr)), "out.sql")


@pytest.mark.parametrize("opname", ["invert", "neg"])
Expand Down
9 changes: 4 additions & 5 deletions ibis/backends/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,8 @@ def calc_zscore(s):
lambda t: t.cumcount(),
id="row_number",
marks=[
pytest.mark.notimpl(
["dask", "pandas"], raises=com.OperationNotDefinedError
)
pytest.mark.notimpl(["dask"], raises=NotImplementedError),
pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError),
],
),
param(
Expand Down Expand Up @@ -891,9 +890,9 @@ def gb_fn(df):


@pytest.mark.notimpl(
["clickhouse", "dask", "datafusion", "polars"],
raises=com.OperationNotDefinedError,
["clickhouse", "datafusion", "polars"], raises=com.OperationNotDefinedError
)
@pytest.mark.notimpl(["dask"], raises=AttributeError)
@pytest.mark.notimpl(["pyspark"], raises=AnalysisException)
@pytest.mark.notyet(
["clickhouse"],
Expand Down
Loading