From 77e96ff56d54f54ad933b4b0d780c37e2ead4d51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Thu, 4 Jan 2024 12:46:03 +0100
Subject: [PATCH] refactor(pandas): port the pandas backend with an improved execution model (#7797)

Since we need to reimplement/port all of the backends for #7752, I took a
stab at reimplementing the pandas backend using a new execution engine.

Previously the pandas backend used a top-down execution model and each
operation was executed by a multidispatched function. While it served us
well for a long time, it had a few drawbacks:
- it was often hard to understand what was going on due to the complex
  preparation steps and various execution hooks
- the multidispatched functions were hard to debug; additionally, they
  supported a wide variety of inputs, making the implementation rather bulky
- due to the previous reason, several input combinations were not supported,
  e.g. value operations with multiple columnar inputs
- the `Scope` object used to pass around the execution context was created
  separately for each operation, so results were not reusable even when the
  same operation was executed multiple times

The new execution model changes several things:
- there is a rewrite layer before execution which lowers the input expression
  to a form closer to the pandas execution model; this makes the operations
  much easier to implement and also makes the input "plan" inspectable
- execution is now topologically sorted and proceeds bottom-up; intermediate
  results are reused, making execution more efficient, and are aggressively
  cleaned up as soon as they are no longer needed to reduce memory usage
  (see the first sketch below)
- the execute function is now single-dispatched, making the implementation
  easier to locate and debug
- inputs are now broadcast to columnar shape so that the same implementation
  can be used for multiple input shape combinations; this removes several
  special cases in exchange for a negligible performance overhead
- there are helper utilities making it easier to implement compute kernels
  for the various value operations: `rowwise`, `columnwise`, `elementwise`,
  `serieswise`; if multiple implementations are available for a given
  operation, the most efficient one is selected based on the input shapes
  (see the second sketch below)

The new backend implementation has higher feature coverage while being one
third the size of the previous one.
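To make the bottom-up model concrete, here is a minimal, self-contained sketch
of topologically sorted, single-dispatched execution. It is illustrative only:
the `Node`/`Literal`/`Add` classes and the `execute_bottom_up` helper below are
hypothetical stand-ins, not the `Executor` added in
`ibis/backends/pandas/executor.py`.

```python
from functools import singledispatch
from graphlib import TopologicalSorter


class Node:
    """Hypothetical stand-in for an ibis operation node."""

    def __init__(self, *args):
        self.args = args


class Literal(Node):
    def __init__(self, value):
        super().__init__()
        self.value = value


class Add(Node):
    pass


@singledispatch
def execute(op, *args):
    raise NotImplementedError(type(op))


@execute.register
def _(op: Literal):
    return op.value


@execute.register
def _(op: Add, left, right):
    return left + right


def execute_bottom_up(root):
    # collect the dependency graph of the expression
    graph, stack = {}, [root]
    while stack:
        node = stack.pop()
        if node not in graph:
            deps = [arg for arg in node.args if isinstance(arg, Node)]
            graph[node] = deps
            stack.extend(deps)

    # evaluate in topological order, caching every intermediate result so a
    # node shared by several parents is computed only once; the real backend
    # additionally drops a result once all of its consumers have run
    results = {}
    for node in TopologicalSorter(graph).static_order():
        results[node] = execute(node, *(results[dep] for dep in graph[node]))
    return results[root]


print(execute_bottom_up(Add(Literal(1), Literal(2))))  # 3
```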
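And a similarly hedged sketch of the kernel-helper idea: several
implementation styles can be registered for an operation and the cheapest
available one is picked. The helper names mirror the ones listed above, but
the bodies, the `pick_kernel` function, and the preference order are
simplified assumptions rather than the actual code in
`ibis/backends/pandas/kernels.py`.

```python
import pandas as pd


def elementwise(func, col: pd.Series) -> pd.Series:
    # apply a scalar function to every non-null element
    return col.map(func, na_action="ignore")


def serieswise(func, col: pd.Series) -> pd.Series:
    # the function is vectorized over a whole Series, so call it once
    return func(col)


def rowwise(func, df: pd.DataFrame) -> pd.Series:
    # multi-argument kernels receive one row at a time
    return df.apply(func, axis=1)


def columnwise(func, df: pd.DataFrame) -> pd.Series:
    # multi-argument kernels that understand whole columns get the frame once
    return func(df)


def pick_kernel(impls: dict):
    # assumed preference order for illustration: vectorized kernels first
    for kind in ("serieswise", "columnwise", "elementwise", "rowwise"):
        if kind in impls:
            return impls[kind]
    raise NotImplementedError("no kernel registered for this operation")


# e.g. string uppercasing could be registered both ways; the vectorized
# serieswise variant wins when it is present
upper = pick_kernel(
    {
        "elementwise": lambda col: elementwise(str.upper, col),
        "serieswise": lambda col: serieswise(lambda s: s.str.upper(), col),
    }
)
print(upper(pd.Series(["a", "b", None])))
```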
BREAKING CHANGE: the `timecontext` feature is not supported anymore --- .github/workflows/ibis-backends.yml | 8 +- ibis/backends/base/df/__init__.py | 0 ibis/backends/base/df/scope.py | 211 --- ibis/backends/base/df/timecontext.py | 304 ---- .../dask/tests/execution/test_join.py | 1 + ibis/backends/pandas/__init__.py | 47 +- ibis/backends/pandas/aggcontext.py | 710 -------- ibis/backends/pandas/convert.py | 88 + ibis/backends/pandas/core.py | 605 ------- ibis/backends/pandas/dispatch.py | 110 -- ibis/backends/pandas/dispatcher.py | 113 -- ibis/backends/pandas/execution/__init__.py | 13 - ibis/backends/pandas/execution/arrays.py | 172 -- ibis/backends/pandas/execution/constants.py | 106 -- ibis/backends/pandas/execution/decimal.py | 135 -- ibis/backends/pandas/execution/generic.py | 1479 ----------------- ibis/backends/pandas/execution/join.py | 183 -- ibis/backends/pandas/execution/maps.py | 208 --- ibis/backends/pandas/execution/selection.py | 337 ---- ibis/backends/pandas/execution/strings.py | 560 ------- ibis/backends/pandas/execution/structs.py | 44 - ibis/backends/pandas/execution/temporal.py | 341 ---- ibis/backends/pandas/execution/timecontext.py | 93 -- ibis/backends/pandas/execution/util.py | 144 -- ibis/backends/pandas/execution/window.py | 526 ------ ibis/backends/pandas/executor.py | 761 +++++++++ ibis/backends/pandas/helpers.py | 211 +++ ibis/backends/pandas/kernels.py | 513 ++++++ ibis/backends/pandas/rewrites.py | 322 ++++ ibis/backends/pandas/tests/conftest.py | 286 ++++ .../pandas/tests/execution/__init__.py | 0 .../pandas/tests/execution/conftest.py | 289 ---- .../tests/execution/test_timecontext.py | 399 ----- ibis/backends/pandas/tests/test_aggcontext.py | 167 -- .../tests/{execution => }/test_arrays.py | 7 + .../pandas/tests/{execution => }/test_cast.py | 40 +- ibis/backends/pandas/tests/test_core.py | 65 +- ibis/backends/pandas/tests/test_dispatcher.py | 143 -- .../tests/{execution => }/test_functions.py | 35 +- ibis/backends/pandas/tests/test_helpers.py | 72 + .../pandas/tests/{execution => }/test_join.py | 69 +- .../pandas/tests/{execution => }/test_maps.py | 0 .../tests/{execution => }/test_operations.py | 6 +- .../tests/{execution => }/test_strings.py | 24 +- .../tests/{execution => }/test_structs.py | 6 +- .../tests/{execution => }/test_temporal.py | 5 +- ibis/backends/pandas/tests/test_udf.py | 42 +- .../tests/{execution => }/test_window.py | 48 +- ibis/backends/pandas/trace.py | 170 -- ibis/backends/pandas/udf.py | 145 -- ibis/backends/tests/test_aggregation.py | 19 +- ibis/backends/tests/test_array.py | 33 +- ibis/backends/tests/test_generic.py | 6 +- ibis/backends/tests/test_interactive.py | 4 + ibis/backends/tests/test_param.py | 4 +- ibis/backends/tests/test_string.py | 2 +- ibis/backends/tests/test_temporal.py | 10 +- ibis/backends/tests/test_timecontext.py | 4 +- ibis/backends/tests/test_vectorized_udf.py | 3 +- ibis/backends/tests/test_window.py | 28 +- ibis/expr/operations/reductions.py | 1 + ibis/formats/pandas.py | 24 +- ibis/formats/tests/test_dask.py | 9 - 63 files changed, 2517 insertions(+), 7993 deletions(-) delete mode 100644 ibis/backends/base/df/__init__.py delete mode 100644 ibis/backends/base/df/scope.py delete mode 100644 ibis/backends/base/df/timecontext.py delete mode 100644 ibis/backends/pandas/aggcontext.py create mode 100644 ibis/backends/pandas/convert.py delete mode 100644 ibis/backends/pandas/core.py delete mode 100644 ibis/backends/pandas/dispatch.py delete mode 100644 ibis/backends/pandas/dispatcher.py delete mode 100644 
ibis/backends/pandas/execution/__init__.py delete mode 100644 ibis/backends/pandas/execution/arrays.py delete mode 100644 ibis/backends/pandas/execution/constants.py delete mode 100644 ibis/backends/pandas/execution/decimal.py delete mode 100644 ibis/backends/pandas/execution/generic.py delete mode 100644 ibis/backends/pandas/execution/join.py delete mode 100644 ibis/backends/pandas/execution/maps.py delete mode 100644 ibis/backends/pandas/execution/selection.py delete mode 100644 ibis/backends/pandas/execution/strings.py delete mode 100644 ibis/backends/pandas/execution/structs.py delete mode 100644 ibis/backends/pandas/execution/temporal.py delete mode 100644 ibis/backends/pandas/execution/timecontext.py delete mode 100644 ibis/backends/pandas/execution/util.py delete mode 100644 ibis/backends/pandas/execution/window.py create mode 100644 ibis/backends/pandas/executor.py create mode 100644 ibis/backends/pandas/helpers.py create mode 100644 ibis/backends/pandas/kernels.py create mode 100644 ibis/backends/pandas/rewrites.py delete mode 100644 ibis/backends/pandas/tests/execution/__init__.py delete mode 100644 ibis/backends/pandas/tests/execution/conftest.py delete mode 100644 ibis/backends/pandas/tests/execution/test_timecontext.py delete mode 100644 ibis/backends/pandas/tests/test_aggcontext.py rename ibis/backends/pandas/tests/{execution => }/test_arrays.py (96%) rename ibis/backends/pandas/tests/{execution => }/test_cast.py (80%) delete mode 100644 ibis/backends/pandas/tests/test_dispatcher.py rename ibis/backends/pandas/tests/{execution => }/test_functions.py (92%) create mode 100644 ibis/backends/pandas/tests/test_helpers.py rename ibis/backends/pandas/tests/{execution => }/test_join.py (89%) rename ibis/backends/pandas/tests/{execution => }/test_maps.py (100%) rename ibis/backends/pandas/tests/{execution => }/test_operations.py (99%) rename ibis/backends/pandas/tests/{execution => }/test_strings.py (89%) rename ibis/backends/pandas/tests/{execution => }/test_structs.py (95%) rename ibis/backends/pandas/tests/{execution => }/test_temporal.py (98%) rename ibis/backends/pandas/tests/{execution => }/test_window.py (93%) delete mode 100644 ibis/backends/pandas/trace.py diff --git a/.github/workflows/ibis-backends.yml b/.github/workflows/ibis-backends.yml index ff7caf83b69da..baffb490b3d0c 100644 --- a/.github/workflows/ibis-backends.yml +++ b/.github/workflows/ibis-backends.yml @@ -77,10 +77,10 @@ jobs: # title: Dask # extras: # - dask - # - name: pandas - # title: Pandas - # extras: - # - pandas + - name: pandas + title: Pandas + extras: + - pandas # - name: sqlite # title: SQLite # extras: diff --git a/ibis/backends/base/df/__init__.py b/ibis/backends/base/df/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/ibis/backends/base/df/scope.py b/ibis/backends/base/df/scope.py deleted file mode 100644 index 1d41da93464d7..0000000000000 --- a/ibis/backends/base/df/scope.py +++ /dev/null @@ -1,211 +0,0 @@ -"""Module for scope. - -The motivation of Scope is to cache data for calculated ops. - -`scope` in Scope class is the main cache. It is a dictionary mapping -ibis node instances to concrete data, and the time context associate -with it (if any). - -When there are no time contexts associate with the cached result, getting -and setting values in Scope would be as simple as get and set in a normal -dictionary. 
With time contexts, we need the following logic for getting -and setting items in scope: - -Before setting the value op in scope we need to perform the following -check first: - -Test if `op` is in `scope` yet -- No, then put `op` in `scope`, set 'timecontext' to be the current -`timecontext` (None if `timecontext` is not present), set 'value' to be -the actual data. -- Yes, then get the time context stored in `scope` for `op` as -`old_timecontext`, and compare it with current `timecontext`: -If current `timecontext` is a subset of `_timecontext`, that means we -already cached a larger range of data. Do nothing and we will trim data in -later execution process. -If current `timecontext` is a superset of `old_timecontext`, that means we -need to update cache. Set 'value' to be the current data and set -'timecontext' to be the current `timecontext` for `op`. -If current `timecontext` is neither a subset nor a superset of -`old_timcontext`, but they overlap, or not overlap at all (For example -when there is a window that looks forward, over a window that looks -back), in this case, we should not trust the data stored either because -the data stored in scope doesn't cover the current time context. -For simplicity, we update cache in this case, instead of merge data of -different time contexts. -""" -from __future__ import annotations - -from collections import namedtuple -from typing import TYPE_CHECKING, Any - -import pandas as pd - -from ibis.backends.base.df.timecontext import TimeContextRelation, compare_timecontext - -if TYPE_CHECKING: - from collections.abc import Iterable - - from ibis.expr.operations import Node - -TimeContext = tuple[pd.Timestamp, pd.Timestamp] - -ScopeItem = namedtuple("ScopeItem", ["timecontext", "value"]) - - -class Scope: - def __init__( - self, - param: dict[Node, Any] | None = None, - timecontext: TimeContext | None = None, - ): - """Create a new scope. - - Associate None as timecontext by default. This is mostly used to - init a scope with a set of given params. - """ - self._items = ( - {op: ScopeItem(timecontext, value) for op, value in param.items()} - if param - else {} - ) - - def __contains__(self, op): - """Given an `op`, return if `op` is present in Scope. - - Note that this `__contain__` method doesn't take `timecontext` - as a parameter. This could be used to iterate all keys in - current scope, or any case that doesn't care about value, just - simply test if `op` is in scope or not. - When trying to get value in scope, use `get_value(op, timecontext)` - instead. Because the cached data could be trusted only if: - 1. `op` is in `scope`, and, - 2. The `timecontext` associated with `op` is a time context equal - to, or larger than the current time context. - """ - return op in self._items - - def __iter__(self): - return iter(self._items.keys()) - - def set_value(self, op: Node, timecontext: TimeContext | None, value: Any) -> None: - """Set values in scope. - - Given an `op`, `timecontext` and `value`, set `op` and - `(value, timecontext)` in scope. - - This method doesn't simply override and set, but takes time context - into consideration. - - If there is a value associated with the key, but time context is - smaller than the current time context we are going to set, `get_value` - will return None and we will proceed to set the new value in scope. - - Parameters - ---------- - op - Key in scope - timecontext - Time context - value - the cached result to save in scope, an object whose type may - differ in different backends. 
- """ - if self.get_value(op, timecontext) is None: - self._items[op] = ScopeItem(timecontext, value) - - def get_value(self, op: Node, timecontext: TimeContext | None = None) -> Any: - """Given a op and timecontext, get the result from scope. - - Parameters - ---------- - op - Key in scope - timecontext - Time context - - Returns - ------- - Any - The cached result, an object whose type may differ in different - backends. - """ - if op not in self: - return None - - # for ops without timecontext - if timecontext is None: - return self._items[op].value - else: - # For op with timecontext, there are some ops cannot use cached - # result with a different (larger) timecontext to get the - # correct result. - # For example, a groupby followed by count, if we use a larger or - # smaller dataset from cache, we will get an error in result. - # Such ops with global aggregation, ops whose result is - # depending on other rows in result Dataframe, cannot use cached - # result with different time context to optimize calculation. - # These are time context sensitive operations. Since these cases - # are rare in actual use case, we just enable optimization for - # all nodes for now. - cached_timecontext = self._items[op].timecontext - if cached_timecontext: - relation = compare_timecontext(timecontext, cached_timecontext) - if relation == TimeContextRelation.SUBSET: - return self._items[op].value - else: - return self._items[op].value - return None - - def merge_scope(self, other_scope: Scope, overwrite=False) -> Scope: - """Merge items in `other_scope` into this scope. - - Parameters - ---------- - other_scope - Scope to be merged with - overwrite - if `True`, force overwrite `value` if node already exists. - - Returns - ------- - Scope - a new Scope instance with items in two scopes merged. - """ - result = Scope() - - for op in self: - result._items[op] = self._items[op] - - for op in other_scope: - # if get_scope returns a not None value, then data is already - # cached in scope and it is at least a greater range than - # the current timecontext, so we drop the item. Otherwise - # add it into scope. - v = other_scope._items[op] - if overwrite or result.get_value(op, v.timecontext) is None: - result._items[op] = v - return result - - def merge_scopes(self, other_scopes: Iterable[Scope], overwrite=False) -> Scope: - """Merge items in `other_scopes` into this scope. - - Parameters - ---------- - other_scopes - scopes to be merged with - overwrite - if `True`, force overwrite value if node already exists. - - Returns - ------- - Scope - a new Scope instance with items in input scopes merged. - """ - result = Scope() - for op in self: - result._items[op] = self._items[op] - - for s in other_scopes: - result = result.merge_scope(s, overwrite) - return result diff --git a/ibis/backends/base/df/timecontext.py b/ibis/backends/base/df/timecontext.py deleted file mode 100644 index f84dd473bc4c4..0000000000000 --- a/ibis/backends/base/df/timecontext.py +++ /dev/null @@ -1,304 +0,0 @@ -"""Time context module. - -This is an implementation of time context extension without affecting the -existing SQL-like execution model for backends. - -Most of the execution is built on the foundation that "Data is uniquely -defined by the op tree". This is true in SQL analysis where there is no -ambiguity what the result of executing a Table is. - -In time series analysis, however, this is not necessarily True. 
We have defined -an extension to ibis execution for time series analysis where the result of -executing a Table is defined by the Table plus the time context are -associated with the execution. - -Time context specifies the temporal range of a query, it carries the start and -end datetimes. For example, a Table can represent the query select count(a) -from table, but the result of that is different with time context -("20190101", "20200101") vs ("20200101", "20210101"), because what data is in -"table" depends also on the time context. - -While data in scope is public and global for all nodes, `timecontext` is -intended to store 'local' time context data for each node in execution. i.e., -each subtree of an expr tree can have different time context. Which makes it -so that when executing each node, we also need to know the "local time context" -for that node. - -And we propose to store these data as 'timecontext', calculate in execution -pass it along to children nodes, in the ibis tree. See each backends for -implementation details. - -Time context adjustment algorithm - In an Ibis tree, time context is local for each node, and they should be - adjusted accordingly for some specific nodes. Those operations may - require extra data outside of the global time context that user defines. - For example, in asof_join, we need to look back extra `tolerance` daays - for the right table to get the data for joining. Similarly for window - operation with preceding and following. - Algorithm to calculate context adjustment are defined in this module - and could be used by multiple backends. -""" - -from __future__ import annotations - -import enum -import functools -from typing import TYPE_CHECKING, Any - -import pandas as pd - -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis import config - -TimeContext = tuple[pd.Timestamp, pd.Timestamp] - - -if TYPE_CHECKING: - from ibis.backends.base.df.scope import Scope - - -# In order to use time context feature, there must be a column of Timestamp -# type, and named as 'time' in Table. This TIME_COL constant will be -# used in filtering data from a table or columns of a table. It can be changed -# by running: -# -# ibis.config.options.context_adjustment.time_col = "other_time_col" - - -def get_time_col(): - return config.options.context_adjustment.time_col - - -class TimeContextRelation(enum.Enum): - """Enum to classify the relationship between two time contexts. - - Assume that we have two timecontext `c1 (begin1, end1)`, `c2(begin2, end2)`: - - - `SUBSET` means `c1` is a subset of `c2`, `begin1` is greater than or - equal to `begin2`, and `end1` is less than or equal to `end2`. - - `SUPERSET` means that `begin1` is earlier than `begin2`, and `end1` - is later than `end2`. - - If neither of the two contexts is a superset of each other, and they - share some time range in common, we called them `OVERLAP`. - - `NONOVERLAP` means the two contexts doesn't overlap at all, which - means `end1` is earlier than `begin2` or `end2` is earlier than - `begin1`. 
- """ - - SUBSET = 0 - SUPERSET = 1 - OVERLAP = 2 - NONOVERLAP = 3 - - -def compare_timecontext( - left_context: TimeContext, right_context: TimeContext -) -> TimeContextRelation: - """Compare two time contexts and return the relationship between them.""" - left_begin, left_end = left_context - right_begin, right_end = right_context - if right_begin <= left_begin and right_end >= left_end: - return TimeContextRelation.SUBSET - elif right_begin >= left_begin and right_end <= left_end: - return TimeContextRelation.SUPERSET - elif right_end < left_begin or left_end < right_begin: - return TimeContextRelation.NONOVERLAP - else: - return TimeContextRelation.OVERLAP - - -def canonicalize_context( - timecontext: TimeContext | None, -) -> TimeContext | None: - """Canonicalize a timecontext with type pandas.Timestamp for its begin and end time.""" - - SUPPORTS_TIMESTAMP_TYPE = pd.Timestamp - if not isinstance(timecontext, tuple) or len(timecontext) != 2: - raise com.IbisError(f"Timecontext {timecontext} should specify (begin, end)") - - begin, end = timecontext - - if not isinstance(begin, SUPPORTS_TIMESTAMP_TYPE): - raise com.IbisError( - f"begin time value {begin} of type {type(begin)} is not" - " of type pd.Timestamp" - ) - if not isinstance(end, SUPPORTS_TIMESTAMP_TYPE): - raise com.IbisError( - f"end time value {end} of type {type(begin)} is not of type pd.Timestamp" - ) - if begin > end: - raise com.IbisError( - f"begin time {begin} must be before or equal to end time {end}" - ) - return begin, end - - -def localize_context(timecontext: TimeContext, timezone: str) -> TimeContext: - """Localize tz-naive context.""" - begin, end = timecontext - if begin.tz is None: - begin = begin.tz_localize(timezone) - - if end.tz is None: - end = end.tz_localize(timezone) - - return begin, end - - -def construct_time_context_aware_series( - series: pd.Series, frame: pd.DataFrame -) -> pd.Series: - """Construct a Series by adding 'time' in its MultiIndex. - - In window execution, the result Series of udf may need - to be trimmed by timecontext. In order to do so, 'time' - must be added as an index to the Series. We extract - time column from the parent Dataframe `frame`. - See `trim_window_result` in execution/window.py for - trimming implementation. - - Examples - -------- - >>> import pandas as pd - >>> from ibis.backends.base.df.timecontext import ( - ... construct_time_context_aware_series, - ... ) - >>> df = pd.DataFrame( - ... { - ... "time": pd.Series(pd.date_range(start="2017-01-02", periods=3).values), - ... "id": [1, 2, 3], - ... "value": [1.1, 2.2, 3.3], - ... } - ... ) - >>> df - time id value - 0 2017-01-02 1 1.1 - 1 2017-01-03 2 2.2 - 2 2017-01-04 3 3.3 - >>> series = df["value"] - >>> series - 0 1.1 - 1 2.2 - 2 3.3 - Name: value, dtype: float64 - >>> construct_time_context_aware_series(series, df) # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - - The index will be a MultiIndex of the original RangeIndex - and a DateTimeIndex. - - >>> timed_series = construct_time_context_aware_series(series, df) - >>> timed_series # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - - >>> construct_time_context_aware_series( - ... timed_series, df - ... 
) # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - The result is unchanged for a series already has 'time' as its index. - """ - time_col = get_time_col() - if time_col == frame.index.name: - time_index = frame.index - elif time_col in frame: - time_index = pd.Index(frame[time_col]) - else: - raise com.IbisError(f'"time" column not present in DataFrame {frame}') - if time_col not in series.index.names: - series.index = pd.MultiIndex.from_arrays( - list(map(series.index.get_level_values, range(series.index.nlevels))) - + [time_index], - names=series.index.names + [time_col], - ) - return series - - -@functools.singledispatch -def adjust_context(op: Any, scope: Scope, timecontext: TimeContext) -> TimeContext: - """Adjust the `timecontext` for `op`. - - Parameters - ---------- - op - Ibis operation. - scope - Incoming scope. - timecontext - Time context associated with the node. - - Returns - ------- - TimeContext - For `op` that is not of type Node, raise an error to avoid failing - silently since the default behavior is to return `timecontext`. - """ - raise com.IbisError(f"Unsupported input type for adjust context for {op}") - - -@adjust_context.register(ops.Node) -def adjust_context_node( - op: ops.Node, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # For any node, by default, do not adjust time context - return timecontext - - -@adjust_context.register(ops.Alias) -def adjust_context_alias( - op: ops.Node, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # For any node, by default, do not adjust time context - return adjust_context(op.arg, scope, timecontext) - - -@adjust_context.register(ops.AsOfJoin) -def adjust_context_asof_join( - op: ops.AsOfJoin, scope: Scope, timecontext: TimeContext -) -> TimeContext: - begin, end = timecontext - - if op.tolerance is not None: - from ibis.backends.pandas.execution import execute - - timedelta = execute(op.tolerance) - return (begin - timedelta, end) - - return timecontext - - -@adjust_context.register(ops.WindowFunction) -def adjust_context_window( - op: ops.WindowFunction, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # TODO(kszucs): this file should be really moved to the pandas - # backend instead of the current central placement - from ibis.backends.pandas.execution import execute - - # adjust time context by preceding and following - begin, end = timecontext - - if op.frame.start is not None: - value = execute(op.frame.start.value) - if value: - begin = begin - value - - if op.frame.end is not None: - value = execute(op.frame.end.value) - if value: - end = end + value - - return (begin, end) diff --git a/ibis/backends/dask/tests/execution/test_join.py b/ibis/backends/dask/tests/execution/test_join.py index e9805c74c1421..e76097b65cddd 100644 --- a/ibis/backends/dask/tests/execution/test_join.py +++ b/ibis/backends/dask/tests/execution/test_join.py @@ -96,6 +96,7 @@ def test_join_with_multiple_predicates(how, left, right, df1, df2): left, right.key3, right.other_value ] result = expr.execute().sort_values(by=["key"]).reset_index(drop=True) + expected = ( dd.merge(df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"]) .compute(scheduler="single-threaded") diff --git a/ibis/backends/pandas/__init__.py b/ibis/backends/pandas/__init__.py index 4349400c50ab8..881a460b7f5e7 100644 --- a/ibis/backends/pandas/__init__.py +++ b/ibis/backends/pandas/__init__.py @@ -1,6 +1,5 @@ from __future__ import annotations -import 
importlib from functools import lru_cache from typing import TYPE_CHECKING, Any @@ -22,8 +21,6 @@ import pathlib from collections.abc import Mapping, MutableMapping -raise RuntimeError("Temporarily make the pandas backend dysfunctional") - class BasePandasBackend(BaseBackend): """Base class for backends based on pandas.""" @@ -51,9 +48,6 @@ def do_connect( >>> ibis.pandas.connect({"t": pd.DataFrame({"a": [1, 2, 3]})}) """ - # register dispatchers - from ibis.backends.pandas import execution, udf # noqa: F401 - self.dictionary = dictionary or {} self.schemas: MutableMapping[str, sch.Schema] = {} @@ -256,34 +250,13 @@ def _convert_object(cls, obj: Any) -> Any: @classmethod @lru_cache def _get_operations(cls): - backend = f"ibis.backends.{cls.name}" - - execution = importlib.import_module(f"{backend}.execution") - execute_node = execution.execute_node + from ibis.backends.pandas.kernels import supported_operations - # import UDF to pick up AnalyticVectorizedUDF and others - importlib.import_module(f"{backend}.udf") - - dispatch = importlib.import_module(f"{backend}.dispatch") - pre_execute = dispatch.pre_execute - - return frozenset( - op - for op, *_ in execute_node.funcs.keys() | pre_execute.funcs.keys() - if issubclass(op, ops.Value) - ) + return supported_operations @classmethod def has_operation(cls, operation: type[ops.Value]) -> bool: - # Pandas doesn't support geospatial ops, but the dispatcher implements - # a common base class that makes it appear that it does. Explicitly - # exclude these operations. - if issubclass(operation, (ops.GeoSpatialUnOp, ops.GeoSpatialBinOp)): - return False - op_classes = cls._get_operations() - return operation in op_classes or any( - issubclass(operation, op_impl) for op_impl in op_classes - ) + return operation in cls._get_operations() def _clean_up_cached_table(self, op): del self.dictionary[op.name] @@ -331,7 +304,7 @@ class Backend(BasePandasBackend): name = "pandas" def execute(self, query, params=None, limit="default", **kwargs): - from ibis.backends.pandas.core import execute_and_reset + from ibis.backends.pandas.executor import Executor if limit != "default" and limit is not None: raise ValueError( @@ -346,16 +319,10 @@ def execute(self, query, params=None, limit="default", **kwargs): ) ) - node = query.op() - - if params is None: - params = {} - else: - params = { - k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items() - } + params = params or {} + params = {k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items()} - return execute_and_reset(node, params=params, **kwargs) + return Executor.execute(query.op(), backend=self, params=params) def _load_into_cache(self, name, expr): self.create_table(name, expr.execute()) diff --git a/ibis/backends/pandas/aggcontext.py b/ibis/backends/pandas/aggcontext.py deleted file mode 100644 index 64a4f73bc6869..0000000000000 --- a/ibis/backends/pandas/aggcontext.py +++ /dev/null @@ -1,710 +0,0 @@ -"""Implements an object to describe the context of a window aggregation. - -For any particular aggregation such as ``sum``, ``mean``, etc we need to decide -based on the presence or absence of other expressions like ``group_by`` and -``order_by`` whether we should call a different method of aggregation. - -Here are the different aggregation contexts and the conditions under which they -are used. - -Note that in the pandas backend, only trailing and cumulative windows are -supported right now. 
- -No ``group_by`` or ``order_by``: ``context.Summarize()`` --------------------------------------------------------- -This is an aggregation on a column, repeated for every row in the table. - -SQL - -:: - - SELECT SUM(value) OVER () AS sum_value FROM t - -Pandas - -:: - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> s = pd.Series(df.value.sum(), index=df.index, name="sum_value") - >>> s # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> t[ - ... t, t.value.sum().name("sum_value") - ... ].sum_value # quartodoc: +SKIP # doctest: +SKIP - - -``group_by``, no ``order_by``: ``context.Transform()`` ------------------------------------------------------- - -This performs an aggregation per group and repeats it across every row in the -group. - -SQL - -:: - - SELECT SUM(value) OVER (PARTITION BY key) AS sum_value - FROM t - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.groupby("key").value.transform("sum") # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> t.value.sum().over( - ... ibis.window(group_by=t.key) - ... ) # quartodoc: +SKIP # doctest: +SKIP - -``order_by``, no ``group_by``: ``context.Cumulative()``/``context.Rolling()`` ------------------------------------------------------------------------------ - -Cumulative and trailing window operations. - -Cumulative -~~~~~~~~~~ - -Also called expanding. - -SQL - -:: - - SELECT SUM(value) OVER ( - ORDER BY time ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.sort_values("time").value.cumsum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.cumulative_window(order_by=t.time) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP - -Moving -~~~~~~ - -Also called referred to as "rolling" in other libraries such as pandas. - -SQL - -:: - - SELECT SUM(value) OVER ( - ORDER BY time ROWS BETWEEN 3 PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.sort_values("time").value.rolling( - ... 3 - ... 
).sum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.trailing_window(3, order_by=t.time) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP - - -``group_by`` and ``order_by``: ``context.Cumulative()``/``context.Rolling()`` ------------------------------------------------------------------------------ - -This performs a cumulative or rolling operation within a group. - -SQL - -:: - - SELECT SUM(value) OVER ( - PARTITION BY key ORDER BY time ROWS BETWEEN 4 PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> sorter = lambda df: df.sort_values("time") - >>> gb = ( - ... df.groupby("key", group_keys=False) - ... .apply(sorter) - ... .reset_index(drop=True) - ... .groupby("key") - ... ) - >>> rolling = gb.value.rolling(2) - >>> rolling.sum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.trailing_window(2, order_by=t.time, group_by=t.key) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP -""" - -from __future__ import annotations - -import abc -import functools -import itertools -import operator -from typing import TYPE_CHECKING, Any, Callable - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base.df.timecontext import ( - construct_time_context_aware_series, - get_time_col, -) - -if TYPE_CHECKING: - from collections.abc import Iterator - - import numpy as np - - -class AggregationContext(abc.ABC): - __slots__ = ( - "parent", - "group_by", - "order_by", - "dtype", - "max_lookback", - "output_type", - ) - - def __init__( - self, - parent=None, - group_by=None, - order_by=None, - max_lookback=None, - output_type=None, - ): - self.parent = parent - self.group_by = group_by - self.order_by = order_by - self.dtype = None if output_type is None else output_type.to_pandas() - self.output_type = output_type - self.max_lookback = max_lookback - - @abc.abstractmethod - def agg(self, grouped_data, function, *args, **kwargs): - pass - - -def wrap_for_apply( - function: Callable, - args: tuple[Any, ...] | None = None, - kwargs: dict[str, Any] | None = None, -) -> Callable: - """Wrap a function for use with Pandas `apply`. - - Parameters - ---------- - function : Callable - A function to be used with Pandas `apply`. - args : Optional[Tuple[Any, ...]] - args to be passed to function when it is called by Pandas `apply` - kwargs : Optional[Dict[str, Any]] - kwargs to be passed to function when it is called by Pandas `apply` - """ - assert callable(function), f"function {function} is not callable" - - new_args: tuple[Any, ...] = () - if args is not None: - new_args = args - - new_kwargs: dict[str, Any] = {} - if kwargs is not None: - new_kwargs = kwargs - - @functools.wraps(function) - def wrapped_func( - data: Any, - function: Callable = function, - args: tuple[Any, ...] 
= new_args, - kwargs: dict[str, Any] = new_kwargs, - ) -> Callable: - return function(data, *args, **kwargs) - - return wrapped_func - - -def wrap_for_agg( - function: Callable, - args: tuple[Any, ...], - kwargs: dict[str, Any], -) -> Callable: - """Wrap a function for use with Pandas `agg`. - - This includes special logic that will force Pandas `agg` to always treat - the function as an aggregation function. Details: - - When passed a function, Pandas `agg` will either: - 1) Behave like Pandas `apply` and treat the function as a N->N mapping - function (i.e. calls the function once for every value in the Series - that `agg` is being called on), OR - 2) Treat the function as a N->1 aggregation function (i.e. calls the - function once on the entire Series) - Pandas `agg` will use behavior #1 unless an error is raised when doing so. - - We want to force Pandas `agg` to use behavior #2. To do this, we will wrap - the function with logic that checks that a Series is being passed in, and - raises a TypeError otherwise. When Pandas `agg` is attempting to use - behavior #1 but sees the TypeError, it will fall back to behavior #2. - - Parameters - ---------- - function : Callable - An aggregation function to be used with Pandas `agg`. - args : Tuple[Any, ...] - args to be passed to function when it is called by Pandas `agg` - kwargs : Dict[str, Any] - kwargs to be passed to function when it is called by Pandas `agg` - """ - assert callable(function), f"function {function} is not callable" - - @functools.wraps(function) - def wrapped_func( - data: Any, - function: Callable = function, - args: tuple[Any, ...] = args, - kwargs: dict[str, Any] = kwargs, - ) -> Callable: - # `data` will be a scalar here if Pandas `agg` is trying to behave like - # like Pandas `apply`. - if not isinstance(data, pd.Series): - # Force `agg` to NOT behave like `apply`. We want Pandas to use - # `function` as an aggregation function, not as a mapping function. - raise TypeError( - f"This function expects a Series, but saw an object of type " - f"{type(data)} instead." - ) - return function(data, *args, **kwargs) - - return wrapped_func - - -class Summarize(AggregationContext): - __slots__ = () - - def agg(self, grouped_data, function, *args, **kwargs): - if isinstance(function, str): - return getattr(grouped_data, function)(*args, **kwargs) - - if not callable(function): - raise TypeError(f"Object {function} is not callable or a string") - - if isinstance(grouped_data, pd.core.groupby.generic.SeriesGroupBy) and len( - grouped_data - ): - # `SeriesGroupBy.agg` does not allow np.arrays to be returned - # from UDFs. To avoid `SeriesGroupBy.agg`, we will call the - # aggregation function manually on each group. (#2768) - aggs = {} - for k, v in grouped_data: - func_args = [d.get_group(k) for d in args] - aggs[k] = function(v, *func_args, **kwargs) - grouped_col_name = v.name - return ( - pd.Series(aggs) - .rename(grouped_col_name) - .rename_axis(grouped_data.grouper.names) - ) - else: - return grouped_data.agg(wrap_for_agg(function, args, kwargs)) - - -class Transform(AggregationContext): - __slots__ = () - - def agg(self, grouped_data, function, *args, **kwargs): - # If this is a multi column UDF, then we cannot use - # "transform" here (Data must be 1-dimensional) - # Instead, we need to use "apply", which can return a non - # numeric type, e.g, tuple of two double. 
- if self.output_type.is_struct(): - res = grouped_data.apply(function, *args, **kwargs) - else: - res = grouped_data.transform(function, *args, **kwargs) - - # The result series uses the name of the input. We should - # unset it to avoid confusion, when result is not guaranteed - # to be the same series / have the same type after transform - res.name = None - return res - - -@functools.singledispatch -def compute_window_spec(dtype, obj): - raise com.IbisTypeError( - f"Unknown dtype type {dtype} and object {obj} for compute_window_spec" - ) - - -@compute_window_spec.register(dt.Integer) -def compute_window_spec_none(_, obj): - """Helper method only used for row-based windows. - - Window spec in ibis is an inclusive window bound. A bound of 0 - indicates the current row. Window spec in Pandas indicates window - size. Therefore, we must add 1 to the ibis window bound to get the - expected behavior. - """ - from ibis.backends.pandas.core import execute - - value = execute(obj) - return value + 1 - - -@compute_window_spec.register(dt.Interval) -def compute_window_spec_interval(_, obj): - from ibis.backends.pandas.core import execute - - value = execute(obj) - return pd.tseries.frequencies.to_offset(value) - - -def window_agg_built_in( - frame: pd.DataFrame, - windowed: pd.core.window.Window, - function: str, - max_lookback: ops.Literal, - *args: tuple[Any, ...], - **kwargs: dict[str, Any], -) -> pd.Series: - """Apply window aggregation with built-in aggregators.""" - assert isinstance(function, str) - method = operator.methodcaller(function, *args, **kwargs) - - if max_lookback is not None: - agg_method = method - - def sliced_agg(s): - return agg_method(s.iloc[-max_lookback.value :]) - - method = operator.methodcaller("apply", sliced_agg, raw=False) - - result = method(windowed) - index = result.index - result.index = pd.MultiIndex.from_arrays( - [frame.index] + list(map(index.get_level_values, range(index.nlevels))), - names=[frame.index.name] + index.names, - ) - return result - - -def create_window_input_iter( - grouped_data: SeriesGroupBy | pd.Series, - masked_window_lower_indices: pd.Series, - masked_window_upper_indices: pd.Series, -) -> Iterator[np.ndarray]: - # create a generator for each input series - # the generator will yield a slice of the - # input series for each valid window - data = getattr(grouped_data, "obj", grouped_data).values - lower_indices_array = masked_window_lower_indices.values - upper_indices_array = masked_window_upper_indices.values - for i in range(len(lower_indices_array)): - lower_index = lower_indices_array[i] - upper_index = upper_indices_array[i] - yield data[lower_index:upper_index] - - -def window_agg_udf( - grouped_data: SeriesGroupBy, - function: Callable, - window_lower_indices: pd.Series, - window_upper_indices: pd.Series, - mask: pd.Series, - result_index: pd.Index, - dtype: np.dtype, - max_lookback: int, - *args: tuple[Any, ...], - **kwargs: dict[str, Any], -) -> pd.Series: - """Apply window aggregation with UDFs. - - Notes - ----- - Use custom logic to computing rolling window UDF instead of - using pandas's rolling function. - This is because pandas's rolling function doesn't support - multi param UDFs. 
- """ - assert len(window_lower_indices) == len(window_upper_indices) - assert len(window_lower_indices) == len(mask) - - # Reset index here so we don't need to deal with mismatching - # indices - window_lower_indices = window_lower_indices.reset_index(drop=True) - window_upper_indices = window_upper_indices.reset_index(drop=True) - mask = mask.reset_index(drop=True) - - # Compute window indices and manually roll - # over the window. - - # If an window has only nan values, we output nan for - # the window result. This follows pandas rolling apply - # behavior. - - # The first input column is in grouped_data, but there may - # be additional input columns in args. - inputs = (grouped_data,) + args - - masked_window_lower_indices = window_lower_indices[mask].astype("i8") - masked_window_upper_indices = window_upper_indices[mask].astype("i8") - - input_iters = [ - create_window_input_iter( - arg, masked_window_lower_indices, masked_window_upper_indices - ) - if isinstance(arg, (pd.Series, SeriesGroupBy)) - else itertools.repeat(arg) - for arg in inputs - ] - - valid_result = pd.Series( - function(*(next(gen) for gen in input_iters)) - for i in range(len(masked_window_lower_indices)) - ) - - valid_result = pd.Series(valid_result) - valid_result.index = masked_window_lower_indices.index - result = pd.Series(index=mask.index, dtype=dtype) - result[mask] = valid_result - result.index = result_index - - return result - - -class Window(AggregationContext): - __slots__ = ("construct_window",) - - def __init__(self, kind, *args, **kwargs): - super().__init__( - parent=kwargs.pop("parent", None), - group_by=kwargs.pop("group_by", None), - order_by=kwargs.pop("order_by", None), - output_type=kwargs.pop("output_type"), - max_lookback=kwargs.pop("max_lookback", None), - ) - self.construct_window = operator.methodcaller(kind, *args, **kwargs) - - def agg( - self, - grouped_data: pd.Series | SeriesGroupBy, - function: str | Callable, - *args: Any, - **kwargs: Any, - ) -> pd.Series: - # avoid a pandas warning about numpy arrays being passed through - # directly - group_by = self.group_by - order_by = self.order_by - - assert group_by or order_by - - # Get the DataFrame from which the operand originated - # (passed in when constructing this context object in - # execute_node(ops.Window)) - parent = self.parent - frame = getattr(parent, "obj", parent) - obj = getattr(grouped_data, "obj", grouped_data) - name = obj.name - if frame[name] is not obj or name in group_by or name in order_by: - name = f"{name}_{ibis.util.guid()}" - frame = frame.assign(**{name: obj}) - - # set the index to our order_by keys and append it to the existing - # index - # TODO: see if we can do this in the caller, when the context - # is constructed rather than pulling out the data - columns = group_by + order_by + [name] - # Create a new frame to avoid mutating the original one - indexed_by_ordering = frame[columns].copy() - # placeholder column to compute window_sizes below - indexed_by_ordering["_placeholder"] = 0 - indexed_by_ordering = indexed_by_ordering.set_index(order_by) - - # regroup if needed - if group_by: - grouped_frame = indexed_by_ordering.groupby(group_by, group_keys=False) - else: - grouped_frame = indexed_by_ordering - grouped = grouped_frame[name] - - if callable(function): - # To compute the window_size, we need to construct a - # RollingGroupby and compute count using construct_window. 
- # However, if the RollingGroupby is not numeric, e.g., - # we are calling window UDF on a timestamp column, we - # cannot compute rolling count directly because: - # (1) windowed.count() will exclude NaN observations - # , which results in incorrect window sizes. - # (2) windowed.apply(len, raw=True) will include NaN - # observations, but doesn't work on non-numeric types. - # https://github.com/pandas-dev/pandas/issues/23002 - # To deal with this, we create a _placeholder column - - windowed_frame = self.construct_window(grouped_frame) - window_sizes = windowed_frame["_placeholder"].count().reset_index(drop=True) - mask = ~(window_sizes.isna()) - window_upper_indices = pd.Series(range(len(window_sizes))) + 1 - window_lower_indices = window_upper_indices - window_sizes - # The result Series of udf may need to be trimmed by - # timecontext. In order to do so, 'time' must be added - # as an index to the Series, if present. Here We extract - # time column from the parent Dataframe `frame`. - if get_time_col() in frame: - result_index = construct_time_context_aware_series(obj, frame).index - else: - result_index = obj.index - result = window_agg_udf( - grouped_data, - function, - window_lower_indices, - window_upper_indices, - mask, - result_index, - self.dtype, - self.max_lookback, - *args, - **kwargs, - ) - else: - # perform the per-group rolling operation - windowed = self.construct_window(grouped) - result = window_agg_built_in( - frame, - windowed, - function, - self.max_lookback, - *args, - **kwargs, - ) - try: - return result.astype(self.dtype, copy=False) - except (TypeError, ValueError): - return result - - -class Cumulative(Window): - __slots__ = () - - def __init__(self, *args, **kwargs): - super().__init__("expanding", *args, **kwargs) - - -class Moving(Window): - __slots__ = () - - def __init__(self, start, max_lookback, *args, **kwargs): - from ibis.backends.pandas.core import timedelta_types - - start = compute_window_spec(start.dtype, start.value) - if isinstance(start, timedelta_types + (pd.offsets.DateOffset,)): - closed = "both" - else: - closed = None - - super().__init__( - "rolling", - start, - *args, - max_lookback=max_lookback, - closed=closed, - min_periods=1, - **kwargs, - ) - - def short_circuit_method(self, grouped_data, function): - raise AttributeError("No short circuit method for rolling operations") diff --git a/ibis/backends/pandas/convert.py b/ibis/backends/pandas/convert.py new file mode 100644 index 0000000000000..76528d3e92582 --- /dev/null +++ b/ibis/backends/pandas/convert.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import pandas as pd +import pandas.api.types as pdt + +import ibis.expr.datatypes as dt +from ibis.formats.pandas import DataMapper, PandasType + + +class PandasConverter(DataMapper): + @classmethod + def convert_scalar(cls, obj, dtype): + series = pd.Series([obj]) + casted = cls.convert_column(series, dtype) + return casted[0] + + @classmethod + def convert_column(cls, obj, dtype): + pandas_type = PandasType.from_ibis(dtype) + + method_name = f"convert_{dtype.__class__.__name__}" + convert_method = getattr(cls, method_name, cls.convert_default) + + return convert_method(obj, dtype, pandas_type) + + @classmethod + def convert_default(cls, s, dtype, pandas_type): + if pandas_type == object: + func = lambda x: x if x is pd.NA else dt.normalize(dtype, x) + return s.map(func, na_action="ignore").astype(pandas_type) + else: + return s.astype(pandas_type) + + @classmethod + def convert_Integer(cls, s, dtype, pandas_type): + if 
pdt.is_datetime64_any_dtype(s.dtype): + return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) + else: + return s.astype(pandas_type, errors="ignore") + + convert_SignedInteger = convert_UnsignedInteger = convert_Integer + convert_Int64 = convert_Int32 = convert_Int16 = convert_Int8 = convert_SignedInteger + convert_UInt64 = ( + convert_UInt32 + ) = convert_UInt16 = convert_UInt8 = convert_UnsignedInteger + + @classmethod + def convert_Floating(cls, s, dtype, pandas_type): + if pdt.is_datetime64_any_dtype(s.dtype): + return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) + else: + return s.astype(pandas_type, errors="ignore") + + convert_Float64 = convert_Float32 = convert_Float16 = convert_Floating + + @classmethod + def convert_Timestamp(cls, s, dtype, pandas_type): + if isinstance(dtype, pd.DatetimeTZDtype): + return s.dt.tz_convert(dtype.timezone) + elif pdt.is_datetime64_dtype(s.dtype): + return s.dt.tz_localize(dtype.timezone) + elif pdt.is_numeric_dtype(s.dtype): + return pd.to_datetime(s, unit="s").dt.tz_localize(dtype.timezone) + else: + try: + return s.astype(pandas_type) + except TypeError: + try: + return pd.to_datetime(s).dt.tz_convert(dtype.timezone) + except TypeError: + return pd.to_datetime(s).dt.tz_localize(dtype.timezone) + + @classmethod + def convert_Date(cls, s, dtype, pandas_type): + if isinstance(s.dtype, pd.DatetimeTZDtype): + s = s.dt.tz_convert("UTC").dt.tz_localize(None) + elif pdt.is_numeric_dtype(s.dtype): + s = pd.to_datetime(s, unit="D") + else: + s = pd.to_datetime(s).astype(pandas_type, errors="ignore") + + return s.dt.normalize() + + @classmethod + def convert_String(cls, s, dtype, pandas_type): + # TODO(kszucs): should switch to the new pandas string type and convert + # object columns using s.convert_dtypes() method + return s.map(str, na_action="ignore").astype(object) diff --git a/ibis/backends/pandas/core.py b/ibis/backends/pandas/core.py deleted file mode 100644 index ef29b2bb29cc6..0000000000000 --- a/ibis/backends/pandas/core.py +++ /dev/null @@ -1,605 +0,0 @@ -"""The pandas backend. - -The pandas backend is a departure from the typical ibis backend in that it -doesn't compile to anything, and the execution of the ibis expression is under -the purview of ibis itself rather than executing SQL on a server. - -Design ------- -The pandas backend uses a technique called `multiple dispatch -`_, implemented in a -third-party open source library called `multipledispatch -`_. - -Multiple dispatch is a generalization of standard single-dispatch runtime -polymorphism to multiple arguments. - -Compilation ------------ -This is a no-op because we execute ibis expressions directly. - -Execution ---------- -Execution is divided into different dispatched functions, each arising from -a different use case. - -A top level function `execute` exists to provide the API for executing an ibis -expression against in-memory data. - -The general flow of execution is: - -:: - If the current operation is in scope: - return it - Else: - execute the arguments of the current node - - execute the current node with its executed arguments - -Specifically, execute is comprised of a series of steps that happen at -different times during the loop. - -1. ``compute_time_context`` ---------------------------- -First, at the beginning of the main execution loop, ``compute_time_context`` is -called. This function computes time contexts, and pass them to all children of -the current node. These time contexts could be used in later steps to get data. 
-This is essential for time series Table, and related operations that adjust -time context, such as window, asof_join, etc. - -By default, this function simply pass the unchanged time context to all -children nodes. - - -2. ``pre_execute`` ------------------- -Second, ``pre_execute`` is called. -This function serves a similar purpose to ``data_preload``, the key difference -being that ``pre_execute`` is called *every time* there's a call to execute. - -By default this function does nothing. - -3. ``execute_node`` -------------------- - -Then, when an expression is ready to be evaluated we call -:func:`~ibis.backends.pandas.core.execute` on the expressions arguments and -then :func:`~ibis.backends.pandas.dispatch.execute_node` on the expression -with its now-materialized arguments. - -4. ``post_execute`` -------------------- -The final step--``post_execute``--is called immediately after the previous call -to ``execute_node`` and takes the instance of the -:class:`~ibis.expr.operations.Node` just computed and the result of the -computation. - -The purpose of this function is to allow additional computation to happen in -the context of the current level of the execution loop. You might be wondering -That may sound vague, so let's look at an example. - -Let's say you want to take a three day rolling average, and you want to include -3 days of data prior to the first date of the input. You don't want to see that -data in the result for a few reasons, one of which is that it would break the -contract of window functions: given N rows of input there are N rows of output. - -Defining a ``post_execute`` rule for :class:`~ibis.expr.operations.Window` -allows you to encode such logic. One might want to implement this using -:class:`~ibis.expr.operations.ScalarParameter`, in which case the ``scope`` -passed to ``post_execute`` would be the bound values passed in at the time the -``execute`` method was called. - - -Scope ------ -Scope is used across the execution phases, it iss a map that maps Ibis -operators to actual data. It is used to cache data for calculated ops. It is -an optimization to reused executed results. - -With time context included, the key is op associated with each expression; -And scope value is another key-value map: -- value: pd.DataFrame or pd.Series that is the result of executing key op -- timecontext: of type TimeContext, the time context associated with the data -stored in value - -See ibis.common.scope for details about the implementation. 
-""" - -from __future__ import annotations - -import datetime -import functools -import numbers -from typing import TYPE_CHECKING, Any, Callable - -import numpy as np -import pandas as pd -from multipledispatch import Dispatcher - -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base import BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import TimeContext, canonicalize_context -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.dispatch import ( - execute_literal, - execute_node, - post_execute, - pre_execute, -) -from ibis.backends.pandas.trace import trace - -if TYPE_CHECKING: - from collections.abc import Iterable, Mapping - -integer_types = np.integer, int -floating_types = (numbers.Real,) -numeric_types = integer_types + floating_types -boolean_types = bool, np.bool_ -fixed_width_types = numeric_types + boolean_types -date_types = (datetime.date,) -time_types = (datetime.time,) -timestamp_types = pd.Timestamp, datetime.datetime, np.datetime64 -timedelta_types = pd.Timedelta, datetime.timedelta, np.timedelta64 -temporal_types = date_types + time_types + timestamp_types + timedelta_types -scalar_types = fixed_width_types + temporal_types -simple_types = scalar_types + (str, type(None)) - - -@functools.singledispatch -def is_computable_input(arg): - """All inputs are not computable without a specific override.""" - return False - - -@is_computable_input.register(BaseBackend) -@is_computable_input.register(ops.Node) -@is_computable_input.register(dt.DataType) -@is_computable_input.register(type(None)) -@is_computable_input.register(tuple) -def is_computable_input_arg(arg): - """Return whether `arg` is a valid computable argument.""" - return True - - -# Register is_computable_input for each scalar type (int, float, date, etc). -# We use consume here to avoid leaking the iteration variable into the module. -ibis.util.consume( - is_computable_input.register(t)(is_computable_input_arg) for t in scalar_types -) - - -def execute_with_scope( - node: ops.Node, - scope: Scope, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - clients=None, - **kwargs: Any, -): - """Execute an expression `expr`, with data provided in `scope`. - - Parameters - ---------- - node - The operation node to execute. - scope - A Scope class, with dictionary mapping `ibis.expr.operations.Node` - subclass instances to concrete data such as a pandas DataFrame. - timecontext - A tuple of (begin, end) that is passed from parent Node to children - see [timecontext.py](ibis/backends/pandas/execution/timecontext.py) for - detailed usage for this time context. - aggcontext - Aggregation context - clients - Iterable of clients - kwargs - Keyword arguments - """ - # Call pre_execute, to allow clients to intercept the expression before - # computing anything *and* before associating leaf nodes with data. This - # allows clients to provide their own data for each leaf. 
- if clients is None: - clients, _ = node.to_expr()._find_backends() - - if aggcontext is None: - aggcontext = agg_ctx.Summarize() - - pre_executed_scope = pre_execute( - node, - *clients, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - **kwargs, - ) - new_scope = scope.merge_scope(pre_executed_scope) - result = execute_until_in_scope( - node, - new_scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - # XXX: we *explicitly* pass in scope and not new_scope here so that - # post_execute sees the scope of execute_with_scope, not the scope of - # execute_until_in_scope - post_execute_=functools.partial( - post_execute, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ), - **kwargs, - ).get_value(node, timecontext) - return result - - -@trace -def execute_until_in_scope( - node, - scope: Scope, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - clients: Iterable | None = None, - post_execute_: Callable | None = None, - **kwargs: Any, -) -> Scope: - """Execute until our op is in `scope`.""" - # these should never be None - assert aggcontext is not None, "aggcontext is None" - assert clients is not None, "clients is None" - assert post_execute_ is not None, "post_execute_ is None" - - # base case: our op has been computed (or is a leaf data node), so - # return the corresponding value - if scope.get_value(node, timecontext) is not None: - return scope - if isinstance(node, ops.Literal): - # special case literals to avoid the overhead of dispatching - # execute_node - return Scope( - { - node: execute_literal( - node, - node.value, - node.dtype, - aggcontext=aggcontext, - **kwargs, - ) - }, - timecontext, - ) - - # figure out what arguments we're able to compute on based on the - # expressions inputs. things like expressions, None, and scalar types are - # computable whereas ``list``s are not - computable_args = [ - arg for arg in get_node_arguments(node) if is_computable_input(arg) - ] - - # pre_executed_states is a list of states with same the length of - # computable_args, these states are passed to each arg - if timecontext: - arg_timecontexts = compute_time_context( - node, - num_args=len(computable_args), - timecontext=timecontext, - clients=clients, - scope=scope, - ) - else: - arg_timecontexts = [None] * len(computable_args) - - pre_executed_scope = pre_execute( - node, - *clients, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - **kwargs, - ) - - new_scope = scope.merge_scope(pre_executed_scope) - - # Short circuit: if pre_execute puts op in scope, then we don't need to - # execute its computable_args - if new_scope.get_value(node, timecontext) is not None: - return new_scope - - # recursively compute each node's arguments until we've changed type. - # compute_time_context should return with a list with the same length - # as computable_args, the two lists will be zipping together for - # further execution - if len(arg_timecontexts) != len(computable_args): - raise com.IbisError( - "arg_timecontexts differ with computable_arg in length " - f"for type:\n{type(node).__name__}." 
- ) - - scopes = [ - execute_until_in_scope( - arg, - new_scope, - timecontext=timecontext, - aggcontext=aggcontext, - post_execute_=post_execute_, - clients=clients, - **kwargs, - ) - if isinstance(arg, ops.Node) - else Scope({arg: arg}, timecontext) - for (arg, timecontext) in zip(computable_args, arg_timecontexts) - ] - - # if we're unable to find data then raise an exception - if not scopes and computable_args: - raise com.UnboundExpressionError(f"Unable to find data for node:\n{node!r}") - - # there should be exactly one dictionary per computable argument - assert len(computable_args) == len(scopes) - - new_scope = new_scope.merge_scopes(scopes) - # pass our computed arguments to this node's execute_node implementation - data = [ - new_scope.get_value(arg, timecontext) if isinstance(arg, ops.Node) else arg - for (arg, timecontext) in zip(computable_args, arg_timecontexts) - ] - result = execute_node( - node, - *data, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ) - computed = post_execute_( - node, result, timecontext=timecontext, aggcontext=aggcontext, **kwargs - ) - return Scope({node: computed}, timecontext) - - -execute = Dispatcher("execute") - - -@execute.register(ops.Node) -@trace -def main_execute( - node: ops.Node, - params: Mapping[ops.Node, Any] | None = None, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - cache: Mapping[ops.Node, Any] | None = None, - **kwargs: Any, -): - """Execute an expression against data that are bound to it. - - If no data are bound, raise an Exception. - - Parameters - ---------- - node : ibis.expr.operations.Node - The operation node to execute - params : Mapping[ibis.expr.operations.Node, object] - The data that an unbound parameter in `node` maps to - scope : Mapping[ibis.expr.operations.Node, object] - Additional scope, mapping ibis operations to data - timecontext : Optional[TimeContext] - timecontext needed for execution - aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext] - An object indicating how to compute aggregations. For example, - a rolling mean needs to be computed differently than the mean of a - column. - cache - Mapping for storing computation results. - kwargs - Additional arguments that can potentially be used by individual node - execution - - Returns - ------- - result : Union[ - pandas.Series, pandas.DataFrame, ibis.backends.pandas.core.simple_types - ] - - Raises - ------ - ValueError - * If no data are bound to the input expression - """ - if scope is None: - scope = Scope() - - if timecontext is not None: - # convert timecontext to datetime type, if time strings are provided - timecontext = canonicalize_context(timecontext) - - if params is None: - params = {} - - if cache is None: - cache = {} - - scope = scope.merge_scope(Scope(params, timecontext)) - return execute_with_scope( - node, - scope, - timecontext=timecontext, - aggcontext=aggcontext, - cache=cache, - **kwargs, - ) - - -def execute_and_reset( - node, - params=None, - scope=None, - timecontext: TimeContext | None = None, - aggcontext=None, - **kwargs, -): - """Execute an expression against data that are bound to it. - - If no data are bound, raise an Exception. - - The difference between this function and - `ibis.backends.pandas.core.execute` is that this function resets the index - of the result, if the result has an index. 
-
-    Parameters
-    ----------
-    node : ibis.expr.operations.Node
-        The operation node to execute
-    params : Mapping[ibis.expr.operations.Node, object]
-        The data that an unbound parameter in `node` maps to
-    scope : Mapping[ibis.expr.operations.Node, object]
-        Additional scope, mapping ibis operations to data
-    timecontext : Optional[TimeContext]
-        The time context needed for execution
-    aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext]
-        An object indicating how to compute aggregations. For example,
-        a rolling mean needs to be computed differently than the mean of a
-        column.
-    kwargs : Dict[str, object]
-        Additional arguments that can potentially be used by individual node
-        execution
-
-    Returns
-    -------
-    pandas.Series | pandas.DataFrame | ibis.backends.pandas.core.simple_types
-        Result of execution
-
-    Raises
-    ------
-    ValueError
-        * If no data are bound to the input expression
-    """
-    result = execute(
-        node,
-        params=params,
-        scope=scope,
-        timecontext=timecontext,
-        aggcontext=aggcontext,
-        **kwargs,
-    )
-    return _apply_schema(node, result)
-
-
-def _apply_schema(op: ops.Node, result: pd.DataFrame | pd.Series):
-    from ibis.formats.pandas import PandasData
-
-    assert isinstance(op, ops.Node), type(op)
-    if isinstance(result, pd.DataFrame):
-        df = result.reset_index().loc[:, list(op.schema.names)]
-        return PandasData.convert_table(df, op.schema)
-    elif isinstance(result, pd.Series):
-        schema = op.to_expr().as_table().schema()
-        df = PandasData.convert_table(result.to_frame(), schema)
-        return df.iloc[:, 0].reset_index(drop=True)
-    else:
-        return result
-
-
-compute_time_context = Dispatcher(
-    "compute_time_context",
-    doc="""Compute the time context for a node in execution.
-
-Notes
------
-For a given node, return a list of timecontexts that are going to be
-passed to its child nodes.
-
-Time context is useful when data is not uniquely defined by the op tree. For
-example, a table `t` can represent the query `SELECT count(a) FROM table`, but
-the result of that is different with time context `(pd.Timestamp("20190101"),
-pd.Timestamp("20200101"))` vs `(pd.Timestamp("20200101"),
-pd.Timestamp("20210101"))` because what data is in `table` also depends on
-the time context. Such context may be different for different nodes, that is,
-each node may have a different time context.
-
-This function computes attributes that are going to be used in execution and
-passes these attributes to child nodes.
-
-Parameters
-----------
-clients : List[ibis.backends.base.BaseBackend]
-    Backends for execution
-timecontext : Optional[TimeContext]
-    Begin and end time context needed for execution
-
-Returns
--------
-List[Optional[TimeContext]]
-    A list of timecontexts for the child nodes of the current node. Note that
-    timecontexts are calculated only for child nodes that are computable args.
-    The length of the returned list is the same as the length of the computable
-    inputs.
- See `computable_args` in `execute_until_in_scope` -""", -) - - -@compute_time_context.register(ops.Node) -def compute_time_context_default( - node: ops.Node, - scope: Scope, - timecontext: TimeContext | None = None, - **kwargs, -): - return [timecontext for arg in get_node_arguments(node) if is_computable_input(arg)] - - -get_node_arguments = Dispatcher("get_node_arguments") - - -@get_node_arguments.register(ops.Node) -def get_node_arguments_default(node): - return node.args - - -@get_node_arguments.register(ops.ScalarParameter) -def get_node_arguments_parameter(node): - return () - - -@get_node_arguments.register(ops.DatabaseTable) -def get_node_arguments_table(node): - return (node.source,) - - -@get_node_arguments.register(ops.DropNa) -def get_node_arguments_dropna(node): - return (node.table,) - - -@get_node_arguments.register(ops.Selection) -def get_node_arguments_selection(node): - return (node.table,) - - -@get_node_arguments.register(ops.Aggregation) -def get_node_arguments_aggregation(node): - return (node.table,) - - -@get_node_arguments.register(ops.WindowFunction) -def get_node_arguments_window(node): - return get_node_arguments(node.func)[:1] - - -@get_node_arguments.register( - ( - ops.ElementWiseVectorizedUDF, - ops.ReductionVectorizedUDF, - ops.AnalyticVectorizedUDF, - ) -) -def get_node_arguments_udf(node): - return node.func_args diff --git a/ibis/backends/pandas/dispatch.py b/ibis/backends/pandas/dispatch.py deleted file mode 100644 index b5e080ade3bb8..0000000000000 --- a/ibis/backends/pandas/dispatch.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -from functools import partial - -from multipledispatch import Dispatcher - -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base import BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.trace import TraceTwoLevelDispatcher - -# Individual operation execution -execute_node = TraceTwoLevelDispatcher( - "execute_node", - doc=( - "Execute an individual operation given the operation and its computed " - "arguments" - ), -) - - -@execute_node.register(ops.Node, [object]) -def raise_unknown_op(node, *args, **kwargs): - signature = ", ".join(type(arg).__name__ for arg in args) - raise com.OperationNotDefinedError( - "Operation is not implemented for this backend with " - f"signature: execute_node({type(node).__name__}, {signature})" - ) - - -@execute_node.register(ops.TableNode) -def raise_unknown_table_node(node, **kwargs): - raise com.UnboundExpressionError( - f"Node of type {type(node).__name__!r} has no data bound to it. " - "You probably tried to execute an expression without a data " - "source." - ) - - -pre_execute = Dispatcher( - "pre_execute", - doc="""\ -Given a node, compute a (possibly partial) scope prior to standard execution. - -Notes ------ -This function is useful if parts of the tree structure need to be executed at -the same time or if there are other reasons to need to interrupt the regular -depth-first traversal of the tree. 
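As a hedged illustration of the kind of rule this ``pre_execute`` dispatcher supported (the same multipledispatch pattern is used by ``compute_time_context`` above), the sketch below registers a hook that binds a leaf table node directly to its data, so the regular depth-first traversal never has to descend below it. ``MyTable``, ``pre_execute_demo``, and the plain ``dict`` scope are invented for the example; only ``multipledispatch.Dispatcher`` and pandas are real dependencies:

```python
import pandas as pd
from multipledispatch import Dispatcher

# Hypothetical stand-in for the removed pre_execute dispatcher.
pre_execute_demo = Dispatcher("pre_execute_demo")


class MyTable:
    """Toy leaf op that wraps a pandas DataFrame."""

    def __init__(self, df: pd.DataFrame):
        self.df = df


@pre_execute_demo.register(object)
def pre_execute_default(op, **kwargs):
    # Default rule: contribute nothing to the scope.
    return {}


@pre_execute_demo.register(MyTable)
def pre_execute_my_table(op, **kwargs):
    # Bind the leaf node to its concrete data up front, so execution can
    # short-circuit instead of recursing into this part of the tree.
    return {op: op.df}


table = MyTable(pd.DataFrame({"a": [1, 2, 3]}))
partial_scope = pre_execute_demo(table)
assert partial_scope[table] is table.df
```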
-""", -) - - -# Default returns an empty scope -@pre_execute.register(ops.Node) -@pre_execute.register(ops.Node, BaseBackend) -def pre_execute_default(node, *clients, **kwargs): - return Scope() - - -# Merge the results of all client pre-execution with scope -@pre_execute.register(ops.Node, [BaseBackend]) -def pre_execute_multiple_clients(node, *clients, scope=None, **kwargs): - scope = scope.merge_scopes( - list(map(partial(pre_execute, node, scope=scope, **kwargs), clients)) - ) - return scope - - -execute_literal = Dispatcher( - "execute_literal", - doc="""\ -Special case literal execution to avoid the dispatching overhead of -``execute_node``. - -Parameters ----------- -op : ibis.expr.operations.Node -value : object - The literal value of the object, e.g., int, float. -datatype : ibis.expr.datatypes.DataType - Used to specialize on expressions whose underlying value is of a different - type than its would-be type. For example, interval values are represented - by an integer. -""", -) - - -post_execute = Dispatcher( - "post_execute", - doc="""\ -Execute code on the result of a computation. - -Parameters ----------- -op : ibis.expr.operations.Node - The operation that was just executed -data : object - The result of the computation -""", -) - - -@post_execute.register(ops.Node, object) -def post_execute_default(op, data, **kwargs): - return data - - -execute = Dispatcher("execute") diff --git a/ibis/backends/pandas/dispatcher.py b/ibis/backends/pandas/dispatcher.py deleted file mode 100644 index 6240c0106c3fd..0000000000000 --- a/ibis/backends/pandas/dispatcher.py +++ /dev/null @@ -1,113 +0,0 @@ -from __future__ import annotations - -from multipledispatch import Dispatcher - - -class TwoLevelDispatcher(Dispatcher): - """A `multipledispatch.Dispatcher` with two levels of dispatching. - - The major change is that this class no longer trigger reorder in - dispatch_iter. Because the majority of the slowness is happening - in reorder, this implementation makes dispatch_iter faster. - Instead, this implementation will trigger reorder in the meta dispatcher - and second level dispatcher. Because the number of registered signatures - for each dispatcher is much smaller in this implementation (In pandas - backend, the number of signatures in one level implementation is - O(1000), and the max number of signatures for the meta dispatcher and - second level dispatcher is O(100)), the overall dispatch_iter is faster. - - This implementation consist of three Dispatcher instance: - - (1) This dispatcher, or the instance of this class itself. This class - inherits Dispatcher to avoid duplicating __call__, cache, ambiguities - detection, as well as properties like ordering and funcs. - - (2) First level dispatcher, aka, meta dispatcher. This is the dispatcher - is used to dispatch to the second level dispatcher using the type of the - first arg. - - (3) Second level dispatcher. This is the actual dispatcher used for linear - searching of matched function given type of args. - - Implementation notes: - - (1) register: - This method will now (a) create the second level dispatcher - if missing and register it with the meta dispatcher. (b) return a function - decorator that will register with all the second level dispatcher. Note - that multiple second level dispatcher could be registered with because this - is supported: - - @foo.register((C1, C2), ...) - - The decorator will also register with this dispatcher so that func and - ordering works properly. 
-
-    (2) dispatch_iter:
-    Instead of searching through self.ordering, this method now searches
-    through:
-    (a) dispatch_iter of the meta dispatcher (to find the matching second
-    level dispatcher), and
-    (b) for each second level dispatcher, its own dispatch_iter.
-    This is correct because the dispatch_iter of the meta dispatcher and of
-    each second level dispatcher searches through registered functions in the
-    proper order (from subclasses to base classes).
-
-    (3) ambiguity detection, ordering, and funcs:
-    Because this dispatcher has the same funcs and ordering properties as
-    multipledispatch.Dispatcher, we can completely reuse the ambiguity
-    detection logic of Dispatcher. Note:
-    (a) we never actually linearly search through the ordering of this
-    dispatcher for dispatching; it is only used for ambiguity detection.
-    (b) deleting an entry from funcs of this dispatcher (i.e. del
-    dispatcher.funcs[A, B]) does not unregister it. Entries in the second
-    level dispatcher also need to be deleted. This is OK because it is not
-    public API.
-
-    Difference in behavior:
-    (1) ambiguity detection
-    Because this implementation does not trigger a total reorder of signatures
-    in dispatch_iter, the ambiguity warning triggers when the user calls
-    "ordering" instead of "dispatch".
-    """
-
-    def __init__(self, name, doc=None):
-        super().__init__(name, doc)
-        self._meta_dispatcher = Dispatcher(f"{name}_meta")
-
-    def register(self, *types, **kwargs):
-        type0 = types[0]
-
-        if isinstance(type0, type):
-            type0 = [type0]
-
-        dispatchers = []
-
-        for t in type0:
-            if (t,) in self._meta_dispatcher.funcs:
-                dispatcher = self._meta_dispatcher.funcs[(t,)]
-            else:
-                dispatcher = Dispatcher(f"{self.name}_{t.__name__}")
-                self._meta_dispatcher.register(t)(dispatcher)
-
-            dispatchers.append((t, dispatcher))
-
-        def _(func):
-            self.add(types, func, **kwargs)
-            for t, dispatcher in dispatchers:
-                dispatcher.add((t, *types[1:]), func, **kwargs)
-            return func
-
-        return _
-
-    def __delitem__(self, types):
-        del self.funcs[types]
-        del self._meta_dispatcher.funcs[types[:1]].funcs[types]
-        if not self._meta_dispatcher.funcs[types[:1]].funcs:
-            del self._meta_dispatcher.funcs[types[1:]]
-
-    def dispatch_iter(self, *types):
-        for dispatcher in self._meta_dispatcher.dispatch_iter(types[0]):
-            func = dispatcher.dispatch(*types)
-            if func is not None:
-                yield func
diff --git a/ibis/backends/pandas/execution/__init__.py b/ibis/backends/pandas/execution/__init__.py
deleted file mode 100644
index 5a79d5166b938..0000000000000
--- a/ibis/backends/pandas/execution/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from __future__ import annotations
-
-from ibis.backends.pandas.execution.arrays import *  # noqa: F403
-from ibis.backends.pandas.execution.decimal import *  # noqa: F403
-from ibis.backends.pandas.execution.generic import *  # noqa: F403
-from ibis.backends.pandas.execution.join import *  # noqa: F403
-from ibis.backends.pandas.execution.maps import *  # noqa: F403
-from ibis.backends.pandas.execution.selection import *  # noqa: F403
-from ibis.backends.pandas.execution.strings import *  # noqa: F403
-from ibis.backends.pandas.execution.structs import *  # noqa: F403
-from ibis.backends.pandas.execution.temporal import *  # noqa: F403
-from ibis.backends.pandas.execution.timecontext import *  # noqa: F403
-from ibis.backends.pandas.execution.window import *  # noqa: F403
diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py
deleted file mode 100644
index 20461f0222413..0000000000000
--- a/ibis/backends/pandas/execution/arrays.py
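For intuition about the two-level scheme implemented by the ``TwoLevelDispatcher`` removed above, here is a hedged, minimal sketch of the idea using two layers of plain ``multipledispatch.Dispatcher`` objects. The op classes, the dispatcher names, and the explicit lambdas are invented for the example; the real class wires the levels together inside ``register`` and ``dispatch_iter`` instead:

```python
from multipledispatch import Dispatcher


class Negate:
    pass


class Add:
    pass


# First level ("meta") dispatcher: maps the op type to a second level dispatcher.
meta = Dispatcher("execute_node_meta")

# One second level dispatcher per op type; each one only sees a handful of
# signatures, so its linear search (and signature reordering) stays cheap.
negate_level2 = Dispatcher("execute_node_Negate")
add_level2 = Dispatcher("execute_node_Add")

meta.register(Negate)(lambda op: negate_level2)
meta.register(Add)(lambda op: add_level2)

negate_level2.register(Negate, int)(lambda op, x: -x)
add_level2.register(Add, int, int)(lambda op, x, y: x + y)


def execute_node(op, *args):
    # Level 1: find the second level dispatcher from the op type alone.
    level2 = meta(op)
    # Level 2: dispatch on the full signature (op type plus argument types).
    return level2(op, *args)


assert execute_node(Negate(), 5) == -5
assert execute_node(Add(), 2, 3) == 5
```

Keeping each second level dispatcher small is what makes the per-lookup linear search cheap, which is the point the docstring above makes with its O(1000) versus O(100) comparison.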
+++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import annotations - -import itertools -import operator -from functools import partial -from typing import TYPE_CHECKING, Any - -import numpy as np -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.dispatch import execute_node - -if TYPE_CHECKING: - from collections.abc import Collection - - -@execute_node.register(ops.Array, tuple) -def execute_array(op, cols, **kwargs): - vals = [execute(arg, **kwargs) for arg in cols] - length = next((len(v) for v in vals if isinstance(v, pd.Series)), None) - - if length is None: - return vals - - def ensure_series(v): - if isinstance(v, pd.Series): - return v - else: - return pd.Series(v, index=range(length)) - - # pd.concat() can only handle array-likes. - # If we're given a scalar, we need to broadcast it as a Series. - df = pd.concat([ensure_series(v) for v in vals], axis=1) - return df.apply(lambda row: np.array(row, dtype=object), axis=1) - - -@execute_node.register(ops.ArrayLength, pd.Series) -def execute_array_length(op, data, **kwargs): - return data.apply(len) - - -@execute_node.register(ops.ArrayLength, (list, np.ndarray)) -def execute_array_length_scalar(op, data, **kwargs): - return len(data) - - -@execute_node.register(ops.ArraySlice, pd.Series, int, (int, type(None))) -def execute_array_slice(op, data, start, stop, **kwargs): - return data.apply(operator.itemgetter(slice(start, stop))) - - -@execute_node.register(ops.ArraySlice, (list, np.ndarray), int, (int, type(None))) -def execute_array_slice_scalar(op, data, start, stop, **kwargs): - return data[start:stop] - - -@execute_node.register(ops.ArrayIndex, pd.Series, int) -def execute_array_index(op, data, index, **kwargs): - return data.apply( - lambda array, index=index: ( - array[index] if -len(array) <= index < len(array) else None - ) - ) - - -@execute_node.register(ops.ArrayIndex, (list, np.ndarray), int) -def execute_array_index_scalar(op, data, index, **kwargs): - try: - return data[index] - except IndexError: - return None - - -@execute_node.register(ops.ArrayContains, (list, np.ndarray), object) -def execute_node_contains_value_array(op, haystack, needle, **kwargs): - return needle in haystack - - -def _concat_iterables_to_series(*iters: Collection[Any]) -> pd.Series: - """Concatenate two collections to create a Series. - - The two collections are assumed to have the same length. - - Used for ArrayConcat implementation. - """ - first, *rest = iters - assert all(len(series) == len(first) for series in rest) - # Doing the iteration using `map` is much faster than doing the iteration - # using `Series.apply` due to Pandas-related overhead. - return pd.Series(map(lambda *args: np.concatenate(args), first, *rest)) - - -@execute_node.register(ops.ArrayConcat, tuple) -def execute_array_concat(op, args, **kwargs): - return execute_node(op, *map(partial(execute, **kwargs), args), **kwargs) - - -@execute_node.register(ops.ArrayConcat, pd.Series, pd.Series, [pd.Series]) -def execute_array_concat_series(op, first, second, *args, **kwargs): - return _concat_iterables_to_series(first, second, *args) - - -@execute_node.register( - ops.ArrayConcat, (list, np.ndarray), pd.Series, [(pd.Series, list, np.ndarray)] -) -def execute_array_concat_mixed_left(op, left, right, *args, **kwargs): - # ArrayConcat given a column (pd.Series) and a scalar (np.ndarray). - # We will broadcast the scalar to the length of the column. 
- # Broadcast `left` to the length of `right` - left = np.tile(left, (len(right), 1)) - return _concat_iterables_to_series(left, right) - - -@execute_node.register( - ops.ArrayConcat, pd.Series, (list, np.ndarray), [(pd.Series, list, np.ndarray)] -) -def execute_array_concat_mixed_right(op, left, right, *args, **kwargs): - # Broadcast `right` to the length of `left` - right = np.tile(right, (len(left), 1)) - return _concat_iterables_to_series(left, right) - - -@execute_node.register( - ops.ArrayConcat, (list, np.ndarray), (list, np.ndarray), [(list, np.ndarray)] -) -def execute_array_concat_scalar(op, left, right, *args, **kwargs): - return np.concatenate([left, right, *args]) - - -@execute_node.register(ops.ArrayRepeat, pd.Series, int) -def execute_array_repeat(op, data, n, **kwargs): - # Negative n will be treated as 0 (repeat will produce empty array) - n = max(n, 0) - return pd.Series(np.tile(arr, n) for arr in data) - - -@execute_node.register(ops.ArrayRepeat, (list, np.ndarray), int) -def execute_array_repeat_scalar(op, data, n, **kwargs): - # Negative n will be treated as 0 (repeat will produce empty array) - return np.tile(data, max(n, 0)) - - -@execute_node.register(ops.ArrayCollect, pd.Series, (type(None), pd.Series)) -def execute_array_collect(op, data, where, aggcontext=None, **kwargs): - return aggcontext.agg(data.loc[where] if where is not None else data, np.array) - - -@execute_node.register(ops.ArrayCollect, SeriesGroupBy, (type(None), pd.Series)) -def execute_array_collect_groupby(op, data, where, aggcontext=None, **kwargs): - return aggcontext.agg( - ( - data.obj.loc[where].groupby(data.grouping.grouper) - if where is not None - else data - ), - np.array, - ) - - -@execute_node.register(ops.Unnest, pd.Series) -def execute_unnest(op, data, **kwargs): - return data[data.map(lambda v: bool(len(v)), na_action="ignore")].explode() - - -@execute_node.register(ops.ArrayFlatten, pd.Series) -def execute_array_flatten(op, data, **kwargs): - return data.map( - lambda v: list(itertools.chain.from_iterable(v)), na_action="ignore" - ) diff --git a/ibis/backends/pandas/execution/constants.py b/ibis/backends/pandas/execution/constants.py deleted file mode 100644 index 0e543561a869e..0000000000000 --- a/ibis/backends/pandas/execution/constants.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Constants for the pandas backend.""" - -from __future__ import annotations - -import operator - -import numpy as np -import pandas as pd - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util - -JOIN_TYPES = { - ops.LeftJoin: "left", - ops.RightJoin: "right", - ops.InnerJoin: "inner", - ops.OuterJoin: "outer", -} - - -LEFT_JOIN_SUFFIX = f"_ibis_left_{ibis.util.guid()}" -RIGHT_JOIN_SUFFIX = f"_ibis_right_{ibis.util.guid()}" -JOIN_SUFFIXES = LEFT_JOIN_SUFFIX, RIGHT_JOIN_SUFFIX -ALTERNATE_SUFFIXES = { - LEFT_JOIN_SUFFIX: RIGHT_JOIN_SUFFIX, - RIGHT_JOIN_SUFFIX: LEFT_JOIN_SUFFIX, -} - - -IBIS_TYPE_TO_PANDAS_TYPE: dict[dt.DataType, type | str] = { - dt.float16: np.float16, - dt.float32: np.float32, - dt.float64: np.float64, - dt.float32: np.float32, - dt.float64: np.float64, - dt.int8: np.int8, - dt.int16: np.int16, - dt.int32: np.int32, - dt.int64: np.int64, - dt.string: str, - dt.timestamp: "datetime64[ns]", - dt.boolean: np.bool_, - dt.json: str, - dt.float16.copy(nullable=False): np.float16, - dt.float32.copy(nullable=False): np.float32, - dt.float64.copy(nullable=False): np.float64, - dt.float32.copy(nullable=False): np.float32, - dt.float64.copy(nullable=False): np.float64, - 
dt.int8.copy(nullable=False): np.int8, - dt.int16.copy(nullable=False): np.int16, - dt.int32.copy(nullable=False): np.int32, - dt.int64.copy(nullable=False): np.int64, - dt.string.copy(nullable=False): str, - dt.timestamp.copy(nullable=False): "datetime64[ns]", - dt.boolean.copy(nullable=False): np.bool_, - dt.json.copy(nullable=False): str, -} - - -IBIS_TO_PYTHON_LITERAL_TYPES = { - dt.boolean: bool, - dt.float64: float, - dt.float32: float, - dt.int64: int, - dt.int32: int, - dt.int16: int, - dt.int8: int, - dt.string: str, - dt.date: lambda x: pd.Timestamp(x).to_pydatetime().date(), - dt.boolean.copy(nullable=False): bool, - dt.float64.copy(nullable=False): float, - dt.float32.copy(nullable=False): float, - dt.int64.copy(nullable=False): int, - dt.int32.copy(nullable=False): int, - dt.int16.copy(nullable=False): int, - dt.int8.copy(nullable=False): int, - dt.string.copy(nullable=False): str, - dt.date.copy(nullable=False): lambda x: pd.Timestamp(x).to_pydatetime().date(), -} - - -BINARY_OPERATIONS = { - ops.Greater: operator.gt, - ops.Less: operator.lt, - ops.LessEqual: operator.le, - ops.GreaterEqual: operator.ge, - ops.Equals: operator.eq, - ops.NotEquals: operator.ne, - ops.And: operator.and_, - ops.Or: operator.or_, - ops.Xor: operator.xor, - ops.Add: operator.add, - ops.Subtract: operator.sub, - ops.Multiply: operator.mul, - ops.Divide: operator.truediv, - ops.FloorDivide: operator.floordiv, - ops.Modulus: operator.mod, - ops.Power: operator.pow, - ops.IdenticalTo: lambda x, y: (x == y) | (pd.isnull(x) & pd.isnull(y)), - ops.BitwiseXor: lambda x, y: np.bitwise_xor(x, y), - ops.BitwiseOr: lambda x, y: np.bitwise_or(x, y), - ops.BitwiseAnd: lambda x, y: np.bitwise_and(x, y), - ops.BitwiseLeftShift: lambda x, y: np.left_shift(x, y), - ops.BitwiseRightShift: lambda x, y: np.right_shift(x, y), -} diff --git a/ibis/backends/pandas/execution/decimal.py b/ibis/backends/pandas/execution/decimal.py deleted file mode 100644 index ac34bea4e8a21..0000000000000 --- a/ibis/backends/pandas/execution/decimal.py +++ /dev/null @@ -1,135 +0,0 @@ -from __future__ import annotations - -import decimal -import math -import numbers - -import numpy as np -import pandas as pd - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node -from ibis.common.exceptions import OperationNotDefinedError - - -@execute_node.register(ops.Ln, decimal.Decimal) -def execute_decimal_natural_log(op, data, **kwargs): - try: - return data.ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Log, decimal.Decimal, decimal.Decimal) -def execute_decimal_log_with_decimal_base(op, data, base, **kwargs): - try: - return data.ln() / base.ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Log, decimal.Decimal, type(None)) -def execute_decimal_log_with_no_base(op, data, _, **kwargs): - return execute_decimal_natural_log(op, data, **kwargs) - - -@execute_node.register(ops.Log, decimal.Decimal, numbers.Real) -def execute_decimal_log_with_real_base(op, data, base, **kwargs): - return execute_node(op, data, decimal.Decimal(base), **kwargs) - - -@execute_node.register(ops.Log, decimal.Decimal, np.integer) -def execute_decimal_log_with_np_integer_base(op, data, base, **kwargs): - return execute_node(op, data, int(base), **kwargs) - - -@execute_node.register(ops.Log2, decimal.Decimal) -def execute_decimal_log2(op, data, **kwargs): - try: - return data.ln() / 
decimal.Decimal(2).ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -# While ops.Negate is a subclass of ops.Unary, multipledispatch will be -# faster if we provide types that can potentially match the types of inputs -# exactly -@execute_node.register((ops.Unary, ops.Negate), decimal.Decimal) -def execute_decimal_unary(op, data, **kwargs): - op_type = type(op) - operation_name = op_type.__name__.lower() - function = getattr( - decimal.Decimal, - operation_name, - None, - ) - if function is None: - math_function = getattr(math, operation_name, None) - if math_function is None: - raise OperationNotDefinedError(f"{op_type.__name__} not supported") - function = lambda x: decimal.Decimal(math_function(x)) - try: - return function(data) - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Sign, decimal.Decimal) -def execute_decimal_sign(op, data, **kwargs): - return data if not data else decimal.Decimal(1).copy_sign(data) - - -@execute_node.register(ops.Abs, decimal.Decimal) -def execute_decimal_abs(op, data, **kwargs): - return abs(data) - - -@execute_node.register(ops.Round, decimal.Decimal, (np.integer, int)) -def execute_round_decimal(op, data, places, **kwargs): - # If we only allowed Python 3, we wouldn't have to implement any of this; - # we could just call round(data, places) :( - tuple_value = data.as_tuple() - precision = len(tuple_value.digits) - integer_part_length = precision + min(tuple_value.exponent, 0) - - if places < 0: - decimal_format_string = "0.{}E+{:d}".format( - "0" * (integer_part_length - 1 + places), - max(integer_part_length + places, abs(places)), - ) - else: - decimal_format_string = "{}.{}".format("0" * integer_part_length, "0" * places) - - places = decimal.Decimal(decimal_format_string) - return data.quantize(places) - - -@execute_node.register(ops.Round, decimal.Decimal, type(None)) -def execute_round_decimal_no_places(op, data, _, **kwargs): - return np.int64(round(data)) - - -@execute_node.register(ops.Cast, pd.Series, dt.Decimal) -def execute_cast_series_to_decimal(op, data, type, **kwargs): - precision = type.precision - scale = type.scale - context = decimal.Context(prec=precision) - places = context.create_decimal( - "{}.{}".format("0" * (precision - scale), "0" * scale) - ) - return data.apply( - lambda x, context=context, places=places: ( - context.create_decimal(x).quantize(places) - ) - ) - - -@execute_node.register(ops.E) -def execute_e(op, **kwargs): - return np.e - - -@execute_node.register(ops.Pi) -def execute_pi(op, **kwargs): - return np.pi diff --git a/ibis/backends/pandas/execution/generic.py b/ibis/backends/pandas/execution/generic.py deleted file mode 100644 index 7c8b53cc2f790..0000000000000 --- a/ibis/backends/pandas/execution/generic.py +++ /dev/null @@ -1,1479 +0,0 @@ -"""Execution rules for generic ibis operations.""" - -from __future__ import annotations - -import collections -import contextlib -import datetime -import decimal -import functools -import math -import numbers -import operator -from collections.abc import Mapping, Sized - -import numpy as np -import pandas as pd -import pytz -import toolz -from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy - -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.expr.types as ir -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import TimeContext, get_time_col -from ibis.backends.pandas import Backend as 
PandasBackend -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.core import ( - boolean_types, - date_types, - execute, - fixed_width_types, - floating_types, - integer_types, - numeric_types, - scalar_types, - simple_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_literal, execute_node -from ibis.backends.pandas.execution import constants -from ibis.backends.pandas.execution.util import coerce_to_output, get_grouping - - -# By default return the literal value -@execute_literal.register(ops.Literal, object, dt.DataType) -def execute_node_literal_value_datatype(op, value, datatype, **kwargs): - return value - - -# Because True and 1 hash to the same value, if we have True or False in scope -# keys while executing anything that should evaluate to 1 or 0 evaluates to -# True or False respectively. This is a hack to work around that by casting the -# bool to an integer. -@execute_literal.register(ops.Literal, object, dt.Integer) -def execute_node_literal_any_integer_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return int(value) - - -@execute_literal.register(ops.Literal, object, dt.Boolean) -def execute_node_literal_any_boolean_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return bool(value) - - -@execute_literal.register(ops.Literal, object, dt.Floating) -def execute_node_literal_any_floating_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return float(value) - - -@execute_literal.register(ops.Literal, object, dt.Array) -def execute_node_literal_any_array_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return np.array(value) - - -@execute_literal.register(ops.Literal, dt.DataType) -def execute_node_literal_datatype(op, datatype, **kwargs): - return op.value - - -@execute_literal.register( - ops.Literal, (*timedelta_types, str, *integer_types, type(None)), dt.Interval -) -def execute_interval_literal(op, value, dtype, **kwargs): - if value is None: - return pd.NaT - return pd.Timedelta(value, dtype.unit.short) - - -@execute_node.register(ops.Limit, pd.DataFrame, integer_types, integer_types) -def execute_limit_frame(op, data, nrows: int, offset: int, **kwargs): - return data.iloc[offset : offset + nrows] - - -@execute_node.register(ops.Limit, pd.DataFrame, type(None), integer_types) -def execute_limit_frame_no_limit(op, data, nrows: None, offset: int, **kwargs): - return data.iloc[offset:] - - -@execute_node.register(ops.Cast, SeriesGroupBy, dt.DataType) -def execute_cast_series_group_by(op, data, type, **kwargs): - result = execute_cast_series_generic(op, data.obj, type, **kwargs) - return result.groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Cast, pd.Series, dt.DataType) -def execute_cast_series_generic(op, data, type, **kwargs): - out = data.astype(constants.IBIS_TYPE_TO_PANDAS_TYPE[type]) - if type.is_integer(): - if op.arg.dtype.is_timestamp(): - return out.floordiv(int(1e9)) - elif op.arg.dtype.is_date(): - return out.floordiv(int(24 * 60 * 60 * 1e9)) - return out - - -@execute_node.register(ops.Cast, pd.Series, dt.Array) -def execute_cast_series_array(op, data, type, **kwargs): - value_type = type.value_type - numpy_type = constants.IBIS_TYPE_TO_PANDAS_TYPE.get(value_type, None) - if numpy_type is None: - raise ValueError( - "Array value type must be a primitive type " - "(e.g., number, string, or timestamp)" - ) - - def 
cast_to_array(array, numpy_type=numpy_type): - elems = [ - el if el is None else np.array(el, dtype=numpy_type).item() for el in array - ] - try: - return np.array(elems, dtype=numpy_type) - except TypeError: - return np.array(elems) - - return data.map(cast_to_array) - - -@execute_node.register(ops.Cast, list, dt.Array) -def execute_cast_list_array(op, data, type, **kwargs): - value_type = type.value_type - numpy_type = constants.IBIS_TYPE_TO_PANDAS_TYPE.get(value_type, None) - if numpy_type is None: - raise ValueError( - "Array value type must be a primitive type " - "(e.g., number, string, or timestamp)" - ) - - def cast_to_array(array, numpy_type=numpy_type): - elems = [ - el if el is None else np.array(el, dtype=numpy_type).item() for el in array - ] - try: - return np.array(elems, dtype=numpy_type) - except TypeError: - return np.array(elems) - - return cast_to_array(data) - - -@execute_node.register(ops.Cast, pd.Series, dt.Timestamp) -def execute_cast_series_timestamp(op, data, type, **kwargs): - arg = op.arg - from_type = arg.dtype - - if from_type.equals(type): # noop cast - return data - - tz = type.timezone - - if from_type.is_timestamp(): - from_tz = from_type.timezone - if tz is None and from_tz is None: - return data - elif tz is None or from_tz is None: - return data.dt.tz_localize(tz) - elif tz is not None and from_tz is not None: - return data.dt.tz_convert(tz) - elif from_type.is_date(): - return data if tz is None else data.dt.tz_localize(tz) - - if from_type.is_string() or from_type.is_integer(): - if from_type.is_integer(): - timestamps = pd.to_datetime(data.values, unit="s") - else: - timestamps = pd.to_datetime(data.values) - if getattr(timestamps.dtype, "tz", None) is not None: - method_name = "tz_convert" - else: - method_name = "tz_localize" - method = getattr(timestamps, method_name) - timestamps = method(tz) - return pd.Series(timestamps, index=data.index, name=data.name) - - raise TypeError(f"Don't know how to cast {from_type} to {type}") - - -def _normalize(values, original_index, name, timezone=None): - index = pd.DatetimeIndex(values, tz=timezone) - return pd.Series(index.normalize(), index=original_index, name=name) - - -@execute_node.register(ops.Cast, pd.Series, dt.Date) -def execute_cast_series_date(op, data, type, **kwargs): - arg = op.args[0] - from_type = arg.dtype - - if from_type.equals(type): - return data - - if from_type.is_timestamp(): - return _normalize( - data.values, data.index, data.name, timezone=from_type.timezone - ) - - if from_type.is_string(): - values = data.values - datetimes = pd.to_datetime(values) - with contextlib.suppress(TypeError): - datetimes = datetimes.tz_convert(None) - dates = _normalize(datetimes, data.index, data.name) - return pd.Series(dates, index=data.index, name=data.name) - - if from_type.is_integer(): - return pd.Series( - pd.to_datetime(data.values, unit="D").values, - index=data.index, - name=data.name, - ) - - raise TypeError(f"Don't know how to cast {from_type} to {type}") - - -@execute_node.register(ops.SortKey, pd.Series, bool) -def execute_sort_key_series(op, data, _, **kwargs): - return data - - -def call_numpy_ufunc(func, op, data, **kwargs): - if getattr(data, "dtype", None) == np.dtype(np.object_): - return data.apply(functools.partial(execute_node, op, **kwargs)) - if func is None: - raise com.OperationNotDefinedError(f"{type(op).__name__} not supported") - return func(data) - - -@execute_node.register(ops.Negate, fixed_width_types + timedelta_types) -def execute_obj_negate(op, data, **kwargs): - 
return -data - - -@execute_node.register(ops.Negate, pd.Series) -def execute_series_negate(op, data, **kwargs): - return call_numpy_ufunc(np.negative, op, data, **kwargs) - - -@execute_node.register(ops.Negate, SeriesGroupBy) -def execute_series_group_by_negate(op, data, **kwargs): - return execute_series_negate(op, data.obj, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.Unary, pd.Series) -def execute_series_unary_op(op, data, **kwargs): - op_type = type(op) - if op_type == ops.BitwiseNot: - function = np.bitwise_not - else: - function = getattr(np, op_type.__name__.lower()) - return call_numpy_ufunc(function, op, data, **kwargs) - - -@execute_node.register(ops.Acos, (pd.Series, *numeric_types)) -def execute_series_acos(_, data, **kwargs): - return np.arccos(data) - - -@execute_node.register(ops.Asin, (pd.Series, *numeric_types)) -def execute_series_asin(_, data, **kwargs): - return np.arcsin(data) - - -@execute_node.register(ops.Atan, (pd.Series, *numeric_types)) -def execute_series_atan(_, data, **kwargs): - return np.arctan(data) - - -@execute_node.register(ops.Cot, (pd.Series, *numeric_types)) -def execute_series_cot(_, data, **kwargs): - return 1.0 / np.tan(data) - - -@execute_node.register( - ops.Atan2, (pd.Series, *numeric_types), (pd.Series, *numeric_types) -) -def execute_series_atan2(_, y, x, **kwargs): - return np.arctan2(y, x) - - -@execute_node.register((ops.Cos, ops.Sin, ops.Tan), (pd.Series, *numeric_types)) -def execute_series_trig(op, data, **kwargs): - function = getattr(np, type(op).__name__.lower()) - return call_numpy_ufunc(function, op, data, **kwargs) - - -@execute_node.register(ops.Radians, (pd.Series, *numeric_types)) -def execute_series_radians(_, data, **kwargs): - return np.radians(data) - - -@execute_node.register(ops.Degrees, (pd.Series, *numeric_types)) -def execute_series_degrees(_, data, **kwargs): - return np.degrees(data) - - -@execute_node.register((ops.Ceil, ops.Floor), pd.Series) -def execute_series_ceil(op, data, **kwargs): - return_type = np.object_ if data.dtype == np.object_ else np.int64 - func = getattr(np, type(op).__name__.lower()) - return call_numpy_ufunc(func, op, data, **kwargs).astype(return_type) - - -@execute_node.register(ops.BitwiseNot, integer_types) -def execute_int_bitwise_not(op, data, **kwargs): - return np.invert(data) - - -def vectorize_object(op, arg, *args, **kwargs): - func = np.vectorize(functools.partial(execute_node, op, **kwargs)) - return pd.Series(func(arg, *args), index=arg.index, name=arg.name) - - -@execute_node.register( - ops.Log, pd.Series, (pd.Series, numbers.Real, decimal.Decimal, type(None)) -) -def execute_series_log_with_base(op, data, base, **kwargs): - if data.dtype == np.dtype(np.object_): - return vectorize_object(op, data, base, **kwargs) - - if base is None: - return np.log(data) - return np.log(data) / np.log(base) - - -@execute_node.register(ops.Ln, pd.Series) -def execute_series_natural_log(op, data, **kwargs): - if data.dtype == np.dtype(np.object_): - return data.apply(functools.partial(execute_node, op, **kwargs)) - return np.log(data) - - -@execute_node.register( - ops.Clip, - pd.Series, - (pd.Series, type(None)) + numeric_types, - (pd.Series, type(None)) + numeric_types, -) -def execute_series_clip(op, data, lower, upper, **kwargs): - return data.clip(lower=lower, upper=upper) - - -@execute_node.register( - ops.Quantile, - pd.Series, - (np.ndarray, *numeric_types), - (pd.Series, type(None)), -) -def execute_series_quantile(op, 
data, quantile, mask, aggcontext=None, **_): - return aggcontext.agg( - data if mask is None else data.loc[mask], - "quantile", - q=quantile, - ) - - -@execute_node.register(ops.Quantile, pd.Series, (np.ndarray, *numeric_types)) -def execute_series_quantile_default(op, data, quantile, aggcontext=None, **_): - return aggcontext.agg(data, "quantile", q=quantile) - - -@execute_node.register( - ops.Quantile, - SeriesGroupBy, - (np.ndarray, *numeric_types), - (SeriesGroupBy, type(None)), -) -def execute_series_group_by_quantile(op, data, quantile, mask, aggcontext=None, **_): - return aggcontext.agg( - data, - ( - "quantile" - if mask is None - else functools.partial(_filtered_reduction, mask.obj, pd.Series.quantile) - ), - q=quantile, - ) - - -@execute_node.register( - ops.MultiQuantile, - pd.Series, - (np.ndarray, *numeric_types), - (pd.Series, type(None)), -) -def execute_series_quantile_multi(op, data, quantile, mask, aggcontext=None, **_): - return np.array( - aggcontext.agg(data if mask is None else data.loc[mask], "quantile", q=quantile) - ) - - -@execute_node.register( - ops.MultiQuantile, - SeriesGroupBy, - np.ndarray, - (SeriesGroupBy, type(None)), -) -def execute_series_quantile_multi_groupby( - op, data, quantile, mask, aggcontext=None, **kwargs -): - def q(x, quantile): - result = x.quantile(quantile).tolist() - return [result for _ in range(len(x))] - - return aggcontext.agg( - data, - q if mask is None else functools.partial(_filtered_reduction, mask.obj, q), - quantile, - ) - - -@execute_node.register(ops.MultiQuantile, SeriesGroupBy, np.ndarray) -def execute_series_quantile_multi_groupby_default( - op, data, quantile, aggcontext=None, **_ -): - def q(x, quantile): - result = x.quantile(quantile).tolist() - return [result for _ in range(len(x))] - - return aggcontext.agg(data, q, quantile) - - -@execute_node.register(ops.Cast, type(None), dt.DataType) -def execute_cast_null_to_anything(op, data, type, **kwargs): - return None - - -@execute_node.register(ops.Cast, datetime.datetime, dt.String) -def execute_cast_datetime_or_timestamp_to_string(op, data, type, **kwargs): - """Cast timestamps to strings.""" - return str(data) - - -@execute_node.register(ops.Cast, datetime.datetime, dt.Int64) -def execute_cast_timestamp_to_integer(op, data, type, **kwargs): - """Cast timestamps to integers.""" - t = pd.Timestamp(data) - return pd.NA if pd.isna(t) else int(t.timestamp()) - - -@execute_node.register(ops.Cast, (np.bool_, bool), dt.Timestamp) -def execute_cast_bool_to_timestamp(op, data, type, **kwargs): - raise TypeError( - "Casting boolean values to timestamps does not make sense. If you " - "really want to cast boolean values to timestamps please cast to " - "int64 first then to timestamp: " - "value.cast('int64').cast('timestamp')" - ) - - -@execute_node.register(ops.Cast, (np.bool_, bool), dt.Interval) -def execute_cast_bool_to_interval(op, data, type, **kwargs): - raise TypeError( - "Casting boolean values to intervals does not make sense. 
If you " - "really want to cast boolean values to intervals please cast to " - "int64 first then to interval: " - "value.cast('int64').cast(ibis.expr.datatypes.Interval(...))" - ) - - -@execute_node.register(ops.Cast, integer_types, dt.Timestamp) -def execute_cast_integer_to_timestamp(op, data, type, **kwargs): - """Cast integer to timestamp.""" - return pd.Timestamp(data, unit="s", tz=type.timezone) - - -@execute_node.register(ops.Cast, str, dt.Timestamp) -def execute_cast_string_to_timestamp(op, data, type, **kwargs): - """Cast string to timestamp.""" - return pd.Timestamp(data, tz=type.timezone) - - -@execute_node.register(ops.Cast, datetime.datetime, dt.Timestamp) -def execute_cast_timestamp_to_timestamp(op, data, type, **kwargs): - """Cast timestamps to other timestamps including timezone if necessary.""" - input_timezone = data.tzinfo - target_timezone = type.timezone - - if input_timezone == target_timezone: - return data - - if input_timezone is None or target_timezone is None: - return data.astimezone( - tz=None if target_timezone is None else pytz.timezone(target_timezone) - ) - - return data.astimezone(tz=pytz.timezone(target_timezone)) - - -@execute_node.register(ops.Cast, fixed_width_types + (str,), dt.DataType) -def execute_cast_string_literal(op, data, type, **kwargs): - try: - cast_function = constants.IBIS_TO_PYTHON_LITERAL_TYPES[type] - except KeyError: - raise TypeError(f"Don't know how to cast {data!r} to type {type}") - else: - return cast_function(data) - - -@execute_node.register(ops.Cast, Mapping, dt.DataType) -def execute_cast_mapping_literal(op, data, type, **kwargs): - data = ( - (ops.Literal(k, type.key_type), ops.Literal(v, type.value_type)) - for k, v in data.items() - ) - return {execute(k, **kwargs): execute(v, **kwargs) for k, v in data} - - -@execute_node.register(ops.Round, scalar_types, (int, type(None))) -def execute_round_scalars(op, data, places, **kwargs): - return round(data, places) if places else round(data) - - -@execute_node.register(ops.Round, pd.Series, (pd.Series, np.integer, type(None), int)) -def execute_round_series(op, data, places, **kwargs): - if data.dtype == np.dtype(np.object_): - return vectorize_object(op, data, places, **kwargs) - result = data.round(places or 0) - return result if places else result.astype("int64") - - -@execute_node.register(ops.TableColumn, (pd.DataFrame, DataFrameGroupBy)) -def execute_table_column_df_or_df_groupby(op, data, **kwargs): - return data[op.name] - - -@execute_node.register(ops.Aggregation, pd.DataFrame) -def execute_aggregation_dataframe( - op, - data, - scope=None, - timecontext: TimeContext | None = None, - **kwargs, -): - assert op.metrics, "no metrics found during aggregation execution" - - if op.sort_keys: - raise NotImplementedError("sorting on aggregations not yet implemented") - - if op.predicates: - predicate = functools.reduce( - operator.and_, - ( - execute(p, scope=scope, timecontext=timecontext, **kwargs) - for p in op.predicates - ), - ) - data = data.loc[predicate] - - columns: dict[str, str] = {} - - if op.by: - grouping_keys = [ - key.name - if isinstance(key, ops.TableColumn) - else execute(key, scope=scope, timecontext=timecontext, **kwargs).rename( - key.name - ) - for key in op.by - ] - source = data.groupby( - grouping_keys[0] if len(grouping_keys) == 1 else grouping_keys, - group_keys=False, - ) - else: - source = data - - scope = scope.merge_scope(Scope({op.table: source}, timecontext)) - - pieces = [ - coerce_to_output( - execute(metric, scope=scope, 
timecontext=timecontext, **kwargs), - metric, - ) - for metric in op.metrics - ] - - result = pd.concat(pieces, axis=1) - - # If grouping, need a reset to get the grouping key back as a column - if op.by: - result = result.reset_index() - - result.columns = [columns.get(c, c) for c in result.columns] - - if op.having: - # .having(...) is only accessible on groupby, so this should never - # raise - if not op.by: - raise ValueError( - "Filtering out aggregation values is not allowed without at " - "least one grouping key" - ) - - # TODO(phillipc): Don't recompute identical subexpressions - predicate = functools.reduce( - operator.and_, - ( - execute(h, scope=scope, timecontext=timecontext, **kwargs) - for h in op.having - ), - ) - assert len(predicate) == len( - result - ), "length of predicate does not match length of DataFrame" - result = result.loc[predicate.values] - return result - - -@execute_node.register(ops.Reduction, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, type(op).__name__.lower()) - - -@execute_node.register(ops.First, SeriesGroupBy, type(None)) -def execute_first_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: getattr(x, "iat", x)[0]) - - -@execute_node.register(ops.Last, SeriesGroupBy, type(None)) -def execute_last_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: getattr(x, "iat", x)[-1]) - - -variance_ddof = {"pop": 0, "sample": 1} - - -@execute_node.register(ops.Variance, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby_var(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "var", ddof=variance_ddof[op.how]) - - -@execute_node.register(ops.StandardDev, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby_std(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "std", ddof=variance_ddof[op.how]) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - SeriesGroupBy, - type(None), -) -def execute_count_distinct_series_groupby(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "nunique") - - -@execute_node.register(ops.Arbitrary, SeriesGroupBy, type(None)) -def execute_arbitrary_series_groupby(op, data, _, aggcontext=None, **kwargs): - how = op.how - if how is None: - how = "first" - - if how not in {"first", "last"}: - raise com.OperationNotDefinedError(f"Arbitrary {how!r} is not supported") - return aggcontext.agg(data, how) - - -@execute_node.register( - (ops.ArgMin, ops.ArgMax), - SeriesGroupBy, - SeriesGroupBy, - type(None), -) -def execute_reduction_series_groupby_argidx( - op, data, key, _, aggcontext=None, **kwargs -): - method = operator.methodcaller(op.__class__.__name__.lower()) - - def reduce(data, key=key.obj, method=method): - return data.iloc[method(key.loc[data.index])] - - return aggcontext.agg(data, reduce) - - -def _filtered_reduction(mask, method, data): - return method(data[mask[data.index]]) - - -@execute_node.register(ops.Reduction, SeriesGroupBy, SeriesGroupBy) -def execute_reduction_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - method = operator.methodcaller(type(op).__name__.lower()) - return aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, method) - ) - - -@execute_node.register(ops.First, SeriesGroupBy, SeriesGroupBy) -def execute_first_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - return 
aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, lambda x: x.iloc[0]) - ) - - -@execute_node.register(ops.Last, SeriesGroupBy, SeriesGroupBy) -def execute_last_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, lambda x: x.iloc[-1]) - ) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - SeriesGroupBy, - SeriesGroupBy, -) -def execute_count_distinct_series_groupby_mask( - op, data, mask, aggcontext=None, **kwargs -): - return aggcontext.agg( - data, - functools.partial(_filtered_reduction, mask.obj, pd.Series.nunique), - ) - - -@execute_node.register(ops.Variance, SeriesGroupBy, SeriesGroupBy) -def execute_var_series_groupby_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, - lambda x, mask=mask.obj, ddof=variance_ddof[op.how]: ( - x[mask[x.index]].var(ddof=ddof) - ), - ) - - -@execute_node.register(ops.StandardDev, SeriesGroupBy, SeriesGroupBy) -def execute_std_series_groupby_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, - lambda x, mask=mask.obj, ddof=variance_ddof[op.how]: ( - x[mask[x.index]].std(ddof=ddof) - ), - ) - - -@execute_node.register(ops.CountStar, DataFrameGroupBy, type(None)) -def execute_count_star_frame_groupby(op, data, _, **kwargs): - return data.size() - - -@execute_node.register(ops.CountDistinctStar, DataFrameGroupBy, type(None)) -def execute_count_distinct_star_frame_groupby(op, data, _, **kwargs): - return data.nunique() - - -@execute_node.register(ops.Reduction, pd.Series, (pd.Series, type(None))) -def execute_reduction_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - return aggcontext.agg(operand, type(op).__name__.lower()) - - -@execute_node.register(ops.First, pd.Series, (pd.Series, type(None))) -def execute_first_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - - def _first(x): - return getattr(x, "iloc", x)[0] - - return aggcontext.agg(operand, _first) - - -@execute_node.register(ops.Last, pd.Series, (pd.Series, type(None))) -def execute_last_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - - def _last(x): - return getattr(x, "iloc", x)[-1] - - return aggcontext.agg(operand, _last) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - pd.Series, - (pd.Series, type(None)), -) -def execute_count_distinct_series_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data[mask] if mask is not None else data, "nunique") - - -@execute_node.register(ops.Arbitrary, pd.Series, (pd.Series, type(None))) -def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs): - if op.how == "first": - index = 0 - elif op.how == "last": - index = -1 - else: - raise com.OperationNotDefinedError(f"Arbitrary {op.how!r} is not supported") - - data = data[mask] if mask is not None else data - return data.iloc[index] - - -@execute_node.register(ops.StandardDev, pd.Series, (pd.Series, type(None))) -def execute_standard_dev_series(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - "std", - ddof=variance_ddof[op.how], - ) - - -@execute_node.register(ops.Variance, pd.Series, (pd.Series, type(None))) -def execute_variance_series(op, data, mask, aggcontext=None, **kwargs): - return 
aggcontext.agg( - data[mask] if mask is not None else data, - "var", - ddof=variance_ddof[op.how], - ) - - -@execute_node.register((ops.Any, ops.All), pd.Series, (pd.Series, type(None))) -def execute_any_all_series(op, data, mask, aggcontext=None, **kwargs): - if mask is not None: - data = data.loc[mask] - if isinstance(aggcontext, (agg_ctx.Summarize, agg_ctx.Transform)): - result = aggcontext.agg(data, type(op).__name__.lower()) - else: - result = aggcontext.agg( - data, lambda data: getattr(data, type(op).__name__.lower())() - ) - try: - return result.astype(bool) - except TypeError: - return result - - -@execute_node.register((ops.Any, ops.All), SeriesGroupBy, type(None)) -def execute_any_all_series_group_by(op, data, mask, aggcontext=None, **kwargs): - if mask is not None: - data = data.obj.loc[mask].groupby(get_grouping(data.grouper.groupings)) - if isinstance(aggcontext, (agg_ctx.Summarize, agg_ctx.Transform)): - result = aggcontext.agg(data, type(op).__name__.lower()) - else: - result = aggcontext.agg( - data, lambda data: getattr(data, type(op).__name__.lower())() - ) - try: - return result.astype(bool) - except TypeError: - return result - - -@execute_node.register(ops.CountStar, pd.DataFrame, type(None)) -def execute_count_star_frame(op, data, _, **kwargs): - return len(data) - - -@execute_node.register(ops.CountStar, pd.DataFrame, pd.Series) -def execute_count_star_frame_filter(op, data, where, **kwargs): - return len(data) - len(where) + where.sum() - - -@execute_node.register(ops.CountDistinctStar, pd.DataFrame, type(None)) -def execute_count_distinct_star_frame(op, data, _, **kwargs): - return len(data.drop_duplicates()) - - -@execute_node.register(ops.CountDistinctStar, pd.DataFrame, pd.Series) -def execute_count_distinct_star_frame_filter(op, data, filt, **kwargs): - return len(data.loc[filt].drop_duplicates()) - - -@execute_node.register(ops.BitAnd, pd.Series, (pd.Series, type(None))) -def execute_bit_and_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_and.reduce, - ) - - -@execute_node.register(ops.BitOr, pd.Series, (pd.Series, type(None))) -def execute_bit_or_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_or.reduce, - ) - - -@execute_node.register(ops.BitXor, pd.Series, (pd.Series, type(None))) -def execute_bit_xor_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_xor.reduce, - ) - - -@execute_node.register( - (ops.ArgMin, ops.ArgMax), - pd.Series, - pd.Series, - (pd.Series, type(None)), -) -def execute_argmin_series_mask(op, data, key, mask, aggcontext=None, **kwargs): - method_name = op.__class__.__name__.lower() - masked_key = key[mask] if mask is not None else key - idx = aggcontext.agg(masked_key, method_name) - masked = data[mask] if mask is not None else data - return masked.iloc[idx] - - -@execute_node.register(ops.Mode, pd.Series, (pd.Series, type(None))) -def execute_mode_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, lambda x: x.mode().iloc[0] - ) - - -@execute_node.register(ops.Mode, SeriesGroupBy, (SeriesGroupBy, type(None))) -def execute_mode_series_groupby(_, data, mask, aggcontext=None, **kwargs): - def mode(x): - return x.mode().iloc[0] - - if mask is not None: - mode = functools.partial(_filtered_reduction, mask.obj, mode) - - return 
aggcontext.agg(data, mode) - - -@execute_node.register(ops.ApproxMedian, pd.Series, (pd.Series, type(None))) -def execute_approx_median_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, lambda x: x.median() - ) - - -@execute_node.register(ops.ApproxMedian, SeriesGroupBy, (SeriesGroupBy, type(None))) -def execute_approx_median_series_groupby(_, data, mask, aggcontext=None, **kwargs): - median = pd.Series.median - - if mask is not None: - median = functools.partial(_filtered_reduction, mask.obj, median) - - return aggcontext.agg(data, median) - - -@execute_node.register((ops.Not, ops.Negate), (bool, np.bool_)) -def execute_not_bool(_, data, **kwargs): - return not data - - -def _execute_binary_op_impl(op, left, right, **_): - op_type = type(op) - try: - operation = constants.BINARY_OPERATIONS[op_type] - except KeyError: - raise com.OperationNotDefinedError( - f"Binary operation {op_type.__name__} not implemented" - ) - else: - return operation(left, right) - - -@execute_node.register(ops.Binary, pd.Series, pd.Series) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - numeric_types, - pd.Series, -) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - pd.Series, - numeric_types, -) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - numeric_types, - numeric_types, -) -@execute_node.register((ops.Comparison, ops.Add, ops.Multiply), pd.Series, str) -@execute_node.register((ops.Comparison, ops.Add, ops.Multiply), str, pd.Series) -@execute_node.register((ops.Comparison, ops.Add), str, str) -@execute_node.register(ops.Multiply, integer_types, str) -@execute_node.register(ops.Multiply, str, integer_types) -@execute_node.register(ops.Comparison, pd.Series, timestamp_types) -@execute_node.register(ops.Comparison, timedelta_types, pd.Series) -@execute_node.register(ops.BitwiseBinary, integer_types, integer_types) -@execute_node.register(ops.BitwiseBinary, pd.Series, integer_types) -@execute_node.register(ops.BitwiseBinary, integer_types, pd.Series) -def execute_binary_op(op, left, right, **kwargs): - return _execute_binary_op_impl(op, left, right, **kwargs) - - -@execute_node.register(ops.Comparison, pd.Series, date_types) -def execute_binary_op_date(op, left, right, **kwargs): - return _execute_binary_op_impl( - op, pd.to_datetime(left), pd.to_datetime(right), **kwargs - ) - - -@execute_node.register(ops.Binary, SeriesGroupBy, SeriesGroupBy) -def execute_binary_op_series_group_by(op, left, right, **kwargs): - left_groupings = get_grouping(left.grouper.groupings) - right_groupings = get_grouping(right.grouper.groupings) - if left_groupings != right_groupings: - raise ValueError( - f"Cannot perform {type(op).__name__} operation on two series with " - "different groupings" - ) - result = execute_binary_op(op, left.obj, right.obj, **kwargs) - return result.groupby(left_groupings, group_keys=False) - - -@execute_node.register(ops.Binary, SeriesGroupBy, simple_types) -def execute_binary_op_series_gb_simple(op, left, right, **kwargs): - result = execute_binary_op(op, left.obj, right, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Binary, simple_types, SeriesGroupBy) -def execute_binary_op_simple_series_gb(op, left, right, **kwargs): - result = execute_binary_op(op, left, right.obj, **kwargs) - return result.groupby(get_grouping(right.grouper.groupings), 
group_keys=False) - - -@execute_node.register(ops.Unary, SeriesGroupBy) -def execute_unary_op_series_gb(op, operand, **kwargs): - result = execute_node(op, operand.obj, **kwargs) - return result.groupby(get_grouping(operand.grouper.groupings), group_keys=False) - - -@execute_node.register( - (ops.Log, ops.Round), - SeriesGroupBy, - (numbers.Real, decimal.Decimal, type(None)), -) -def execute_log_series_gb_others(op, left, right, **kwargs): - result = execute_node(op, left.obj, right, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register((ops.Log, ops.Round), SeriesGroupBy, SeriesGroupBy) -def execute_log_series_gb_series_gb(op, left, right, **kwargs): - result = execute_node(op, left.obj, right.obj, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Not, pd.Series) -def execute_not_series(op, data, **kwargs): - return ~data - - -@execute_node.register(ops.StringSplit, pd.Series, (pd.Series, str)) -def execute_string_split(op, data, delimiter, **kwargs): - # Doing the iteration using `map` is much faster than doing the iteration - # using `Series.apply` due to Pandas-related overhead. - return pd.Series(np.array(s.split(delimiter)) for s in data) - - -@execute_node.register( - ops.Between, - pd.Series, - (pd.Series, numbers.Real, str, datetime.datetime), - (pd.Series, numbers.Real, str, datetime.datetime), -) -def execute_between(op, data, lower, upper, **kwargs): - return data.between(lower, upper) - - -@execute_node.register(ops.Union, pd.DataFrame, pd.DataFrame, bool) -def execute_union_dataframe_dataframe( - op, left: pd.DataFrame, right: pd.DataFrame, distinct, **kwargs -): - result = pd.concat([left, right], axis=0) - return result.drop_duplicates() if distinct else result - - -@execute_node.register(ops.Intersection, pd.DataFrame, pd.DataFrame, bool) -def execute_intersection_dataframe_dataframe( - op, - left: pd.DataFrame, - right: pd.DataFrame, - distinct: bool, - **kwargs, -): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - result = left.merge(right, on=list(left.columns), how="inner") - return result - - -@execute_node.register(ops.Difference, pd.DataFrame, pd.DataFrame, bool) -def execute_difference_dataframe_dataframe( - op, - left: pd.DataFrame, - right: pd.DataFrame, - distinct: bool, - **kwargs, -): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - merged = left.merge(right, on=list(left.columns), how="outer", indicator=True) - result = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) - return result - - -@execute_node.register(ops.IsNull, pd.Series) -def execute_series_isnull(op, data, **kwargs): - return data.isnull() - - -@execute_node.register(ops.NotNull, pd.Series) -def execute_series_notnnull(op, data, **kwargs): - return data.notnull() - - -@execute_node.register(ops.IsNan, (pd.Series, floating_types)) -def execute_isnan(op, data, **kwargs): - try: - return np.isnan(data) - except (TypeError, ValueError): - # if `data` contains `None` np.isnan will complain - # so we take advantage of NaN not equaling itself - # to do the correct thing - return data != data - - -@execute_node.register(ops.IsInf, (pd.Series, floating_types)) -def execute_isinf(op, data, **kwargs): - return np.isinf(data) - - -@execute_node.register(ops.SelfReference, pd.DataFrame) -def execute_node_self_reference_dataframe(op, 
data, **kwargs): - return data - - -@execute_node.register(ops.Alias, object) -def execute_alias(op, data, **kwargs): - # just return the underlying argument because the naming is handled - # by the translator for the top level expression - return data - - -@execute_node.register(ops.StringConcat, tuple) -def execute_node_string_concat(op, values, **kwargs): - values = [execute(arg, **kwargs) for arg in values] - return functools.reduce(operator.add, values) - - -@execute_node.register(ops.StringJoin, collections.abc.Sequence) -def execute_node_string_join(op, args, **kwargs): - return op.sep.join(args) - - -@execute_node.register(ops.InValues, object, tuple) -def execute_node_scalar_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data in elements - - -@execute_node.register(ops.InColumn, object, np.ndarray) -def execute_node_scalar_in_column(op, data, elements, **kwargs): - return data in elements - - -@execute_node.register(ops.InValues, pd.Series, tuple) -def execute_node_column_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data.isin(elements) - - -@execute_node.register(ops.InColumn, pd.Series, pd.Series) -def execute_node_column_in_column(op, data, elements, **kwargs): - return data.isin(elements) - - -@execute_node.register(ops.InValues, SeriesGroupBy, tuple) -def execute_node_group_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data.obj.isin(elements).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.InColumn, SeriesGroupBy, pd.Series) -def execute_node_group_in_column(op, data, elements, **kwargs): - return data.obj.isin(elements).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -def pd_where(cond, true, false): - """Execute `where` following ibis's intended semantics.""" - if isinstance(cond, pd.Series): - if not isinstance(true, pd.Series): - true = pd.Series( - np.repeat(true, len(cond)), name=cond.name, index=cond.index - ) - return true.where(cond, other=false) - if cond: - if isinstance(false, pd.Series) and not isinstance(true, pd.Series): - return pd.Series(np.repeat(true, len(false))) - return true - else: - if isinstance(true, pd.Series) and not isinstance(false, pd.Series): - return pd.Series(np.repeat(false, len(true)), index=true.index) - return false - - -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), pd.Series, pd.Series) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), pd.Series, simple_types) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), simple_types, pd.Series) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), type(None), type(None)) -def execute_node_where(op, cond, true, false, **kwargs): - return pd_where(cond, true, false) - - -# For true/false as scalars, we only support identical type pairs + None to -# limit the size of the dispatch table and not have to worry about type -# promotion. 
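# Illustrative sketch (not part of the patched file): the pd_where helper
# above broadcasts a scalar "true" branch over the condition's index before
# delegating to Series.where; the registration loop below then covers the
# scalar branch types. Example values and the name where_sketch are made up.
import numpy as np
import pandas as pd


def where_sketch(cond, true, false):
    # Broadcast a scalar true-branch to the condition's length so that
    # Series.where can align both operands.
    if isinstance(cond, pd.Series):
        if not isinstance(true, pd.Series):
            true = pd.Series(np.repeat(true, len(cond)), index=cond.index)
        return true.where(cond, other=false)
    # Scalar condition: choose a branch directly.
    return true if cond else false


cond = pd.Series([True, False, True])
print(where_sketch(cond, 1, pd.Series([10, 20, 30])).tolist())  # [1, 20, 1]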
-for typ in (str, *scalar_types): - for cond_typ in (pd.Series, *boolean_types): - execute_node.register(ops.IfElse, cond_typ, typ, typ)(execute_node_where) - execute_node.register(ops.IfElse, cond_typ, type(None), typ)(execute_node_where) - execute_node.register(ops.IfElse, cond_typ, typ, type(None))(execute_node_where) - - -@execute_node.register(ops.DatabaseTable, PandasBackend) -def execute_database_table_client( - op, client, timecontext: TimeContext | None, **kwargs -): - df = client.dictionary[op.name] - if timecontext: - begin, end = timecontext - time_col = get_time_col() - if time_col not in df: - raise com.IbisError( - f"Table {op.name} must have a time column named {time_col}" - " to execute with time context." - ) - # filter with time context - mask = df[time_col].between(begin, end) - return df.loc[mask].reset_index(drop=True) - return df - - -MATH_FUNCTIONS = { - ops.Floor: math.floor, - ops.Ln: math.log, - ops.Log2: lambda x: math.log(x, 2), - ops.Log10: math.log10, - ops.Exp: math.exp, - ops.Sqrt: math.sqrt, - ops.Abs: abs, - ops.Ceil: math.ceil, - ops.Sign: lambda x: 0 if not x else -1 if x < 0 else 1, -} - -MATH_FUNCTION_TYPES = tuple(MATH_FUNCTIONS.keys()) - - -@execute_node.register(MATH_FUNCTION_TYPES, numeric_types) -def execute_node_math_function_number(op, value, **kwargs): - return MATH_FUNCTIONS[type(op)](value) - - -@execute_node.register(ops.Log, numeric_types, numeric_types) -def execute_node_log_number_number(op, value, base, **kwargs): - return math.log(value, base) - - -@execute_node.register(ops.DropNa, pd.DataFrame) -def execute_node_dropna_dataframe(op, df, **kwargs): - if op.subset is not None: - subset = [col.name for col in op.subset] - else: - subset = None - return df.dropna(how=op.how, subset=subset) - - -@execute_node.register(ops.FillNa, pd.DataFrame, simple_types) -def execute_node_fillna_dataframe_scalar(op, df, replacements, **kwargs): - return df.fillna(replacements) - - -@execute_node.register(ops.FillNa, pd.DataFrame) -def execute_node_fillna_dataframe_dict(op, df, **kwargs): - return df.fillna(dict(op.replacements)) - - -@execute_node.register(ops.NullIf, simple_types, simple_types) -def execute_node_nullif_scalars(op, value1, value2, **kwargs): - return np.nan if value1 == value2 else value1 - - -@execute_node.register(ops.NullIf, pd.Series, (pd.Series, *simple_types)) -def execute_node_nullif_series(op, left, right, **kwargs): - return left.where(left != right) - - -@execute_node.register(ops.NullIf, simple_types, pd.Series) -def execute_node_nullif_scalar_series(op, value, series, **kwargs): - return series.where(series != value) - - -def coalesce(values): - return functools.reduce( - lambda a1, a2: np.where(pd.isnull(a1), a2, a1), - values, - ) - - -@toolz.curry -def promote_to_sequence(length, obj): - try: - return obj.values - except AttributeError: - return np.repeat(obj, length) - - -def compute_row_reduction(func, values, **kwargs): - final_sizes = {len(x) for x in values if isinstance(x, Sized)} - if not final_sizes: - return func(values) - (final_size,) = final_sizes - raw = func(list(map(promote_to_sequence(final_size), values)), **kwargs) - return pd.Series(raw).squeeze() - - -@execute_node.register(ops.Greatest, tuple) -def execute_node_greatest_list(op, values, **kwargs): - values = [execute(arg, **kwargs) for arg in values] - return compute_row_reduction(np.maximum.reduce, values, axis=0) - - -@execute_node.register(ops.Least, tuple) -def execute_node_least_list(op, values, **kwargs): - values = [execute(arg, **kwargs) for 
arg in values] - return compute_row_reduction(np.minimum.reduce, values, axis=0) - - -@execute_node.register(ops.Coalesce, tuple) -def execute_node_coalesce(op, values, **kwargs): - # TODO: this is slow - values = [execute(arg, **kwargs) for arg in values] - return compute_row_reduction(coalesce, values) - - -def wrap_case_result(raw, expr): - """Wrap a CASE statement result in a Series and handle returning scalars. - - Parameters - ---------- - raw : ndarray[T] - The raw results of executing the ``CASE`` expression - expr : Value - The expression from the which `raw` was computed - - Returns - ------- - Union[scalar, Series] - """ - raw_1d = np.atleast_1d(raw) - if np.any(pd.isnull(raw_1d)): - result = pd.Series(raw_1d) - else: - result = pd.Series( - raw_1d, dtype=constants.IBIS_TYPE_TO_PANDAS_TYPE[expr.type()] - ) - if result.size == 1 and isinstance(expr, ir.Scalar): - value = result.iloc[0] - try: - return value.item() - except AttributeError: - return value - return result - - -def _build_select(op, whens, thens, otherwise, func=None, **kwargs): - if func is None: - func = lambda x: x - - whens_ = [] - grouped = 0 - for when in whens: - res = execute(when, **kwargs) - obj = getattr(res, "obj", res) - grouped += obj is not res - whens_.append(obj) - - thens_ = [] - for then in thens: - res = execute(then, **kwargs) - obj = getattr(res, "obj", res) - grouped += obj is not res - thens_.append(obj) - - if otherwise is None: - otherwise = np.nan - - raw = np.select(func(whens_), thens_, otherwise) - - if grouped: - return pd.Series(raw).groupby(get_grouping(res.grouper.groupings)) - return wrap_case_result(raw, op.to_expr()) - - -@execute_node.register(ops.SearchedCase, tuple, tuple, object) -def execute_searched_case(op, whens, thens, otherwise, **kwargs): - return _build_select(op, whens, thens, otherwise, **kwargs) - - -@execute_node.register(ops.SimpleCase, object, tuple, tuple, object) -def execute_simple_case_scalar(op, value, whens, thens, otherwise, **kwargs): - value = getattr(value, "obj", value) - return _build_select( - op, - whens, - thens, - otherwise, - func=lambda whens: np.asarray(whens) == value, - **kwargs, - ) - - -@execute_node.register(ops.SimpleCase, (pd.Series, SeriesGroupBy), tuple, tuple, object) -def execute_simple_case_series(op, value, whens, thens, otherwise, **kwargs): - value = getattr(value, "obj", value) - return _build_select( - op, - whens, - thens, - otherwise, - func=lambda whens: [value == when for when in whens], - **kwargs, - ) - - -@execute_node.register(ops.Distinct, pd.DataFrame) -def execute_distinct_dataframe(op, df, **kwargs): - return df.drop_duplicates() - - -@execute_node.register(ops.TableArrayView, pd.DataFrame) -def execute_table_array_view(op, _, **kwargs): - return execute(op.table).squeeze() - - -@execute_node.register(ops.InMemoryTable) -def execute_in_memory_table(op, **kwargs): - return op.data.to_frame() - - -@execute_node.register(ops.Sample, pd.DataFrame, object, object) -def execute_sample(op, data, fraction, seed, **kwargs): - return data.sample(frac=fraction, random_state=seed) diff --git a/ibis/backends/pandas/execution/join.py b/ibis/backends/pandas/execution/join.py deleted file mode 100644 index adf39079f6595..0000000000000 --- a/ibis/backends/pandas/execution/join.py +++ /dev/null @@ -1,183 +0,0 @@ -from __future__ import annotations - -import itertools - -import pandas as pd - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -from ibis.backends.pandas.core import execute -from 
ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution import constants -from ibis.common.exceptions import UnsupportedOperationError - - -def _compute_join_column(column, **kwargs): - if isinstance(column, ops.TableColumn): - new_column = column.name - else: - new_column = execute(column, **kwargs) - root_table, *_ = an.find_immediate_parent_tables(column) - return new_column, root_table - - -@execute_node.register(ops.CrossJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_cross_join(op, left, right, predicates, **kwargs): - """Execute a cross join in pandas. - - Notes - ----- - We create a dummy column of all :data:`True` instances and use that as the - join key. This results in the desired Cartesian product behavior guaranteed - by cross join. - """ - assert not predicates, "cross join predicates must be empty" - return pd.merge( - left, - right, - how="cross", - copy=False, - suffixes=constants.JOIN_SUFFIXES, - ) - - -def _get_semi_anti_join_filter(op, left, right, predicates, **kwargs): - left_on, right_on = _construct_join_predicate_columns( - op, - predicates, - **kwargs, - ) - inner = left.merge( - right[right_on].drop_duplicates(), - on=left_on, - how="left", - indicator=True, - ) - return (inner["_merge"] == "both").values - - -@execute_node.register(ops.LeftSemiJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_left_semi_join(op, left, right, predicates, **kwargs): - """Execute a left semi join in pandas.""" - inner_filt = _get_semi_anti_join_filter( - op, - left, - right, - predicates, - **kwargs, - ) - return left.loc[inner_filt, :] - - -@execute_node.register(ops.LeftAntiJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_left_anti_join(op, left, right, predicates, **kwargs): - """Execute a left anti join in pandas.""" - inner_filt = _get_semi_anti_join_filter( - op, - left, - right, - predicates, - **kwargs, - ) - return left.loc[~inner_filt, :] - - -def _construct_join_predicate_columns(op, predicates, **kwargs): - on = {op.left: [], op.right: []} - - for predicate in predicates: - if not isinstance(predicate, ops.Equals): - raise TypeError("Only equality join predicates supported with pandas") - new_left_column, left_pred_root = _compute_join_column(predicate.left, **kwargs) - on[left_pred_root].append(new_left_column) - - new_right_column, right_pred_root = _compute_join_column( - predicate.right, **kwargs - ) - on[right_pred_root].append(new_right_column) - return on[op.left], on[op.right] - - -@execute_node.register(ops.Join, pd.DataFrame, pd.DataFrame, tuple) -def execute_join(op, left, right, predicates, **kwargs): - op_type = type(op) - - try: - how = constants.JOIN_TYPES[op_type] - except KeyError: - raise UnsupportedOperationError(f"{op_type.__name__} not supported") - - left_on, right_on = _construct_join_predicate_columns(op, predicates, **kwargs) - - df = pd.merge( - left, - right, - how=how, - left_on=left_on, - right_on=right_on, - suffixes=constants.JOIN_SUFFIXES, - ) - return df - - -@execute_node.register( - ops.AsOfJoin, - pd.DataFrame, - pd.DataFrame, - tuple, - (pd.Timedelta, type(None)), - tuple, -) -def execute_asof_join(op, left, right, by, tolerance, predicates, **kwargs): - left_on, right_on = _extract_predicate_names(predicates) - left_by, right_by = _extract_predicate_names(by) - - # Add default join suffixes to predicates and groups and rename the - # corresponding columns before the `merge_asof`. 
If we don't do this and the - # predicates have the same column name, we lose the original RHS column - # values in the output. Instead, the RHS values are copies of the LHS values. - # xref https://github.com/ibis-project/ibis/issues/6080 - left_on_suffixed = [x + constants.JOIN_SUFFIXES[0] for x in left_on] - right_on_suffixed = [x + constants.JOIN_SUFFIXES[1] for x in right_on] - - left_by_suffixed = [x + constants.JOIN_SUFFIXES[0] for x in left_by] - right_by_suffixed = [x + constants.JOIN_SUFFIXES[1] for x in right_by] - - left = left.rename( - columns=dict( - itertools.chain( - zip(left_on, left_on_suffixed), zip(left_by, left_by_suffixed) - ) - ) - ) - right = right.rename( - columns=dict( - itertools.chain( - zip(right_on, right_on_suffixed), zip(right_by, right_by_suffixed) - ) - ) - ) - - return pd.merge_asof( - left=left, - right=right, - left_on=left_on_suffixed, - right_on=right_on_suffixed, - left_by=left_by_suffixed or None, - right_by=right_by_suffixed or None, - tolerance=tolerance, - suffixes=constants.JOIN_SUFFIXES, - ) - - -def _extract_predicate_names(predicates): - lefts = [] - rights = [] - for predicate in predicates: - if not isinstance(predicate, ops.Equals): - raise TypeError("Only equality join predicates supported with pandas") - left_name = predicate.left.name - right_name = predicate.right.name - lefts.append(left_name) - rights.append(right_name) - return lefts, rights diff --git a/ibis/backends/pandas/execution/maps.py b/ibis/backends/pandas/execution/maps.py deleted file mode 100644 index 2da84583362ca..0000000000000 --- a/ibis/backends/pandas/execution/maps.py +++ /dev/null @@ -1,208 +0,0 @@ -from __future__ import annotations - -import collections -import functools - -import numpy as np -import pandas as pd -import toolz - -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node - - -@execute_node.register(ops.Map, np.ndarray, np.ndarray) -def map_ndarray_ndarray(op, keys, values, **kwargs): - return dict(zip(keys, values)) - - -@execute_node.register(ops.Map, pd.Series, pd.Series) -def map_series_series(op, keys, values, **kwargs): - return keys.combine(values, lambda a, b: dict(zip(a, b))) - - -@execute_node.register(ops.MapLength, pd.Series) -def map_length_series(op, data, **kwargs): - # TODO: investigate whether calling a lambda is faster - return data.dropna().map(len).reindex(data.index) - - -@execute_node.register(ops.MapLength, (collections.abc.Mapping, type(None))) -def map_length_dict(op, data, **kwargs): - return None if data is None else len(data) - - -@execute_node.register(ops.MapGet, pd.Series, object, object) -def map_get_series_scalar_scalar(op, data, key, default, **kwargs): - return data.map(functools.partial(safe_get, key=key, default=default)) - - -@execute_node.register(ops.MapGet, pd.Series, object, pd.Series) -def map_get_series_scalar_series(op, data, key, default, **kwargs): - defaultiter = iter(default.values) - return data.map( - lambda mapping, key=key, defaultiter=defaultiter: safe_get( - mapping, key, next(defaultiter) - ) - ) - - -@execute_node.register(ops.MapGet, pd.Series, pd.Series, object) -def map_get_series_series_scalar(op, data, key, default, **kwargs): - keyiter = iter(key.values) - return data.map( - lambda mapping, keyiter=keyiter, default=default: safe_get( - mapping, next(keyiter), default - ) - ) - - -@execute_node.register(ops.MapGet, pd.Series, pd.Series, pd.Series) -def map_get_series_series_series(op, data, key, default): - keyiter = iter(key.values) - defaultiter = 
iter(default.values) - - def get(mapping, keyiter=keyiter, defaultiter=defaultiter): - return safe_get(mapping, next(keyiter), next(defaultiter)) - - return data.map(get) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, object, object) -def map_get_dict_scalar_scalar(op, data, key, default, **kwargs): - return safe_get(data, key, default) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, object, pd.Series) -def map_get_dict_scalar_series(op, data, key, default, **kwargs): - return default.map(lambda d, data=data, key=key: safe_get(data, key, d)) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, pd.Series, object) -def map_get_dict_series_scalar(op, data, key, default, **kwargs): - return key.map(lambda k, data=data, default=default: safe_get(data, k, default)) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, pd.Series, pd.Series) -def map_get_dict_series_series(op, data, key, default, **kwargs): - defaultiter = iter(default.values) - return key.map( - lambda k, data=data, defaultiter=defaultiter: safe_get( - data, k, next(defaultiter) - ) - ) - - -@execute_node.register(ops.MapContains, collections.abc.Mapping, object) -def map_contains_dict_object(op, data, key, **kwargs): - return safe_contains(data, key) - - -@execute_node.register(ops.MapContains, collections.abc.Mapping, pd.Series) -def map_contains_dict_series(op, data, key, **kwargs): - return key.map(lambda k, data=data: safe_contains(data, k)) - - -@execute_node.register(ops.MapContains, pd.Series, object) -def map_contains_series_object(op, data, key, **kwargs): - return data.map(lambda d: safe_contains(d, key)) - - -@execute_node.register(ops.MapContains, pd.Series, pd.Series) -def map_contains_series_series(op, data, key, **kwargs): - return data.combine(key, lambda d, k: safe_contains(d, k)) - - -def safe_method(mapping, method, *args, **kwargs): - if mapping is None: - return None - try: - method = getattr(mapping, method) - except AttributeError: - return None - else: - return method(*args, **kwargs) - - -def safe_get(mapping, key, default=None): - return safe_method(mapping, "get", key, default) - - -def safe_contains(mapping, key): - return safe_method(mapping, "__contains__", key) - - -def safe_keys(mapping): - result = safe_method(mapping, "keys") - if result is None: - return None - # list(...) to unpack iterable - return np.array(list(result)) - - -def safe_values(mapping): - result = safe_method(mapping, "values") - if result is None: - return None - # list(...) to unpack iterable - return np.array(list(result), dtype="object") - - -@execute_node.register(ops.MapKeys, pd.Series) -def map_keys_series(op, data, **kwargs): - return data.map(safe_keys) - - -@execute_node.register(ops.MapKeys, (collections.abc.Mapping, type(None))) -def map_keys_dict(op, data, **kwargs): - if data is None: - return None - # list(...) to unpack iterable - return np.array(list(data.keys())) - - -@execute_node.register(ops.MapValues, pd.Series) -def map_values_series(op, data, **kwargs): - res = data.map(safe_values) - return res - - -@execute_node.register(ops.MapValues, (collections.abc.Mapping, type(None))) -def map_values_dict(op, data, **kwargs): - if data is None: - return None - # list(...) 
to unpack iterable - return np.array(list(data.values())) - - -def safe_merge(*maps): - return None if any(m is None for m in maps) else toolz.merge(*maps) - - -@execute_node.register( - ops.MapMerge, - (collections.abc.Mapping, type(None)), - (collections.abc.Mapping, type(None)), -) -def map_merge_dict_dict(op, lhs, rhs, **kwargs): - return safe_merge(lhs, rhs) - - -@execute_node.register(ops.MapMerge, (collections.abc.Mapping, type(None)), pd.Series) -def map_merge_dict_series(op, lhs, rhs, **kwargs): - if lhs is None: - return pd.Series([None] * len(rhs)) - return rhs.map(lambda m, lhs=lhs: safe_merge(lhs, m)) - - -@execute_node.register(ops.MapMerge, pd.Series, (collections.abc.Mapping, type(None))) -def map_merge_series_dict(op, lhs, rhs, **kwargs): - if rhs is None: - return pd.Series([None] * len(lhs)) - return lhs.map(lambda m, rhs=rhs: safe_merge(m, rhs)) - - -@execute_node.register(ops.MapMerge, pd.Series, pd.Series) -def map_merge_series_series(op, lhs, rhs, **kwargs): - rhsiter = iter(rhs.values) - return lhs.map(lambda m, rhsiter=rhsiter: safe_merge(m, next(rhsiter))) diff --git a/ibis/backends/pandas/execution/selection.py b/ibis/backends/pandas/execution/selection.py deleted file mode 100644 index b1f8a0ee66599..0000000000000 --- a/ibis/backends/pandas/execution/selection.py +++ /dev/null @@ -1,337 +0,0 @@ -"""Dispatching code for Selection operations.""" - -from __future__ import annotations - -import functools -import operator -from collections import defaultdict -from typing import TYPE_CHECKING, Any - -import pandas as pd -from toolz import concatv, first - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -import ibis.expr.types as ir -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution import constants, util -from ibis.backends.pandas.execution.util import coerce_to_output - -if TYPE_CHECKING: - from collections.abc import Iterable - - from ibis.backends.base.df.timecontext import TimeContext - - -def compute_projection( - node: ops.Node, - parent: ops.Selection, - data: pd.DataFrame, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - **kwargs: Any, -): - """Compute a projection. - - `ibis.expr.types.Scalar` instances occur when a specific column projection - is a window operation. 
- """ - if isinstance(node, ops.TableNode): - if node == parent.table: - return data - - assert isinstance(parent.table, ops.Join) - assert node in (parent.table.left, parent.table.right) - - mapping = remap_overlapping_column_names( - parent.table, - root_table=node, - data_columns=frozenset(data.columns), - ) - return map_new_column_names_to_data(mapping, data) - elif isinstance(node, ops.Value): - name = node.name - assert name is not None, "Value selection name is None" - - if node.shape.is_scalar(): - data_columns = frozenset(data.columns) - - if scope is None: - scope = Scope() - - scope = scope.merge_scopes( - Scope( - { - t: map_new_column_names_to_data( - remap_overlapping_column_names( - parent.table, t, data_columns - ), - data, - ) - }, - timecontext, - ) - for t in an.find_immediate_parent_tables(node) - ) - scalar = execute(node, scope=scope, **kwargs) - result = pd.Series([scalar], name=name).repeat(len(data.index)) - result.index = data.index - return result - else: - if isinstance(node, ops.TableColumn): - if name in data: - return data[name].rename(name) - - if not isinstance(parent.table, ops.Join): - raise KeyError(name) - - suffix = util.get_join_suffix_for_op(node, parent.table) - return data.loc[:, name + suffix].rename(name) - - data_columns = frozenset(data.columns) - - scope = scope.merge_scopes( - Scope( - { - t: map_new_column_names_to_data( - remap_overlapping_column_names( - parent.table, t, data_columns - ), - data, - ) - }, - timecontext, - ) - for t in an.find_immediate_parent_tables(node) - ) - - result = execute(node, scope=scope, timecontext=timecontext, **kwargs) - return coerce_to_output(result, node, data.index) - else: - raise TypeError(node) - - -def remap_overlapping_column_names(table, root_table, data_columns): - """Return a mapping of suffixed column names to column names without suffixes. - - Parameters - ---------- - table : TableNode - The ``TableNode`` we're selecting from. - root_table : TableNode - The root table of the expression we're selecting from. - data_columns - The available columns to select from - - Returns - ------- - dict[str, str] - A mapping from possibly-suffixed column names to column names without - suffixes. - """ - if not isinstance(table, ops.Join): - return None - - left_root, right_root = an.find_immediate_parent_tables([table.left, table.right]) - suffixes = { - left_root: constants.LEFT_JOIN_SUFFIX, - right_root: constants.RIGHT_JOIN_SUFFIX, - } - - # if we're selecting from the root table and that's not the left or right - # child, don't add a suffix - # - # this can happen when selecting directly from a join as opposed to - # explicitly referencing the left or right tables - # - # we use setdefault here because the root_table can be the left/right table - # which we may have already put into `suffixes` - suffixes.setdefault(root_table, "") - - suffix = suffixes[root_table] - - column_names = [ - ({name, f"{name}{suffix}"} & data_columns, name) - for name in root_table.schema.names - ] - mapping = { - first(col_name): final_name for col_name, final_name in column_names if col_name - } - return mapping - - -def map_new_column_names_to_data(mapping, df): - if mapping: - return df.loc[:, mapping.keys()].rename(columns=mapping) - return df - - -def _compute_predicates( - table_op: ops.TableNode, - predicates: Iterable[ir.BooleanColumn], - data: pd.DataFrame, - scope: Scope, - timecontext: TimeContext | None, - **kwargs: Any, -) -> pd.Series: - """Compute the predicates for a table operation. 
- - This handles the cases where `predicates` are computed columns, in addition - to the simple case of named columns coming directly from the input table. - """ - for predicate in predicates: - # Map each root table of the predicate to the data so that we compute - # predicates on the result instead of any left or right tables if the - # Selection is on a Join. Project data to only include columns from - # the root table. - root_tables = an.find_immediate_parent_tables(predicate) - - # handle suffixes - data_columns = frozenset(data.columns) - - additional_scope = Scope() - for root_table in root_tables: - mapping = remap_overlapping_column_names(table_op, root_table, data_columns) - new_data = map_new_column_names_to_data(mapping, data) - additional_scope = additional_scope.merge_scope( - Scope({root_table: new_data}, timecontext) - ) - - scope = scope.merge_scope(additional_scope) - yield execute(predicate, scope=scope, **kwargs) - - -def build_df_from_selection( - selections: list[ops.Value], - data: pd.DataFrame, - table: ops.Node, -) -> pd.DataFrame: - """Build up a df by doing direct selections, renaming if necessary. - - Special logic for: - - Joins where suffixes have been added to column names - - Cases where new columns are created and selected. - """ - cols = defaultdict(list) - - for node in selections: - selection = node.name - if selection not in data: - if not isinstance(table, ops.Join): - raise KeyError(selection) - join_suffix = util.get_join_suffix_for_op(node, table) - if selection + join_suffix not in data: - raise KeyError(selection) - selection += join_suffix - cols[selection].append(node.name) - - result = data[list(cols.keys())] - - renamed_cols = {} - for from_col, to_cols in cols.items(): - if len(to_cols) == 1 and from_col != to_cols[0]: - renamed_cols[from_col] = to_cols[0] - else: - for new_col in to_cols: - if from_col != new_col: - result[new_col] = result[from_col] - - if renamed_cols: - result = result.rename(columns=renamed_cols) - - return result - - -def build_df_from_projection( - selection_exprs: list[ir.Expr], - op: ops.Selection, - data: pd.DataFrame, - **kwargs, -) -> pd.DataFrame: - data_pieces = [ - compute_projection(node, op, data, **kwargs) for node in selection_exprs - ] - - new_pieces = [ - piece.reset_index(level=list(range(1, piece.index.nlevels)), drop=True) - if piece.index.nlevels > 1 - else piece - for piece in data_pieces - ] - # Result series might be trimmed by time context, thus index may - # have changed. To concat rows properly, we first `sort_index` on - # each pieces then assign data index manually to series - # - # If cardinality changes (e.g. unnest/explode), trying to do this - # won't work so don't try? 
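# Illustrative sketch (not part of the patched file) of the piece
# re-alignment described in the comment above and implemented in the loop
# that follows: each projected piece is sorted by index and, when its length
# matches the parent frame, re-assigned the parent's index before the
# axis=1 concat. The frame and column names here are made up.
import pandas as pd

data = pd.DataFrame({"a": [3, 1, 2]}, index=[2, 0, 1])
pieces = [data["a"] * 2, data["a"].rename("b")]

aligned = []
for piece in pieces:
    piece = piece.sort_index()
    if len(piece.index) == len(data.index):
        piece.index = data.index
    aligned.append(piece)

print(pd.concat(aligned, axis=1))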
- for i, piece in enumerate(new_pieces): - new_pieces[i] = piece.sort_index() - if len(new_pieces[i].index) == len(data.index): - new_pieces[i].index = data.index - - return pd.concat(new_pieces, axis=1) - - -@execute_node.register(ops.Selection, pd.DataFrame) -def execute_selection_dataframe( - op, - data, - scope: Scope, - timecontext: TimeContext | None, - **kwargs, -): - result = data - - # Build up the individual pandas structures from column expressions - if op.selections: - if all(isinstance(s, ops.TableColumn) for s in op.selections): - result = build_df_from_selection(op.selections, data, op.table) - else: - result = build_df_from_projection( - op.selections, - op, - data, - scope=scope, - timecontext=timecontext, - **kwargs, - ) - - if op.predicates: - predicates = _compute_predicates( - op.table, op.predicates, data, scope, timecontext, **kwargs - ) - predicate = functools.reduce(operator.and_, predicates) - assert len(predicate) == len( - result - ), "Selection predicate length does not match underlying table" - result = result.loc[predicate] - - if op.sort_keys: - result, grouping_keys, ordering_keys = util.compute_sorted_frame( - result, - order_by=op.sort_keys, - scope=scope, - timecontext=timecontext, - **kwargs, - ) - else: - grouping_keys = ordering_keys = () - - # return early if we do not have any temporary grouping or ordering columns - assert not grouping_keys, "group by should never show up in Selection" - if not ordering_keys: - return result - - # create a sequence of columns that we need to drop - temporary_columns = pd.Index(concatv(grouping_keys, ordering_keys)).difference( - data.columns - ) - - # no reason to call drop if we don't need to - if temporary_columns.empty: - return result - - # drop every temporary column we created for ordering or grouping - return result.drop(temporary_columns, axis=1) diff --git a/ibis/backends/pandas/execution/strings.py b/ibis/backends/pandas/execution/strings.py deleted file mode 100644 index 66e325b6d3671..0000000000000 --- a/ibis/backends/pandas/execution/strings.py +++ /dev/null @@ -1,560 +0,0 @@ -from __future__ import annotations - -import itertools -import json -import operator -from functools import partial, reduce -from urllib.parse import parse_qs, urlsplit - -import numpy as np -import pandas as pd -import toolz -from pandas.core.groupby import SeriesGroupBy - -try: - import regex as re -except ImportError: - import re - -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.pandas.core import execute, integer_types, scalar_types -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.StringLength, pd.Series) -def execute_string_length_series(op, data, **kwargs): - return data.str.len().astype("int32") - - -@execute_node.register( - ops.Substring, pd.Series, integer_types, (type(None), *integer_types) -) -def execute_substring_int_int(op, data, start, length, **kwargs): - if length is None: - return data.str[start:] - else: - return data.str[start : start + length] - - -@execute_node.register(ops.Substring, pd.Series, pd.Series, integer_types) -def execute_substring_series_int(op, data, start, length, **kwargs): - return execute_substring_series_series( - op, data, start, pd.Series(np.repeat(length, len(start))), **kwargs - ) - - -@execute_node.register(ops.Substring, pd.Series, integer_types, pd.Series) -def execute_string_substring_int_series(op, data, start, length, **kwargs): - return 
execute_substring_series_series( - op, data, pd.Series(np.repeat(start, len(length))), length, **kwargs - ) - - -@execute_node.register(ops.Substring, pd.Series, pd.Series, pd.Series) -def execute_substring_series_series(op, data, start, length, **kwargs): - end = start + length - - return pd.Series( - [ - None - if (begin is not None and pd.isnull(begin)) - or (stop is not None and pd.isnull(stop)) - else value[begin:stop] - for value, begin, stop in zip(data, start.values, end.values) - ], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Strip, pd.Series) -def execute_string_strip(op, data, **kwargs): - return data.str.strip() - - -@execute_node.register(ops.LStrip, pd.Series) -def execute_string_lstrip(op, data, **kwargs): - return data.str.lstrip() - - -@execute_node.register(ops.RStrip, pd.Series) -def execute_string_rstrip(op, data, **kwargs): - return data.str.rstrip() - - -@execute_node.register( - ops.LPad, pd.Series, (pd.Series,) + integer_types, (pd.Series, str) -) -def execute_string_lpad(op, data, length, pad, **kwargs): - return data.str.pad(length, side="left", fillchar=pad) - - -@execute_node.register( - ops.RPad, pd.Series, (pd.Series,) + integer_types, (pd.Series, str) -) -def execute_string_rpad(op, data, length, pad, **kwargs): - return data.str.pad(length, side="right", fillchar=pad) - - -@execute_node.register(ops.Reverse, pd.Series) -def execute_string_reverse(op, data, **kwargs): - return data.str[::-1] - - -@execute_node.register(ops.Lowercase, pd.Series) -def execute_string_lower(op, data, **kwargs): - return data.str.lower() - - -@execute_node.register(ops.Uppercase, pd.Series) -def execute_string_upper(op, data, **kwargs): - return data.str.upper() - - -@execute_node.register(ops.Capitalize, (pd.Series, str)) -def execute_string_capitalize(op, data, **kwargs): - return getattr(data, "str", data).capitalize() - - -@execute_node.register(ops.Repeat, pd.Series, (pd.Series,) + integer_types) -def execute_string_repeat(op, data, times, **kwargs): - return data.str.repeat(times) - - -@execute_node.register(ops.StringContains, pd.Series, (pd.Series, str)) -def execute_string_contains(_, data, needle, **kwargs): - return data.str.contains(needle) - - -@execute_node.register( - ops.StringFind, - pd.Series, - (pd.Series, str), - (pd.Series, type(None)) + integer_types, - (pd.Series, type(None)) + integer_types, -) -def execute_string_find(op, data, needle, start, end, **kwargs): - return data.str.find(needle, start, end) - - -def _sql_like_to_regex(pattern, escape): - cur_i = 0 - pattern_length = len(pattern) - - while cur_i < pattern_length: - nxt_i = cur_i + 1 - - cur = pattern[cur_i] - nxt = pattern[nxt_i] if nxt_i < pattern_length else None - - skip = 1 - - if nxt is not None and escape is not None and cur == escape: - yield nxt - skip = 2 - elif cur == "%": - yield ".*" - elif cur == "_": - yield "." - else: - yield cur - - cur_i += skip - - -def sql_like_to_regex(pattern: str, escape: str | None = None) -> str: - """Convert a SQL `LIKE` pattern to an equivalent Python regular expression. - - Parameters - ---------- - pattern - A LIKE pattern with the following semantics: - * `%` matches zero or more characters - * `_` matches exactly one character - * To escape `%` and `_` (or to match the `escape` parameter - itself), prefix the desired character with `escape`. - escape - Escape character - - Returns - ------- - str - A regular expression pattern equivalent to the input SQL `LIKE` pattern. 
- - Examples - -------- - >>> sql_like_to_regex("6%") # default is to not escape anything - '^6.*$' - >>> sql_like_to_regex("6^%", escape="^") - '^6%$' - >>> sql_like_to_regex("6_") - '^6.$' - >>> sql_like_to_regex("6/_", escape="/") - '^6_$' - >>> sql_like_to_regex("%abc") # any string ending with "abc" - '^.*abc$' - >>> sql_like_to_regex("abc%") # any string starting with "abc" - '^abc.*$' - """ - return f"^{''.join(_sql_like_to_regex(pattern, escape))}$" - - -@execute_node.register(ops.StringSQLLike, pd.Series, str, (str, type(None))) -def execute_string_like_series_string(op, data, pattern, escape, **kwargs): - new_pattern = sql_like_to_regex(pattern, escape=escape) - return data.str.contains(new_pattern, regex=True) - - -@execute_node.register(ops.StringSQLLike, SeriesGroupBy, str, str) -def execute_string_like_series_groupby_string(op, data, pattern, escape, **kwargs): - return execute_string_like_series_string( - op, data.obj, pattern, escape, **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.GroupConcat, pd.Series, str, (pd.Series, type(None))) -def execute_group_concat_series_mask(op, data, sep, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - lambda series, sep=sep: sep.join(series.values), - ) - - -@execute_node.register(ops.GroupConcat, SeriesGroupBy, str, type(None)) -def execute_group_concat_series_gb(op, data, sep, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda data, sep=sep: sep.join(data.values.astype(str))) - - -@execute_node.register(ops.GroupConcat, SeriesGroupBy, str, SeriesGroupBy) -def execute_group_concat_series_gb_mask(op, data, sep, mask, aggcontext=None, **kwargs): - def method(series, sep=sep): - if series.empty: - return pd.NA - return sep.join(series.values.astype(str)) - - return aggcontext.agg( - data, - lambda data, mask=mask.obj, method=method: method(data[mask[data.index]]), - ) - - -@execute_node.register(ops.StringAscii, pd.Series) -def execute_string_ascii(op, data, **kwargs): - return data.map(ord).astype("int32") - - -@execute_node.register(ops.StringAscii, SeriesGroupBy) -def execute_string_ascii_group_by(op, data, **kwargs): - return execute_string_ascii(op, data, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.RegexSearch, pd.Series, str) -def execute_series_regex_search(op, data, pattern, **kwargs): - pattern = re.compile(pattern) - return data.map(lambda x, pattern=pattern: pattern.search(x) is not None) - - -@execute_node.register(ops.RegexSearch, SeriesGroupBy, str) -def execute_series_regex_search_gb(op, data, pattern, **kwargs): - return execute_series_regex_search( - op, data, getattr(pattern, "obj", pattern), **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.StartsWith, pd.Series, str) -def execute_series_starts_with(op, data, pattern, **kwargs): - return data.str.startswith(pattern) - - -@execute_node.register(ops.EndsWith, pd.Series, str) -def execute_series_ends_with(op, data, pattern, **kwargs): - return data.str.endswith(pattern) - - -@execute_node.register(ops.RegexExtract, pd.Series, str, integer_types) -def execute_series_regex_extract(op, data, pattern, index, **kwargs): - pattern = re.compile(pattern) - return pd.Series( - [ - None if (match is None or index > match.lastindex) else match[index] - for match in map(pattern.search, data) - ], - dtype=data.dtype, - 
name=data.name, - ) - - -@execute_node.register(ops.RegexExtract, SeriesGroupBy, str, integer_types) -def execute_series_regex_extract_gb(op, data, pattern, index, **kwargs): - return execute_series_regex_extract(op, data.obj, pattern, index, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.RegexReplace, pd.Series, str, str) -def execute_series_regex_replace(op, data, pattern, replacement, **kwargs): - pattern = re.compile(pattern) - - def replacer(x, pattern=pattern): - return pattern.sub(replacement, x) - - return data.apply(replacer) - - -@execute_node.register(ops.RegexReplace, str, str, str) -def execute_str_regex_replace(_, arg, pattern, replacement, **kwargs): - return re.sub(pattern, replacement, arg) - - -@execute_node.register(ops.RegexReplace, SeriesGroupBy, str, str) -def execute_series_regex_replace_gb(op, data, pattern, replacement, **kwargs): - return execute_series_regex_replace( - data.obj, pattern, replacement, **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Translate, pd.Series, pd.Series, pd.Series) -def execute_series_translate_series_series(op, data, from_string, to_string, **kwargs): - tables = [ - str.maketrans(source, target) for source, target in zip(from_string, to_string) - ] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, pd.Series, str) -def execute_series_translate_series_scalar(op, data, from_string, to_string, **kwargs): - tables = [str.maketrans(source, to_string) for source in from_string] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, str, pd.Series) -def execute_series_translate_scalar_series(op, data, from_string, to_string, **kwargs): - tables = [str.maketrans(from_string, target) for target in to_string] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, str, str) -def execute_series_translate_scalar_scalar(op, data, from_string, to_string, **kwargs): - return data.str.translate(str.maketrans(from_string, to_string)) - - -@execute_node.register(ops.StrRight, pd.Series, integer_types) -def execute_series_right(op, data, nchars, **kwargs): - return data.str[-nchars:] - - -@execute_node.register(ops.StrRight, SeriesGroupBy, integer_types) -def execute_series_right_gb(op, data, nchars, **kwargs): - return execute_series_right(op, data.obj, nchars).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.StringReplace, pd.Series, (pd.Series, str), (pd.Series, str)) -def execute_series_string_replace(_, data, needle, replacement, **kwargs): - return data.str.replace(needle, replacement) - - -@execute_node.register(ops.StringJoin, (pd.Series, str), tuple) -def execute_series_join_scalar_sep(op, sep, args, **kwargs): - data = [execute(arg, **kwargs) for arg in args] - return reduce(lambda x, y: x + sep + y, data) - - -def haystack_to_series_of_lists(haystack, index=None): - if index is None: - index = toolz.first( - piece.index for piece in haystack if hasattr(piece, "index") - ) - pieces = reduce( - operator.add, - ( - pd.Series(getattr(piece, "values", piece), index=index).map( - 
ibis.util.promote_list - ) - for piece in haystack - ), - ) - return pieces - - -@execute_node.register(ops.FindInSet, pd.Series, tuple) -def execute_series_find_in_set(op, needle, haystack, **kwargs): - haystack = [execute(arg, **kwargs) for arg in haystack] - pieces = haystack_to_series_of_lists(haystack, index=needle.index) - index = itertools.count() - return pieces.map( - lambda elements, needle=needle, index=index: ( - ibis.util.safe_index(elements, needle.iat[next(index)]) - ) - ) - - -@execute_node.register(ops.FindInSet, SeriesGroupBy, list) -def execute_series_group_by_find_in_set(op, needle, haystack, **kwargs): - pieces = [getattr(piece, "obj", piece) for piece in haystack] - return execute_series_find_in_set(op, needle.obj, pieces, **kwargs).groupby( - get_grouping(needle.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.FindInSet, scalar_types, list) -def execute_string_group_by_find_in_set(op, needle, haystack, **kwargs): - # `list` could contain series, series groupbys, or scalars - # mixing series and series groupbys is not allowed - series_in_haystack = [ - type(piece) - for piece in haystack - if isinstance(piece, (pd.Series, SeriesGroupBy)) - ] - - if not series_in_haystack: - return ibis.util.safe_index(haystack, needle) - - try: - (collection_type,) = frozenset(map(type, series_in_haystack)) - except ValueError: - raise ValueError("Mixing Series and SeriesGroupBy is not allowed") - - pieces = haystack_to_series_of_lists( - [getattr(piece, "obj", piece) for piece in haystack] - ) - - result = pieces.map(toolz.flip(ibis.util.safe_index)(needle)) - if issubclass(collection_type, pd.Series): - return result - - assert issubclass(collection_type, SeriesGroupBy) - - return result.groupby( - get_grouping( - toolz.first( - piece.grouper.groupings - for piece in haystack - if hasattr(piece, "grouper") - ) - ), - group_keys=False, - ) - - -def try_getitem(value, key): - try: - # try to deserialize the value -> return None if it's None - if (js := json.loads(value)) is None: - return None - except (json.JSONDecodeError, TypeError): - # if there's an error related to decoding or a type error return None - return None - - try: - # try to extract the value as an array element or mapping key - return js[key] - except (KeyError, IndexError, TypeError): - # KeyError: missing mapping key - # IndexError: missing sequence key - # TypeError: `js` doesn't implement __getitem__, either at all or for - # the type of `key` - return None - - -@execute_node.register(ops.JSONGetItem, pd.Series, (str, int)) -def execute_json_getitem_series_str_int(_, data, key, **kwargs): - return pd.Series(map(partial(try_getitem, key=key), data), dtype="object") - - -@execute_node.register(ops.JSONGetItem, pd.Series, pd.Series) -def execute_json_getitem_series_series(_, data, key, **kwargs): - return pd.Series(map(try_getitem, data, key), dtype="object") - - -def _extract_url_field(data, field_name): - if isinstance(data, str): - return getattr(urlsplit(data), field_name, "") - - return pd.Series( - [getattr(urlsplit(string), field_name, "") for string in data], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.ExtractProtocol, (pd.Series, str)) -def execute_extract_protocol(op, data, **kwargs): - return _extract_url_field(data, "scheme") - - -@execute_node.register(ops.ExtractAuthority, (pd.Series, str)) -def execute_extract_authority(op, data, **kwargs): - return _extract_url_field(data, "netloc") - - -@execute_node.register(ops.ExtractPath, (pd.Series, str)) 
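# Illustrative sketch (not part of the patched file): the URL extraction ops
# above and below each map to a single attribute of urllib's urlsplit
# result. The example URL is made up.
from urllib.parse import urlsplit

parts = urlsplit("https://user:pw@example.com:8080/docs/index.html?q=ibis#top")
print(parts.scheme)    # ExtractProtocol        -> "https"
print(parts.netloc)    # ExtractAuthority       -> "user:pw@example.com:8080"
print(parts.path)      # ExtractPath            -> "/docs/index.html"
print(parts.query)     # ExtractQuery (no key)  -> "q=ibis"
print(parts.fragment)  # ExtractFragment        -> "top"
print(parts.hostname)  # ExtractHost            -> "example.com"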
-def execute_extract_path(op, data, **kwargs): - return _extract_url_field(data, "path") - - -@execute_node.register(ops.ExtractFragment, (pd.Series, str)) -def execute_extract_fragment(op, data, **kwargs): - return _extract_url_field(data, "fragment") - - -@execute_node.register(ops.ExtractHost, (pd.Series, str)) -def execute_extract_host(op, data, **kwargs): - return _extract_url_field(data, "hostname") - - -@execute_node.register(ops.ExtractQuery, (pd.Series, str), (str, type(None))) -def execute_extract_query(op, data, key, **kwargs): - def extract_query_param(url, param_name): - query = urlsplit(url).query - if param_name is not None: - value = parse_qs(query)[param_name] - return value if len(value) > 1 else value[0] - else: - return query - - if isinstance(data, str): - return extract_query_param(data, key) - - return pd.Series( - [extract_query_param(url, key) for url in data], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.ExtractUserInfo, (pd.Series, str)) -def execute_extract_user_info(op, data, **kwargs): - def extract_user_info(url): - url_parts = urlsplit(url) - - username = url_parts.username or "" - password = url_parts.password or "" - - return f"{username}:{password}" - - if isinstance(data, str): - return extract_user_info(data) - - return pd.Series( - [extract_user_info(string) for string in data], - dtype=data.dtype, - name=data.name, - ) diff --git a/ibis/backends/pandas/execution/structs.py b/ibis/backends/pandas/execution/structs.py deleted file mode 100644 index a2bcf7a94e115..0000000000000 --- a/ibis/backends/pandas/execution/structs.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Pandas backend execution of struct fields and literals.""" - -from __future__ import annotations - -import collections -import functools - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.StructField, (collections.abc.Mapping, pd.DataFrame)) -def execute_node_struct_field_dict(op, data, **kwargs): - return data[op.field] - - -@execute_node.register(ops.StructField, (type(None), type(pd.NA), float)) -def execute_node_struct_field_none(op, data, **_): - assert (isinstance(data, float) and pd.isna(data)) or not isinstance(data, float) - return pd.NA - - -def _safe_getter(value, field: str): - if pd.isna(value): - return pd.NA - else: - return value[field] - - -@execute_node.register(ops.StructField, pd.Series) -def execute_node_struct_field_series(op, data, **kwargs): - getter = functools.partial(_safe_getter, field=op.field) - return data.map(getter).rename(op.field) - - -@execute_node.register(ops.StructField, SeriesGroupBy) -def execute_node_struct_field_series_group_by(op, data, **kwargs): - getter = functools.partial(_safe_getter, field=op.field) - groupings = get_grouping(data.grouper.groupings) - return data.obj.map(getter).rename(op.field).groupby(groupings, group_keys=False) diff --git a/ibis/backends/pandas/execution/temporal.py b/ibis/backends/pandas/execution/temporal.py deleted file mode 100644 index a2f2b5d8b5ec8..0000000000000 --- a/ibis/backends/pandas/execution/temporal.py +++ /dev/null @@ -1,341 +0,0 @@ -from __future__ import annotations - -import datetime - -import numpy as np -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.base import 
BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import ( - date_types, - integer_types, - numeric_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.Strftime, pd.Timestamp, str) -def execute_strftime_timestamp_str(op, data, format_string, **kwargs): - return data.strftime(format_string) - - -@execute_node.register(ops.Strftime, pd.Series, str) -def execute_strftime_series_str(op, data, format_string, **kwargs): - return data.dt.strftime(format_string) - - -@execute_node.register(ops.ExtractTemporalField, datetime.datetime) -def execute_extract_timestamp_field_timestamp(op, data, **kwargs): - field_name = type(op).__name__.lower().replace("extract", "") - return getattr(data, field_name) - - -@execute_node.register(ops.ExtractTemporalField, pd.Series) -def execute_extract_timestamp_field_series(op, data, **kwargs): - field_name = type(op).__name__.lower().replace("extract", "") - if field_name == "weekofyear": - return data.dt.isocalendar().week.astype(np.int32) - return getattr(data.dt, field_name).astype(np.int32) - - -@execute_node.register(ops.ExtractMillisecond, datetime.datetime) -def execute_extract_millisecond_timestamp(op, data, **kwargs): - return int(data.microsecond // 1_000) - - -@execute_node.register(ops.ExtractMicrosecond, datetime.datetime) -def execute_extract_microsecond_timestamp(op, data, **kwargs): - return int(data.microsecond) - - -@execute_node.register(ops.ExtractMillisecond, pd.Series) -def execute_extract_millisecond_series(op, data, **kwargs): - return (data.dt.microsecond // 1_000).astype(np.int32) - - -@execute_node.register(ops.ExtractMicrosecond, pd.Series) -def execute_extract_microsecond_series(op, data, **kwargs): - return data.dt.microsecond.astype(np.int32) - - -@execute_node.register(ops.ExtractEpochSeconds, pd.Series) -def execute_epoch_seconds_series(op, data, **kwargs): - return ( - data.astype("datetime64[ns]") - .astype("int64") - .floordiv(1_000_000_000) - .astype("int32") - ) - - -@execute_node.register(ops.ExtractEpochSeconds, (pd.Timestamp, datetime.datetime)) -def execute_epoch_seconds_literal(op, data, **kwargs): - return pd.Timestamp(data).floor("s").value // 1_000_000_000 - - -@execute_node.register( - ops.BetweenTime, - pd.Series, - (pd.Series, str, datetime.time), - (pd.Series, str, datetime.time), -) -def execute_between_time(op, data, lower, upper, **kwargs): - idx = pd.DatetimeIndex(data) - if idx.tz is not None: - idx = idx.tz_convert(None) # make naive because times are naive - indexer = idx.indexer_between_time(lower, upper) - result = np.zeros(len(data), dtype=np.bool_) - result[indexer] = True - return pd.Series(result) - - -@execute_node.register(ops.Date, pd.Series) -def execute_timestamp_date(op, data, **kwargs): - return data.dt.floor("d") - - -PANDAS_UNITS = { - "m": "Min", - "ms": "L", -} - - -@execute_node.register((ops.TimestampTruncate, ops.DateTruncate), pd.Series) -def execute_timestamp_truncate(op, data, **kwargs): - dt = data.dt - unit = PANDAS_UNITS.get(op.unit.short, op.unit.short) - try: - return dt.floor(unit) - except ValueError: - return dt.to_period(unit).dt.to_timestamp() - - -OFFSET_CLASS = { - "Y": pd.offsets.DateOffset, - "Q": pd.offsets.DateOffset, - "M": pd.offsets.DateOffset, - "W": pd.offsets.DateOffset, - # all other units are timedelta64s -} - - -@execute_node.register(ops.IntervalFromInteger, 
pd.Series) -def execute_interval_from_integer_series(op, data, **kwargs): - unit = op.unit.short - resolution = op.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - # fast path for timedelta conversion - if cls is None: - return data.astype(f"timedelta64[{unit}]") - return data.apply(lambda n, cls=cls, resolution=resolution: cls(**{resolution: n})) - - -@execute_node.register(ops.IntervalFromInteger, integer_types) -def execute_interval_from_integer_integer_types(op, data, **kwargs): - unit = op.unit.short - resolution = op.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return pd.Timedelta(data, unit=unit) - return cls(**{resolution: data}) - - -@execute_node.register(ops.Cast, pd.Series, dt.Interval) -def execute_cast_integer_to_interval_series(op, data, type, **kwargs): - to = op.to - unit = to.unit.short - resolution = to.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return data.astype(f"timedelta64[{unit}]") - return data.apply(lambda n, cls=cls, resolution=resolution: cls(**{resolution: n})) - - -@execute_node.register(ops.Cast, integer_types, dt.Interval) -def execute_cast_integer_to_interval_integer_types(op, data, type, **kwargs): - to = op.to - unit = to.unit.short - resolution = to.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return pd.Timedelta(data, unit=unit) - return cls(**{resolution: data}) - - -@execute_node.register(ops.TimestampAdd, timestamp_types, timedelta_types) -def execute_timestamp_add_datetime_timedelta(op, left, right, **kwargs): - return pd.Timestamp(left) + pd.Timedelta(right) - - -@execute_node.register(ops.TimestampAdd, timestamp_types, pd.Series) -def execute_timestamp_add_datetime_series(op, left, right, **kwargs): - return pd.Timestamp(left) + right - - -@execute_node.register(ops.IntervalAdd, timedelta_types, timedelta_types) -def execute_interval_add_delta_delta(op, left, right, **kwargs): - return op.op(pd.Timedelta(left), pd.Timedelta(right)) - - -@execute_node.register(ops.IntervalAdd, timedelta_types, pd.Series) -@execute_node.register( - ops.IntervalMultiply, timedelta_types, numeric_types + (pd.Series,) -) -def execute_interval_add_multiply_delta_series(op, left, right, **kwargs): - return op.op(pd.Timedelta(left), right) - - -@execute_node.register((ops.TimestampAdd, ops.IntervalAdd), pd.Series, timedelta_types) -def execute_timestamp_interval_add_series_delta(op, left, right, **kwargs): - return left + pd.Timedelta(right) - - -@execute_node.register((ops.TimestampAdd, ops.IntervalAdd), pd.Series, pd.Series) -def execute_timestamp_interval_add_series_series(op, left, right, **kwargs): - return left + right - - -@execute_node.register(ops.TimestampSub, timestamp_types, timedelta_types) -def execute_timestamp_sub_datetime_timedelta(op, left, right, **kwargs): - return pd.Timestamp(left) - pd.Timedelta(right) - - -@execute_node.register( - (ops.TimestampDiff, ops.TimestampSub), timestamp_types, pd.Series -) -def execute_timestamp_diff_sub_datetime_series(op, left, right, **kwargs): - return pd.Timestamp(left) - right - - -@execute_node.register(ops.TimestampSub, pd.Series, timedelta_types) -def execute_timestamp_sub_series_timedelta(op, left, right, **kwargs): - return left - pd.Timedelta(right) - - -@execute_node.register( - (ops.TimestampDiff, ops.TimestampSub, ops.IntervalSubtract), - pd.Series, - pd.Series, -) -def execute_timestamp_diff_sub_series_series(op, left, right, **kwargs): - return left - right - - -@execute_node.register(ops.TimestampDiff, timestamp_types, 
timestamp_types) -def execute_timestamp_diff_datetime_datetime(op, left, right, **kwargs): - return pd.Timestamp(left) - pd.Timestamp(right) - - -@execute_node.register(ops.TimestampDiff, pd.Series, timestamp_types) -def execute_timestamp_diff_series_datetime(op, left, right, **kwargs): - return left - pd.Timestamp(right) - - -@execute_node.register(ops.IntervalMultiply, pd.Series, numeric_types + (pd.Series,)) -@execute_node.register( - ops.IntervalFloorDivide, - (pd.Timedelta, pd.Series), - numeric_types + (pd.Series,), -) -def execute_interval_multiply_fdiv_series_numeric(op, left, right, **kwargs): - return op.op(left, right) - - -@execute_node.register(ops.TimestampFromUNIX, (pd.Series,) + integer_types) -def execute_timestamp_from_unix(op, data, **kwargs): - return pd.to_datetime(data, unit=op.unit.short) - - -@pre_execute.register(ops.TimestampNow) -@pre_execute.register(ops.TimestampNow, BaseBackend) -def pre_execute_timestamp_now(op, *args, **kwargs): - timecontext = kwargs.get("timecontext", None) - now = pd.Timestamp("now", tz="UTC").tz_localize(None) - return Scope({op: now}, timecontext) - - -@execute_node.register(ops.DayOfWeekIndex, (str, datetime.date)) -def execute_day_of_week_index_any(op, value, **kwargs): - return pd.Timestamp(value).dayofweek - - -@execute_node.register(ops.DayOfWeekIndex, pd.Series) -def execute_day_of_week_index_series(op, data, **kwargs): - return data.dt.dayofweek.astype(np.int16) - - -@execute_node.register(ops.DayOfWeekIndex, SeriesGroupBy) -def execute_day_of_week_index_series_group_by(op, data, **kwargs): - groupings = get_grouping(data.grouper.groupings) - return data.obj.dt.dayofweek.astype(np.int16).groupby(groupings, group_keys=False) - - -def day_name(obj: pd.core.indexes.accessors.DatetimeProperties | pd.Timestamp) -> str: - """Backwards compatible name-of-day getting function. 
- - Returns - ------- - str - The name of the day corresponding to `obj` - """ - try: - return obj.day_name() - except AttributeError: - return obj.weekday_name - - -@execute_node.register(ops.DayOfWeekName, (str, datetime.date)) -def execute_day_of_week_name_any(op, value, **kwargs): - return day_name(pd.Timestamp(value)) - - -@execute_node.register(ops.DayOfWeekName, pd.Series) -def execute_day_of_week_name_series(op, data, **kwargs): - return day_name(data.dt) - - -@execute_node.register(ops.DayOfWeekName, SeriesGroupBy) -def execute_day_of_week_name_series_group_by(op, data, **kwargs): - return day_name(data.obj.dt).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.DateSub, date_types, timedelta_types) -@execute_node.register(ops.DateSub, pd.Series, timedelta_types) -@execute_node.register((ops.DateDiff, ops.DateSub), pd.Series, pd.Series) -@execute_node.register(ops.DateDiff, date_types, date_types) -def execute_date_sub_diff(op, left, right, **kwargs): - return left - right - - -@execute_node.register((ops.DateDiff, ops.DateSub), date_types, pd.Series) -def execute_date_sub_diff_date_series(op, left, right, **kwargs): - return pd.Timestamp(left, unit="D") - right - - -@execute_node.register(ops.DateDiff, pd.Series, date_types) -def execute_date_sub_diff_series_date(op, left, right, **kwargs): - return left - pd.Timestamp(right, unit="D") - - -@execute_node.register(ops.DateAdd, pd.Series, timedelta_types) -@execute_node.register(ops.DateAdd, timedelta_types, pd.Series) -@execute_node.register(ops.DateAdd, pd.Series, pd.Series) -@execute_node.register(ops.DateAdd, date_types, timedelta_types) -@execute_node.register(ops.DateAdd, timedelta_types, date_types) -@execute_node.register(ops.DateAdd, date_types, pd.Series) -@execute_node.register(ops.DateAdd, pd.Series, date_types) -def execute_date_add(op, left, right, **kwargs): - return left + right diff --git a/ibis/backends/pandas/execution/timecontext.py b/ibis/backends/pandas/execution/timecontext.py deleted file mode 100644 index c9be8f75757fc..0000000000000 --- a/ibis/backends/pandas/execution/timecontext.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Implementation of compute_time_context for time context related operations. - -Time context of a node is computed at the beginning of execution phase. - -To use time context to load time series data: - -For operations like window, asof_join that adjust time context in execution, -implement ``compute_time_context`` to pass different time contexts to child -nodes. - -If ``pre_execute`` preloads any data, it should use timecontext to trim data -to be in the time range. - -``execute_node`` of a leaf node can use timecontext to trim data, or to pass -it as a filter in the database query. - -In some cases, data need to be trimmed in ``post_execute``. - -Note: In order to use the feature we implemented here, there must be a -column of Timestamp type, and named as 'time' in Table. And this 'time' -column should be preserved across the expression tree. If 'time' column is -dropped then execution will result in error. -See ``execute_database_table_client`` in ``generic.py``. -And we assume timecontext is passed in as a tuple (begin, end) where begin and -end are timestamp, or datetime string like "20100101". Time range is inclusive -(include both begin and end points). - -This is an optional feature. The result of executing an expression without time -context is conceptually the same as executing an expression with (-inf, inf) -time context. 
-""" -from __future__ import annotations - -from typing import TYPE_CHECKING - -import ibis.expr.operations as ops -from ibis.backends.base.df.timecontext import TimeContext, adjust_context -from ibis.backends.pandas.core import ( - compute_time_context, - get_node_arguments, - is_computable_input, -) - -if TYPE_CHECKING: - from ibis.backends.base import BaseBackend - from ibis.backends.base.df.scope import Scope - - -@compute_time_context.register(ops.AsOfJoin) -def compute_time_context_asof_join( - op: ops.AsOfJoin, - scope: Scope, - clients: list[BaseBackend], - timecontext: TimeContext | None = None, - **kwargs, -): - new_timecontexts = [ - timecontext for arg in get_node_arguments(op) if is_computable_input(arg) - ] - - if not timecontext: - return new_timecontexts - - # right table is the second node in children - new_timecontexts = [ - new_timecontexts[0], - adjust_context(op, scope, timecontext), - *new_timecontexts[2:], - ] - return new_timecontexts - - -@compute_time_context.register(ops.Window) -def compute_time_context_window( - op: ops.Window, - scope: Scope, - clients: list[BaseBackend], - timecontext: TimeContext | None = None, - **kwargs, -): - new_timecontexts = [ - timecontext for arg in get_node_arguments(op) if is_computable_input(arg) - ] - - if not timecontext: - return new_timecontexts - - result = adjust_context(op, scope, timecontext) - - new_timecontexts = [ - result for arg in get_node_arguments(op) if is_computable_input(arg) - ] - return new_timecontexts diff --git a/ibis/backends/pandas/execution/util.py b/ibis/backends/pandas/execution/util.py deleted file mode 100644 index 15b43c8832bde..0000000000000 --- a/ibis/backends/pandas/execution/util.py +++ /dev/null @@ -1,144 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pandas as pd - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.execution import constants - - -def get_grouping(grouper): - # this is such an annoying hack - assert isinstance(grouper, list) - if len(grouper) == 1: - return grouper[0] - return grouper - - -def get_join_suffix_for_op(op: ops.TableColumn, join_op: ops.Join): - (root_table,) = an.find_immediate_parent_tables(op) - left_root, right_root = an.find_immediate_parent_tables( - [join_op.left, join_op.right] - ) - return { - left_root: constants.LEFT_JOIN_SUFFIX, - right_root: constants.RIGHT_JOIN_SUFFIX, - }[root_table] - - -def compute_sort_key(key, data, timecontext, scope=None, **kwargs): - if key.shape.is_columnar(): - if key.name in data: - return key.name, None - else: - if scope is None: - scope = Scope() - scope = scope.merge_scopes( - Scope({t: data}, timecontext) - for t in an.find_immediate_parent_tables(key) - ) - new_column = execute(key, scope=scope, **kwargs) - name = ibis.util.guid() - new_column.name = name - return name, new_column - else: - raise NotImplementedError( - "Scalar sort keys are not yet supported in the pandas backend" - ) - - -def compute_sorted_frame(df, order_by, group_by=(), timecontext=None, **kwargs): - sort_keys = [] - ascending = [] - - for value in group_by: - sort_keys.append(value) - ascending.append(True) - for key in order_by: - sort_keys.append(key) - ascending.append(key.ascending) - - new_columns = {} - computed_sort_keys = [] - for key in sort_keys: - computed_sort_key, temporary_column = compute_sort_key( - key, df, timecontext, **kwargs - ) - 
computed_sort_keys.append(computed_sort_key) - - if temporary_column is not None: - new_columns[computed_sort_key] = temporary_column - - result = df.assign(**new_columns) - try: - result = result.sort_values( - computed_sort_keys, ascending=ascending, kind="mergesort" - ) - except TypeError: - result = result.sort_values(computed_sort_keys, ascending=ascending) - # TODO: we'll eventually need to return this frame with the temporary - # columns and drop them in the caller (maybe using post_execute?) - ngrouping_keys = len(group_by) - return ( - result, - computed_sort_keys[:ngrouping_keys], - computed_sort_keys[ngrouping_keys:], - ) - - -def coerce_to_output( - result: Any, node: ops.Node, index: pd.Index | None = None -) -> pd.Series | pd.DataFrame: - """Cast the result to either a Series or DataFrame. - - This method casts result of an execution to a Series or DataFrame, - depending on the type of the expression and shape of the result. - - Parameters - ---------- - result: Any - The result to cast - node: ibis.expr.operations.Node - The operation node associated with the result - index: pd.Index - Optional. If passed, scalar results will be broadcasted according - to the index. - - Returns - ------- - result: A Series or DataFrame - - Examples - -------- - For dataframe outputs, see ``ibis.util.coerce_to_dataframe``. - - >>> coerce_to_output(pd.Series(1), node) # quartodoc: +SKIP # doctest: +SKIP - 0 1 - Name: result, dtype: int64 - >>> coerce_to_output(1, node) # quartodoc: +SKIP # doctest: +SKIP - 0 1 - Name: result, dtype: int64 - >>> coerce_to_output(1, node, [1, 2, 3]) # quartodoc: +SKIP # doctest: +SKIP - 1 1 - 2 1 - 3 1 - Name: result, dtype: int64 - >>> coerce_to_output([1, 2, 3], node) # quartodoc: +SKIP # doctest: +SKIP - 0 [1, 2, 3] - Name: result, dtype: object - """ - if isinstance(result, pd.DataFrame): - rows = result.to_dict(orient="records") - return pd.Series(rows, name=node.name) - - # columnar result - if isinstance(result, pd.Series): - return result.rename(node.name) - - # Wrap `result` into a single-element Series. 
- return pd.Series([result], name=node.name) diff --git a/ibis/backends/pandas/execution/window.py b/ibis/backends/pandas/execution/window.py deleted file mode 100644 index 39475ecc2bb6c..0000000000000 --- a/ibis/backends/pandas/execution/window.py +++ /dev/null @@ -1,526 +0,0 @@ -"""Code for computing window functions with ibis and pandas.""" - -from __future__ import annotations - -import operator -from typing import TYPE_CHECKING, Any, Callable, NoReturn - -import numpy as np -import pandas as pd -import toolz -from multipledispatch import Dispatcher -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import ( - TimeContext, - construct_time_context_aware_series, - get_time_col, -) -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.core import ( - compute_time_context, - date_types, - execute, - integer_types, - simple_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution import util - -if TYPE_CHECKING: - from ibis.backends.pandas.aggcontext import AggregationContext - - -def _post_process_empty( - result: Any, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - # This is the post process of the no groupby nor orderby window - # `result` could be a Series, DataFrame, or a scalar. generated - # by `agg` method of class `Window`. For window without grouby or - # orderby, `agg` calls pands method directly. So if timecontext is - # present, we need to insert 'time' column into index for trimming the - # result. For cases when grouby or orderby is present, `agg` calls - # Ibis method `window_agg_built_in` and `window_agg_udf`, time - # context is already inserted there. - assert not order_by and not group_by - if isinstance(result, (pd.Series, pd.DataFrame)): - if timecontext: - result = construct_time_context_aware_series(result, parent) - return result - else: - # `result` is a scalar when a reduction operation is being - # applied over the window, since reduction operations are N->1 - # in this case we do not need to trim result by timecontext, - # just expand reduction result to be a Series with `index`. 
- index = parent.index - result = pd.Series([result]).repeat(len(index)) - result.index = index - return result - - -def _post_process_group_by( - series: pd.Series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - assert not order_by and group_by - return series - - -def _post_process_order_by( - series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - assert order_by and not group_by - indexed_parent = parent.set_index(order_by) - index = indexed_parent.index - - # get the names of the levels that will be in the result - series_index_names = frozenset(series.index.names) - - # get the levels common to series.index, in the order that they occur in - # the parent's index - reordered_levels = [name for name in index.names if name in series_index_names] - - if len(reordered_levels) > 1: - series = series.reorder_levels(reordered_levels) - - series = series.iloc[index.argsort(kind="mergesort")] - return series - - -def _post_process_group_by_order_by( - series: pd.Series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - indexed_parent = parent.set_index(group_by + order_by, append=True) - index = indexed_parent.index - - # get the names of the levels that will be in the result - series_index_names = frozenset(series.index.names) - - # get the levels common to series.index, in the order that they occur in - # the parent's index - reordered_levels = [name for name in index.names if name in series_index_names] - - if len(reordered_levels) > 1: - series = series.reorder_levels(reordered_levels) - return series - - -get_aggcontext = Dispatcher("get_aggcontext") - - -@get_aggcontext.register(object) -def get_aggcontext_default( - window, - *, - scope, - operand, - parent, - group_by, - order_by, - **kwargs, -) -> NoReturn: - raise NotImplementedError( - f"get_aggcontext is not implemented for {type(window).__name__}" - ) - - -@get_aggcontext.register(ops.WindowFrame) -def get_aggcontext_window( - frame, - *, - scope, - operand, - parent, - group_by, - order_by, - **kwargs, -) -> AggregationContext: - # no order by or group by: default summarization aggcontext - # - # if we're reducing and we have an order by expression then we need to - # expand or roll. 
- # - # otherwise we're transforming - output_type = operand.dtype - - if not group_by and not order_by: - aggcontext = agg_ctx.Summarize(parent=parent, output_type=output_type) - elif group_by and not order_by: - # groupby transform (window with a partition by clause in SQL parlance) - aggcontext = agg_ctx.Transform( - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - elif frame.start is not None: - if isinstance(frame, ops.RowsWindowFrame): - max_lookback = frame.max_lookback - else: - max_lookback = None - - aggcontext = agg_ctx.Moving( - frame.start, - # FIXME(kszucs): I don't think that we have a proper max_lookback test - # case because passing None here is not braking anything - max_lookback=max_lookback, - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - else: - # expanding window - aggcontext = agg_ctx.Cumulative( - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - - return aggcontext - - -def trim_window_result(data: pd.Series | pd.DataFrame, timecontext: TimeContext | None): - """Trim data within time range defined by timecontext. - - This is a util function used in ``execute_window_op``, where time - context might be adjusted for calculation. Data must be trimmed - within the original time context before return. - `data` is a pd.Series with Multiindex for most cases, for multi - column udf result, `data` could be a pd.DataFrame - - Params - ------ - data: pd.Series or pd.DataFrame - timecontext: Optional[TimeContext] - - Returns - ------- - a trimmed pd.Series or or pd.DataFrame with the same Multiindex - as data's - """ - # noop if timecontext is None - if not timecontext: - return data - assert isinstance( - data, (pd.Series, pd.DataFrame) - ), "window computed columns is not a pd.Series nor a pd.DataFrame" - - # reset multiindex, convert Series into a DataFrame - df = data.reset_index() - - # Filter the data, here we preserve the time index so that when user is - # computing a single column, the computation and the relevant time - # indexes are returned. 
- time_col = get_time_col() - if time_col not in df: - return data - - subset = df.loc[df[time_col].between(*timecontext)] - - # Get columns to set for index - if isinstance(data, pd.Series): - # if Series doesn't contain a name, reset_index will assign - # '0' as the column name for the column of value - name = data.name if data.name else 0 - index_columns = list(subset.columns.difference([name])) - else: - name = data.columns - index_columns = list(subset.columns.difference(name)) - - # set the correct index for return Series / DataFrame - indexed_subset = subset.set_index(index_columns) - return indexed_subset[name] - - -@execute_node.register(ops.WindowFunction, [pd.Series]) -def execute_window_op( - op, - *data, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - aggcontext=None, - clients=None, - **kwargs, -): - func, frame = op.func, op.frame - - if frame.how == "range" and any( - not col.dtype.is_temporal() for col in frame.order_by - ): - raise NotImplementedError( - "The pandas backend only implements range windows with temporal " - "ordering keys" - ) - - # pre execute "manually" here because otherwise we wouldn't pickup - # relevant scope changes from the child operand since we're managing - # execution of that by hand - - adjusted_timecontext = None - if timecontext: - arg_timecontexts = compute_time_context( - op, timecontext=timecontext, clients=clients, scope=scope - ) - # timecontext is the original time context required by parent node - # of this Window, while adjusted_timecontext is the adjusted context - # of this Window, since we are doing a manual execution here, use - # adjusted_timecontext in later execution phases - adjusted_timecontext = arg_timecontexts[0] - - pre_executed_scope = pre_execute( - func, - *clients, - scope=scope, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - **kwargs, - ) - if scope is None: - scope = pre_executed_scope - else: - scope = scope.merge_scope(pre_executed_scope) - - root_table = an.find_first_base_table(op) - data = execute( - root_table, - scope=scope, - timecontext=adjusted_timecontext, - clients=clients, - aggcontext=aggcontext, - **kwargs, - ) - - grouping_keys = [ - key.name - if isinstance(key, ops.TableColumn) - else execute( - key, - scope=scope, - clients=clients, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - **kwargs, - ) - for key in frame.group_by - ] - - if not frame.order_by: - ordering_keys = [] - - post_process: Callable[ - [Any, pd.DataFrame, list[str], list[str], TimeContext | None], - pd.Series, - ] - if frame.group_by: - if frame.order_by: - sorted_df, grouping_keys, ordering_keys = util.compute_sorted_frame( - data, - frame.order_by, - group_by=frame.group_by, - timecontext=adjusted_timecontext, - **kwargs, - ) - source = sorted_df.groupby(grouping_keys, sort=True, group_keys=False) - post_process = _post_process_group_by_order_by - else: - source = data.groupby(grouping_keys, sort=False, group_keys=False) - post_process = _post_process_group_by - elif frame.order_by: - source, grouping_keys, ordering_keys = util.compute_sorted_frame( - data, frame.order_by, timecontext=adjusted_timecontext, **kwargs - ) - post_process = _post_process_order_by - else: - source = data - post_process = _post_process_empty - - # Here groupby object should be add to the corresponding node in scope - # for execution, data will be overwrite to a groupby object, so we - # force an update regardless of time context - new_scope = scope.merge_scopes( - [ - Scope({t: source}, 
adjusted_timecontext) - for t in an.find_immediate_parent_tables(func) - ], - overwrite=True, - ) - - aggcontext = get_aggcontext( - frame, - scope=scope, - operand=func, - parent=source, - group_by=grouping_keys, - order_by=ordering_keys, - **kwargs, - ) - result = execute( - func, - scope=new_scope, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ) - result = post_process( - result, - data, - ordering_keys, - grouping_keys, - adjusted_timecontext, - ) - assert len(data) == len( - result - ), "input data source and computed column do not have the same length" - - # trim data to original time context - result = trim_window_result(result, timecontext) - return result - - -def post_lead_lag(result, default): - if not pd.isnull(default): - return result.fillna(default) - return result - - -@execute_node.register( - (ops.Lead, ops.Lag), - (pd.Series, SeriesGroupBy), - integer_types + (type(None),), - simple_types + (type(None),), -) -def execute_series_lead_lag(op, data, offset, default, **kwargs): - func = toolz.identity if isinstance(op, ops.Lag) else operator.neg - result = data.shift(func(1 if offset is None else offset)) - return post_lead_lag(result, default) - - -@execute_node.register( - (ops.Lead, ops.Lag), - (pd.Series, SeriesGroupBy), - timedelta_types, - date_types + timestamp_types + (str, type(None)), -) -def execute_series_lead_lag_timedelta( - op, data, offset, default, aggcontext=None, **kwargs -): - """Shift a column relative to another one in units of time instead of rows.""" - # lagging adds time (delayed), leading subtracts time (moved up) - func = operator.add if isinstance(op, ops.Lag) else operator.sub - group_by = aggcontext.group_by - order_by = aggcontext.order_by - - # get the parent object from which `data` originated - parent = aggcontext.parent - - # get the DataFrame from the parent object, handling the DataFrameGroupBy - # case - parent_df = getattr(parent, "obj", parent) - - # index our parent df by grouping and ordering keys - indexed_original_df = parent_df.set_index(group_by + order_by) - - # perform the time shift - adjusted_parent_df = parent_df.assign( - **{k: func(parent_df[k], offset) for k in order_by} - ) - - # index the parent *after* adjustment - adjusted_indexed_parent = adjusted_parent_df.set_index(group_by + order_by) - - # get the column we care about - result = adjusted_indexed_parent[getattr(data, "obj", data).name] - - # reindex the shifted data by the original frame's index - result = result.reindex(indexed_original_df.index) - - # add a default if necessary - return post_lead_lag(result, default) - - -@execute_node.register(ops.FirstValue, pd.Series) -def execute_series_first_value(op, data, **kwargs): - return data.iloc[np.repeat(0, len(data))] - - -def _getter(x: pd.Series | np.ndarray, idx: int): - return getattr(x, "values", x)[idx] - - -@execute_node.register(ops.FirstValue, SeriesGroupBy) -def execute_series_group_by_first_value(op, data, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: _getter(x, 0)) - - -@execute_node.register(ops.LastValue, pd.Series) -def execute_series_last_value(op, data, **kwargs): - return data.iloc[np.repeat(-1, len(data))] - - -@execute_node.register(ops.LastValue, SeriesGroupBy) -def execute_series_group_by_last_value(op, data, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: _getter(x, -1)) - - -@execute_node.register(ops.MinRank) -def execute_series_min_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - 
df = aggcontext.parent - data = df[key] - return data.rank(method="min", ascending=True).astype("int64") - 1 - - -@execute_node.register(ops.DenseRank) -def execute_series_dense_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - return data.rank(method="dense", ascending=True).astype("int64") - 1 - - -@execute_node.register(ops.PercentRank) -def execute_series_group_by_percent_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - - result = data.rank(method="min", ascending=True) - 1 - - if isinstance(data, SeriesGroupBy): - nrows = data.transform("count") - else: - nrows = len(data) - - result /= nrows - 1 - return result - - -@execute_node.register(ops.CumeDist) -def execute_series_group_by_cume_dist(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - return data.rank(method="min", ascending=True, pct=True) diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py new file mode 100644 index 0000000000000..f9dd69a3c0271 --- /dev/null +++ b/ibis/backends/pandas/executor.py @@ -0,0 +1,761 @@ +from __future__ import annotations + +import operator +from functools import reduce + +import numpy as np +import pandas as pd + +import ibis.expr.operations as ops +from ibis.backends.pandas.convert import PandasConverter +from ibis.backends.pandas.helpers import ( + GroupedFrame, + RangeFrame, + RowsFrame, + UngroupedFrame, + agg, + asframe, + asseries, + columnwise, + elementwise, + rowwise, + serieswise, +) +from ibis.backends.pandas.kernels import pick_kernel +from ibis.backends.pandas.rewrites import ( + PandasAggregate, + PandasAsofJoin, + PandasJoin, + PandasLimit, + PandasRename, + PandasScalarSubquery, + plan, +) +from ibis.common.dispatch import Dispatched +from ibis.common.exceptions import OperationNotDefinedError, UnboundExpressionError +from ibis.formats.pandas import PandasData +from ibis.util import gen_name + +# ruff: noqa: F811 + + +_reduction_operations = { + ops.Min: lambda x: x.min(), + ops.Max: lambda x: x.max(), + ops.Sum: lambda x: x.sum(), + ops.Mean: lambda x: x.mean(), + ops.Count: lambda x: x.count(), + ops.Mode: lambda x: x.mode().iat[0], + ops.Any: lambda x: x.any(), + ops.All: lambda x: x.all(), + ops.Median: lambda x: x.median(), + ops.ApproxMedian: lambda x: x.median(), + ops.BitAnd: lambda x: np.bitwise_and.reduce(x.values), + ops.BitOr: lambda x: np.bitwise_or.reduce(x.values), + ops.BitXor: lambda x: np.bitwise_xor.reduce(x.values), + ops.Last: lambda x: x.iat[-1], + ops.First: lambda x: x.iat[0], + ops.CountDistinct: lambda x: x.nunique(), + ops.ApproxCountDistinct: lambda x: x.nunique(), + ops.ArrayCollect: lambda x: x.tolist(), +} + + +class Executor(Dispatched): + @classmethod + def visit(cls, op: ops.Node, **kwargs): + raise OperationNotDefinedError( + f"Operation {op!r} is not implemented for the pandas backend" + ) + + @classmethod + def visit(cls, op: ops.Literal, value, dtype): + if dtype.is_interval(): + value = pd.Timedelta(value, dtype.unit.short) + elif dtype.is_array(): + value = np.array(value) + elif dtype.is_date(): + value = pd.Timestamp(value, tz="UTC").tz_localize(None) + return value + + @classmethod + def visit(cls, op: ops.Field, rel, name): + return rel[name] + + @classmethod + def visit(cls, op: ops.Alias, arg, name): + try: + return arg.rename(name) + except AttributeError: + return arg + + @classmethod + def visit(cls, op: ops.SortKey, expr, 
ascending): + return expr + + @classmethod + def visit(cls, op: ops.Cast, arg, to): + if isinstance(arg, pd.Series): + return PandasConverter.convert_column(arg, to) + else: + return PandasConverter.convert_scalar(arg, to) + + @classmethod + def visit(cls, op: ops.TypeOf, arg): + raise OperationNotDefinedError("TypeOf is not implemented") + + @classmethod + def visit(cls, op: ops.RandomScalar): + raise OperationNotDefinedError("RandomScalar is not implemented") + + @classmethod + def visit(cls, op: ops.Greatest, arg): + return columnwise(lambda df: df.max(axis=1), arg) + + @classmethod + def visit(cls, op: ops.Least, arg): + return columnwise(lambda df: df.min(axis=1), arg) + + @classmethod + def visit(cls, op: ops.Coalesce, arg): + return columnwise(lambda df: df.bfill(axis=1).iloc[:, 0], arg) + + @classmethod + def visit(cls, op: ops.Value, **operands): + return pick_kernel(op, operands) + + @classmethod + def visit(cls, op: ops.IsNan, arg): + try: + return np.isnan(arg) + except (TypeError, ValueError): + # if `arg` contains `None` np.isnan will complain + # so we take advantage of NaN not equaling itself + # to do the correct thing + return arg != arg + + @classmethod + def visit(cls, op: ops.SearchedCase, cases, results, default): + cases, _ = asframe(cases, concat=False) + results, _ = asframe(results, concat=False) + out = np.select(cases, results, default) + return pd.Series(out) + + @classmethod + def visit(cls, op: ops.SimpleCase, base, cases, results, default): + if isinstance(default, pd.Series): + raise NotImplementedError( + "SimpleCase with a columnar shaped default value is not implemented" + ) + cases = tuple(base == case for case in cases) + cases, _ = asframe(cases, concat=False) + results, _ = asframe(results, concat=False) + out = np.select(cases, results, default) + return pd.Series(out) + + @classmethod + def visit(cls, op: ops.TimestampTruncate | ops.DateTruncate, arg, unit): + # TODO(kszucs): should use serieswise() + unit = {"m": "Min", "ms": "L"}.get(unit.short, unit.short) + try: + return arg.dt.floor(unit) + except ValueError: + return arg.dt.to_period(unit).dt.to_timestamp() + + @classmethod + def visit(cls, op: ops.IntervalFromInteger, unit, **kwargs): + if unit.short in {"Y", "Q", "M", "W"}: + return elementwise(lambda v: pd.DateOffset(**{unit.plural: v}), kwargs) + else: + return serieswise( + lambda arg: arg.astype(f"timedelta64[{unit.short}]"), kwargs + ) + + @classmethod + def visit(cls, op: ops.BetweenTime, arg, lower_bound, upper_bound): + idx = pd.DatetimeIndex(arg) + if idx.tz is not None: + idx = idx.tz_convert(None) # make naive because times are naive + indexer = idx.indexer_between_time(lower_bound, upper_bound) + result = np.zeros(len(arg), dtype=np.bool_) + result[indexer] = True + return pd.Series(result) + + @classmethod + def visit(cls, op: ops.FindInSet, needle, values): + (needle, *haystack), _ = asframe((needle, *values), concat=False) + condlist = [needle == col for col in haystack] + choicelist = [i for i, _ in enumerate(haystack)] + result = np.select(condlist, choicelist, default=-1) + return pd.Series(result, name=op.name) + + @classmethod + def visit(cls, op: ops.Array, exprs): + return rowwise(lambda row: np.array(row, dtype=object), exprs) + + @classmethod + def visit(cls, op: ops.ArrayConcat, arg): + return rowwise(lambda row: np.concatenate(row.values), arg) + + @classmethod + def visit(cls, op: ops.Unnest, arg): + arg = asseries(arg) + mask = arg.map(lambda v: bool(len(v)), na_action="ignore") + return arg[mask].explode() + + 
@classmethod + def visit( + cls, op: ops.ElementWiseVectorizedUDF, func, func_args, input_type, return_type + ): + """Execute an elementwise UDF.""" + + res = func(*func_args) + if isinstance(res, pd.DataFrame): + # it is important otherwise it is going to fill up the memory + res = res.apply(lambda row: row.to_dict(), axis=1) + + return res + + ############################# Reductions ################################## + + @classmethod + def visit(cls, op: ops.Reduction, arg, where): + func = _reduction_operations[type(op)] + return agg(func, arg, where) + + @classmethod + def visit(cls, op: ops.CountStar, arg, where): + def agg(df): + if where is None: + return len(df) + else: + return df[where.name].sum() + + return agg + + @classmethod + def visit(cls, op: ops.CountDistinctStar, arg, where): + def agg(df): + if where is None: + return df.nunique() + else: + return df[where.name].nunique() + + return agg + + @classmethod + def visit(cls, op: ops.Arbitrary, arg, where, how): + if how == "first": + return agg(lambda x: x.iat[0], arg, where) + elif how == "last": + return agg(lambda x: x.iat[-1], arg, where) + else: + raise OperationNotDefinedError(f"Arbitrary {how!r} is not supported") + + @classmethod + def visit(cls, op: ops.ArgMin | ops.ArgMax, arg, key, where): + func = operator.methodcaller(op.__class__.__name__.lower()) + + if where is None: + + def agg(df): + indices = func(df[key.name]) + return df[arg.name].iloc[indices] + else: + + def agg(df): + mask = df[where.name] + filtered = df[mask] + indices = func(filtered[key.name]) + return filtered[arg.name].iloc[indices] + + return agg + + @classmethod + def visit(cls, op: ops.Variance, arg, where, how): + ddof = {"pop": 0, "sample": 1}[how] + return agg(lambda x: x.var(ddof=ddof), arg, where) + + @classmethod + def visit(cls, op: ops.StandardDev, arg, where, how): + ddof = {"pop": 0, "sample": 1}[how] + return agg(lambda x: x.std(ddof=ddof), arg, where) + + @classmethod + def visit(cls, op: ops.Correlation, left, right, where, how): + if where is None: + + def agg(df): + return df[left.name].corr(df[right.name]) + else: + + def agg(df): + mask = df[where.name] + lhs = df[left.name][mask] + rhs = df[right.name][mask] + return lhs.corr(rhs) + + return agg + + @classmethod + def visit(cls, op: ops.Covariance, left, right, where, how): + ddof = {"pop": 0, "sample": 1}[how] + if where is None: + + def agg(df): + return df[left.name].cov(df[right.name], ddof=ddof) + else: + + def agg(df): + mask = df[where.name] + lhs = df[left.name][mask] + rhs = df[right.name][mask] + return lhs.cov(rhs, ddof=ddof) + + return agg + + @classmethod + def visit(cls, op: ops.GroupConcat, arg, sep, where): + if where is None: + + def agg(df): + return sep.join(df[arg.name].astype(str)) + else: + + def agg(df): + mask = df[where.name] + group = df[arg.name][mask] + if group.empty: + return pd.NA + return sep.join(group) + + return agg + + @classmethod + def visit(cls, op: ops.Quantile, arg, quantile, where): + return agg(lambda x: x.quantile(quantile), arg, where) + + @classmethod + def visit(cls, op: ops.MultiQuantile, arg, quantile, where): + return agg(lambda x: list(x.quantile(quantile)), arg, where) + + @classmethod + def visit( + cls, op: ops.ReductionVectorizedUDF, func, func_args, input_type, return_type + ): + def agg(df): + args = [df[col.name] for col in func_args] + return func(*args) + + return agg + + ############################# Analytic #################################### + + @classmethod + def visit(cls, op: ops.RowNumber): + def agg(df, 
order_keys): + return pd.Series(np.arange(len(df)), index=df.index) + + return agg + + @classmethod + def visit(cls, op: ops.Lag | ops.Lead, arg, offset, default): + if isinstance(op, ops.Lag): + sign = lambda x: x + else: + sign = lambda x: -x + + if op.offset is not None and op.offset.dtype.is_interval(): + + def agg(df, order_keys): + df = df.set_index(order_keys) + col = df[arg.name].shift(freq=sign(offset)) + return col.reindex(df.index, fill_value=default) + else: + offset = 1 if offset is None else offset + + def agg(df, order_keys): + return df[arg.name].shift(sign(offset), fill_value=default) + + return agg + + @classmethod + def visit(cls, op: ops.MinRank | ops.DenseRank): + method = "dense" if isinstance(op, ops.DenseRank) else "min" + + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method=method).astype("int64") - 1 + + return agg + + @classmethod + def visit(cls, op: ops.PercentRank): + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method="min").sub(1).div(len(df) - 1) + + return agg + + @classmethod + def visit(cls, op: ops.CumeDist): + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method="average", pct=True) + + return agg + + @classmethod + def visit(cls, op: ops.FirstValue | ops.LastValue, arg): + i = 0 if isinstance(op, ops.FirstValue) else -1 + + def agg(df, order_keys): + return df[arg.name].iat[i] + + return agg + + @classmethod + def visit( + cls, op: ops.AnalyticVectorizedUDF, func, func_args, input_type, return_type + ): + def agg(df, order_keys): + args = [df[col.name] for col in func_args] + return func(*args) + + return agg + + ############################ Window functions ############################# + + @classmethod + def visit(cls, op: ops.WindowBoundary, value, preceding): + return value + + @classmethod + def visit( + cls, op: ops.WindowFrame, table, start, end, group_by, order_by, **kwargs + ): + if start is not None: + start = asseries(start, len(table)) + if op.start.preceding: + start = -start + if end is not None: + end = asseries(end, len(table)) + if op.end.preceding: + end = -end + + table = table.assign(__start__=start, __end__=end) + + # TODO(kszucs): order by ibis.random() is not supported because it is + # excluded from the group by keys due to its scalar shape + group_keys = [group.name for group in op.group_by] + order_keys = [key.name for key in op.order_by if key.shape.is_columnar()] + ascending = [key.ascending for key in op.order_by if key.shape.is_columnar()] + + if order_by: + table = table.sort_values(order_keys, ascending=ascending, kind="mergesort") + + if group_by: + frame = GroupedFrame(df=table, group_keys=group_keys) + else: + frame = UngroupedFrame(df=table) + + if start is None and end is None: + return frame + elif op.how == "rows": + return RowsFrame(parent=frame) + elif op.how == "range": + if len(order_keys) != 1: + raise NotImplementedError( + "Only single column order by is supported for range window frames" + ) + return 
RangeFrame(parent=frame, order_key=order_keys[0]) + else: + raise NotImplementedError(f"Unsupported window frame type: {op.how}") + + @classmethod + def visit(cls, op: ops.WindowFunction, func, frame): + if isinstance(op.func, ops.Analytic): + order_keys = [key.name for key in op.frame.order_by] + return frame.apply_analytic(func, order_keys=order_keys) + else: + return frame.apply_reduction(func) + + ############################ Relational ################################### + + @classmethod + def visit(cls, op: ops.DatabaseTable, name, schema, source, namespace): + try: + return source.dictionary[name] + except KeyError: + raise UnboundExpressionError( + f"{name} is not a table in the {source.name!r} backend, you " + "probably tried to execute an expression without a data source" + ) + + @classmethod + def visit(cls, op: ops.InMemoryTable, name, schema, data): + return data.to_frame() + + @classmethod + def visit(cls, op: ops.DummyTable, values): + df, _ = asframe(values) + return df + + @classmethod + def visit(cls, op: ops.SelfReference | ops.JoinTable, parent, **kwargs): + return parent + + @classmethod + def visit(cls, op: PandasRename, parent, mapping): + return parent.rename(columns=mapping) + + @classmethod + def visit(cls, op: PandasLimit, parent, n, offset): + n = n.iat[0, 0] + offset = offset.iat[0, 0] + if n is None: + return parent.iloc[offset:] + else: + return parent.iloc[offset : offset + n] + + @classmethod + def visit(cls, op: ops.Sample, parent, fraction, method, seed): + return parent.sample(frac=fraction, random_state=seed) + + @classmethod + def visit(cls, op: ops.Project, parent, values): + df, all_scalars = asframe(values) + if all_scalars and len(parent) != len(df): + df = pd.concat([df] * len(parent)) + return df + + @classmethod + def visit(cls, op: ops.Filter, parent, predicates): + if predicates: + pred = reduce(operator.and_, predicates) + if len(pred) != len(parent): + raise RuntimeError( + "Selection predicate length does not match underlying table" + ) + parent = parent.loc[pred].reset_index(drop=True) + return parent + + @classmethod + def visit(cls, op: ops.Sort, parent, keys): + # 1. add sort key columns to the dataframe if they are not already present + # 2. sort the dataframe using those columns + # 3. 
drop the sort key columns + ascending = [key.ascending for key in op.keys] + newcols = {gen_name("sort_key"): col for col in keys} + names = list(newcols.keys()) + df = parent.assign(**newcols) + df = df.sort_values(by=names, ascending=ascending, ignore_index=True) + return df.drop(names, axis=1) + + @classmethod + def visit(cls, op: PandasAggregate, parent, groups, metrics): + if groups: + parent = parent.groupby([col.name for col in groups.values()]) + metrics = {k: parent.apply(v) for k, v in metrics.items()} + result = pd.concat(metrics, axis=1).reset_index() + renames = {v.name: k for k, v in op.groups.items()} + return result.rename(columns=renames) + else: + results = {k: v(parent) for k, v in metrics.items()} + combined, _ = asframe(results) + return combined + + @classmethod + def visit(cls, op: PandasJoin, how, left, right, left_on, right_on): + # broadcast predicates if they are scalar values + left_size = len(left) + left_on = [asseries(v, left_size) for v in left_on] + right_size = len(right) + right_on = [asseries(v, right_size) for v in right_on] + + if how == "cross": + assert not left_on and not right_on + return pd.merge(left, right, how="cross") + elif how == "anti": + df = pd.merge( + left, + right, + how="outer", + left_on=left_on, + right_on=right_on, + indicator=True, + ) + df = df[df["_merge"] == "left_only"] + return df.drop(columns=["_merge"]) + elif how == "semi": + mask = asseries(True, left_size) + for left_pred, right_pred in zip(left_on, right_on): + mask = mask & left_pred.isin(right_pred) + return left[mask] + else: + df = left.merge(right, how=how, left_on=left_on, right_on=right_on) + return df.drop(columns=[f"key_{i}" for i in range(len(left_on))]) + + @classmethod + def visit( + cls, + op: PandasAsofJoin, + how, + left, + right, + left_on, + right_on, + left_by, + right_by, + operator, + ): + # broadcast predicates if they are scalar values + left_size = len(left) + right_size = len(right) + left_on = [asseries(v, left_size) for v in left_on] + left_by = [asseries(v, left_size) for v in left_by] + right_on = [asseries(v, right_size) for v in right_on] + right_by = [asseries(v, right_size) for v in right_by] + + # merge_asof only works with column names not with series + left_on = {gen_name("left"): s for s in left_on} + left_by = {gen_name("left"): s for s in left_by} + right_on = {gen_name("right"): s for s in right_on} + right_by = {gen_name("right"): s for s in right_by} + + left = left.assign(**left_on, **left_by) + right = right.assign(**right_on, **right_by) + + # construct the appropriate flags for merge_asof + if operator == ops.LessEqual: + direction = "forward" + allow_exact_matches = True + elif operator == ops.GreaterEqual: + direction = "backward" + allow_exact_matches = True + elif operator == ops.Less: + direction = "forward" + allow_exact_matches = False + elif operator == ops.Greater: + direction = "backward" + allow_exact_matches = False + elif operator == ops.Equals: + direction = "nearest" + allow_exact_matches = True + else: + raise NotImplementedError( + f"Operator {operator} not supported for asof join" + ) + + # merge_asof requires the left side to be sorted by the join keys + left = left.sort_values(by=list(left_on.keys())) + df = pd.merge_asof( + left, + right, + left_on=list(left_on.keys()), + right_on=list(right_on.keys()), + left_by=list(left_by.keys()) or None, + right_by=list(right_by.keys()) or None, + direction=direction, + allow_exact_matches=allow_exact_matches, + ) + return df + + @classmethod + def visit(cls, op: 
ops.Union, left, right, distinct): + result = pd.concat([left, right], axis=0) + return result.drop_duplicates() if distinct else result + + @classmethod + def visit(cls, op: ops.Intersection, left, right, distinct): + if not distinct: + raise NotImplementedError( + "`distinct=False` is not supported by the pandas backend" + ) + return left.merge(right, on=list(left.columns), how="inner") + + @classmethod + def visit(cls, op: ops.Difference, left, right, distinct): + if not distinct: + raise NotImplementedError( + "`distinct=False` is not supported by the pandas backend" + ) + merged = left.merge(right, on=list(left.columns), how="outer", indicator=True) + result = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) + return result + + @classmethod + def visit(cls, op: ops.Distinct, parent): + return parent.drop_duplicates() + + @classmethod + def visit(cls, op: ops.DropNa, parent, how, subset): + if op.subset is not None: + subset = [col.name for col in op.subset] + else: + subset = None + return parent.dropna(how=how, subset=subset) + + @classmethod + def visit(cls, op: ops.FillNa, parent, replacements): + return parent.fillna(replacements) + + @classmethod + def visit(cls, op: ops.InValues, value, options): + if isinstance(value, pd.Series): + return value.isin(options) + else: + return value in options + + @classmethod + def visit(cls, op: ops.InSubquery, rel, needle): + first_column = rel.iloc[:, 0] + if isinstance(needle, pd.Series): + return needle.isin(first_column) + else: + return needle in first_column + + @classmethod + def visit(cls, op: PandasScalarSubquery, rel): + return rel.iat[0, 0] + + @classmethod + def execute(cls, node, backend, params): + def fn(node, _, **kwargs): + return cls.visit(node, **kwargs) + + original = node + node = node.to_expr().as_table().op() + node = plan(node, backend=backend, params=params) + df = node.map_clear(fn) + + # TODO(kszucs): add a flag to disable this conversion because it can be + # expensive for columns with object dtype + df = PandasData.convert_table(df, node.schema) + if isinstance(original, ops.Value): + if original.shape.is_scalar(): + return df.iloc[0, 0] + elif original.shape.is_columnar(): + return df.iloc[:, 0] + else: + raise TypeError(f"Unexpected shape: {original.shape}") + else: + return df diff --git a/ibis/backends/pandas/helpers.py b/ibis/backends/pandas/helpers.py new file mode 100644 index 0000000000000..d8bc9efd54eb5 --- /dev/null +++ b/ibis/backends/pandas/helpers.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import itertools +from typing import Callable + +import numpy as np +import pandas as pd + +from ibis.util import gen_name + + +def asseries(value, size=1): + """Ensure that value is a pandas Series object, broadcast if necessary.""" + if isinstance(value, pd.Series): + return value + elif isinstance(value, (list, np.ndarray)): + return pd.Series(itertools.repeat(np.array(value), size)) + else: + return pd.Series(np.repeat(value, size)) + + +def asframe(values: dict | tuple, concat=True): + """Construct a DataFrame from a dict or tuple of Series objects.""" + if isinstance(values, dict): + names, values = zip(*values.items()) + elif isinstance(values, tuple): + names = [f"_{i}" for i in range(len(values))] + else: + raise TypeError(f"values must be a dict, or tuple; got {type(values)}") + + size = 1 + all_scalars = True + for v in values: + if isinstance(v, pd.Series): + size = len(v) + all_scalars = False + break + + columns = [asseries(v, size) for v in values] + if concat: + df = 
pd.concat(columns, axis=1, keys=names).reset_index(drop=True) + return df, all_scalars + else: + return columns, all_scalars + + +def generic(func: Callable, operands): + return func(*operands.values()) + + +def rowwise(func: Callable, operands): + # dealing with a collection of series objects + df, all_scalars = asframe(operands) + result = df.apply(func, axis=1) # , **kwargs) + return result.iat[0] if all_scalars else result + + +def columnwise(func: Callable, operands): + df, all_scalars = asframe(operands) + result = func(df) + return result.iat[0] if all_scalars else result + + +def serieswise(func, operands): + (key, value), *rest = operands.items() + if isinstance(value, pd.Series): + # dealing with a single series object + return func(**operands) + else: + # dealing with a single scalar object + value = pd.Series([value]) + operands = {key: value, **dict(rest)} + return func(**operands).iat[0] + + +def elementwise(func, operands): + value = operands.pop(next(iter(operands))) + if isinstance(value, pd.Series): + # dealing with a single series object + if operands: + return value.apply(func, **operands) + else: + return value.map(func, na_action="ignore") + else: + # dealing with a single scalar object + return func(value, **operands) + + +def agg(func, arg_column, where_column): + if where_column is None: + + def applier(df): + return func(df[arg_column.name]) + else: + + def applier(df): + mask = df[where_column.name] + col = df[arg_column.name][mask] + return func(col) + + return applier + + +class UngroupedFrame: + def __init__(self, df): + self.df = df + + def groups(self): + yield self.df + + def apply_reduction(self, func, **kwargs): + result = func(self.df, **kwargs) + data = [result] * len(self.df) + return pd.Series(data, index=self.df.index) + + def apply_analytic(self, func, **kwargs): + return func(self.df, **kwargs) + + +class GroupedFrame: + def __init__(self, df, group_keys): + self.df = df + self.group_keys = group_keys + self.groupby = df.groupby(group_keys, as_index=True) + + def groups(self): + for _, df in self.groupby: + yield df + + def apply_analytic(self, func, **kwargs): + results = [func(df, **kwargs) for df in self.groups()] + return pd.concat(results) + + def apply_reduction(self, func, **kwargs): + name = gen_name("result") + result = self.groupby.apply(func, **kwargs).rename(name) + df = self.df.merge(result, left_on=self.group_keys, right_index=True) + return df[name] + + +class RowsFrame: + def __init__(self, parent): + self.parent = parent + + @staticmethod + def adjust(length, index, start_offset, end_offset): + if start_offset is None: + start_index = 0 + else: + start_index = index + start_offset + if start_index < 0: + start_index = 0 + elif start_index > length: + start_index = length + + if end_offset is None: + end_index = length + else: + end_index = index + end_offset + 1 + if end_index < 0: + end_index = 0 + elif end_index > length: + end_index = length + + return (start_index, end_index) + + def apply_analytic(self, func, **kwargs): + return self.parent.apply_analytic(func, **kwargs) + + def apply_reduction(self, func, **kwargs): + results = {} + for df in self.parent.groups(): + for i, (ix, row) in enumerate(df.iterrows()): + # TODO(kszucs): use unique column names for _start, _end + start, end = row["__start__"], row["__end__"] + start_index, end_index = self.adjust(len(df), i, start, end) + subdf = df.iloc[start_index:end_index] + results[ix] = func(subdf, **kwargs) + + return pd.Series(results) + + +class RangeFrame: + def 
__init__(self, parent, order_key): + self.parent = parent + self.order_key = order_key + + @staticmethod + def predicate(col, i, start, end): + value = col.iat[i] + if start is None: + return col <= value + end + elif end is None: + return col >= value + start + else: + return (col >= value + start) & (col <= value + end) + + def apply_analytic(self, func, **kwargs): + return self.parent.apply_analytic(func, **kwargs) + + def apply_reduction(self, func, **kwargs): + results = {} + for df in self.parent.groups(): + for i, (ix, row) in enumerate(df.iterrows()): + start, end = row["__start__"], row["__end__"] + column = df[self.order_key] + predicate = self.predicate(column, i, start, end) + subdf = df[predicate] + results[ix] = func(subdf, **kwargs) + + return pd.Series(results) diff --git a/ibis/backends/pandas/kernels.py b/ibis/backends/pandas/kernels.py new file mode 100644 index 0000000000000..1e28095c1ee2c --- /dev/null +++ b/ibis/backends/pandas/kernels.py @@ -0,0 +1,513 @@ +from __future__ import annotations + +import decimal +import json +import math +import operator + +try: + import regex as re +except ImportError: + import re +from functools import reduce +from urllib.parse import parse_qs, urlsplit + +import numpy as np +import pandas as pd +import toolz + +import ibis.expr.operations as ops +from ibis.backends.pandas.helpers import ( + columnwise, + elementwise, + generic, + rowwise, + serieswise, +) +from ibis.common.exceptions import OperationNotDefinedError +from ibis.util import any_of + + +def substring_rowwise(row): + arg, start, length = row["arg"], row["start"], row["length"] + if length is None: + return arg[start:] + else: + return arg[start : start + length] + + +def substring_serieswise(arg, start, length): + if length is None: + return arg.str[start:] + else: + return arg.str[start : start + length] + + +def _sql_like_to_regex(pattern, escape): + """Convert a SQL `LIKE` pattern to an equivalent Python regular expression. + + Parameters + ---------- + pattern + A LIKE pattern with the following semantics: + * `%` matches zero or more characters + * `_` matches exactly one character + * To escape `%` and `_` (or to match the `escape` parameter + itself), prefix the desired character with `escape`. + escape + Escape character + + Returns + ------- + str + A regular expression pattern equivalent to the input SQL `LIKE` pattern. + + Examples + -------- + >>> sql_like_to_regex("6%") # default is to not escape anything + '^6.*$' + >>> sql_like_to_regex("6^%", escape="^") + '^6%$' + >>> sql_like_to_regex("6_") + '^6.$' + >>> sql_like_to_regex("6/_", escape="/") + '^6_$' + >>> sql_like_to_regex("%abc") # any string ending with "abc" + '^.*abc$' + >>> sql_like_to_regex("abc%") # any string starting with "abc" + '^abc.*$' + """ + cur_i = 0 + pattern_length = len(pattern) + + while cur_i < pattern_length: + nxt_i = cur_i + 1 + + cur = pattern[cur_i] + nxt = pattern[nxt_i] if nxt_i < pattern_length else None + + skip = 1 + + if nxt is not None and escape is not None and cur == escape: + yield nxt + skip = 2 + elif cur == "%": + yield ".*" + elif cur == "_": + yield "." 
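+ # any other character is passed through to the regular expression unchanged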
+ else: + yield cur + + cur_i += skip + + +def sql_like_to_regex(pattern, escape=None): + return f"^{''.join(_sql_like_to_regex(pattern, escape))}$" + + +def string_sqllike_serieswise(arg, pattern, escape): + pat = sql_like_to_regex(pattern, escape) + return arg.str.contains(pat, regex=True) + + +def string_sqlilike_serieswise(arg, pattern, escape): + pat = sql_like_to_regex(pattern, escape) + return arg.str.contains(pat, regex=True, flags=re.IGNORECASE) + + +def extract_userinfo_elementwise(x): + url_parts = urlsplit(x) + username = url_parts.username or "" + password = url_parts.password or "" + return f"{username}:{password}" + + +def extract_queryparam_rowwise(row): + query = urlsplit(row["arg"]).query + param_name = row["key"] + if param_name is not None: + value = parse_qs(query)[param_name] + return value if len(value) > 1 else value[0] + else: + return query + + +def array_index_rowwise(row): + try: + return row["arg"][row["index"]] + except IndexError: + return None + + +def array_position_rowwise(row): + try: + return row["arg"].index(row["other"]) + except ValueError: + return -1 + + +def integer_range_rowwise(row): + if not row["step"]: + return [] + return list(np.arange(row["start"], row["stop"], row["step"])) + + +def timestamp_range_rowwise(row): + if not row["step"]: + return [] + return list( + pd.date_range(row["start"], row["stop"], freq=row["step"], inclusive="left") + ) + + +def _safe_method(mapping, method, *args, **kwargs): + if mapping is None or mapping is pd.NA: + return None + try: + method = getattr(mapping, method) + except AttributeError: + return None + else: + result = method(*args, **kwargs) + return None if result is pd.NA else result + + +def safe_len(mapping): + return _safe_method(mapping, "__len__") + + +def safe_get(mapping, key, default=None): + return _safe_method(mapping, "get", key, default) + + +def safe_contains(mapping, key): + return _safe_method(mapping, "__contains__", key) + + +def safe_keys(mapping): + result = _safe_method(mapping, "keys") + if result is None: + return None + # list(...) to unpack iterable + return np.array(list(result)) + + +def safe_values(mapping): + result = _safe_method(mapping, "values") + if result is None or result is pd.NA: + return None + # list(...) 
to unpack iterable + return np.array(list(result), dtype="object") + + +def safe_merge(left, right): + if left is None or left is pd.NA: + return None + elif right is None or right is pd.NA: + return None + else: + return {**left, **right} + + +def safe_json_getitem(value, key): + try: + # try to deserialize the value -> return None if it's None + if (js := json.loads(value)) is None: + return None + except (json.JSONDecodeError, TypeError): + # if there's an error related to decoding or a type error return None + return None + + try: + # try to extract the value as an array element or mapping key + return js[key] + except (KeyError, IndexError, TypeError): + # KeyError: missing mapping key + # IndexError: missing sequence key + # TypeError: `js` doesn't implement __getitem__, either at all or for + # the type of `key` + return None + + +def safe_decimal(func): + def wrapper(x, **kwargs): + try: + return func(x, **kwargs) + except decimal.InvalidOperation: + return decimal.Decimal("NaN") + + return wrapper + + +def round_serieswise(arg, digits): + if digits is None: + return np.round(arg).astype("int64") + else: + return np.round(arg, digits).astype("float64") + + +_generic_impls = { + ops.Abs: abs, + ops.Acos: np.arccos, + ops.Add: operator.add, + ops.And: operator.and_, + ops.Asin: np.arcsin, + ops.Atan: np.arctan, + ops.Atan2: np.arctan2, + ops.BitwiseAnd: lambda x, y: np.bitwise_and(x, y), + ops.BitwiseLeftShift: lambda x, y: np.left_shift(x, y).astype("int64"), + ops.BitwiseNot: np.invert, + ops.BitwiseOr: lambda x, y: np.bitwise_or(x, y), + ops.BitwiseRightShift: lambda x, y: np.right_shift(x, y).astype("int64"), + ops.BitwiseXor: lambda x, y: np.bitwise_xor(x, y), + ops.Ceil: lambda x: np.ceil(x).astype("int64"), + ops.Cos: np.cos, + ops.Cot: lambda x: 1 / np.tan(x), + ops.DateAdd: operator.add, + ops.DateDiff: operator.sub, + ops.DateSub: operator.sub, + ops.Degrees: np.degrees, + ops.Divide: operator.truediv, + ops.Equals: operator.eq, + ops.Exp: np.exp, + ops.Floor: lambda x: np.floor(x).astype("int64"), + ops.FloorDivide: operator.floordiv, + ops.Greater: operator.gt, + ops.GreaterEqual: operator.ge, + ops.IdenticalTo: lambda x, y: (x == y) | (pd.isnull(x) & pd.isnull(y)), + ops.IntervalAdd: operator.add, + ops.IntervalFloorDivide: operator.floordiv, + ops.IntervalMultiply: operator.mul, + ops.IntervalSubtract: operator.sub, + ops.IsInf: np.isinf, + ops.IsNull: pd.isnull, + ops.Less: operator.lt, + ops.LessEqual: operator.le, + ops.Ln: np.log, + ops.Log10: np.log10, + ops.Log2: np.log2, + ops.Modulus: operator.mod, + ops.Multiply: operator.mul, + ops.Negate: lambda x: not x if isinstance(x, (bool, np.bool_)) else -x, + ops.Not: lambda x: not x if isinstance(x, (bool, np.bool_)) else ~x, + ops.NotEquals: operator.ne, + ops.NotNull: pd.notnull, + ops.Or: operator.or_, + ops.Power: operator.pow, + ops.Radians: np.radians, + ops.Sign: np.sign, + ops.Sin: np.sin, + ops.Sqrt: np.sqrt, + ops.Subtract: operator.sub, + ops.Tan: np.tan, + ops.TimestampAdd: operator.add, + ops.TimestampDiff: operator.sub, + ops.TimestampSub: operator.sub, + ops.Xor: operator.xor, + ops.E: lambda: np.e, + ops.Pi: lambda: np.pi, + ops.TimestampNow: lambda: pd.Timestamp("now", tz="UTC").tz_localize(None), + ops.StringConcat: lambda xs: reduce(operator.add, xs), + ops.StringJoin: lambda sep, xs: reduce(lambda x, y: x + sep + y, xs), + ops.Log: lambda x, base: np.log(x) if base is None else np.log(x) / np.log(base), +} + +_columnwise_impls = { + ops.Clip: lambda df: df["arg"].clip(lower=df["lower"], 
upper=df["upper"]), + ops.IfElse: lambda df: df["true_expr"].where( + df["bool_expr"], other=df["false_null_expr"] + ), + ops.NullIf: lambda df: df["arg"].where(df["arg"] != df["null_if_expr"]), + ops.Repeat: lambda df: df["arg"] * df["times"], +} + +_rowwise_impls = { + ops.ArrayContains: lambda row: row["other"] in row["arg"], + ops.ArrayIndex: array_index_rowwise, + ops.ArrayPosition: array_position_rowwise, + ops.ArrayRemove: lambda row: [x for x in row["arg"] if x != row["other"]], + ops.ArrayRepeat: lambda row: np.tile(row["arg"], max(0, row["times"])), + ops.ArraySlice: lambda row: row["arg"][row["start"] : row["stop"]], + ops.ArrayUnion: lambda row: toolz.unique(row["left"] + row["right"]), + ops.EndsWith: lambda row: row["arg"].endswith(row["end"]), + ops.IntegerRange: integer_range_rowwise, + ops.JSONGetItem: lambda row: safe_json_getitem(row["arg"], row["index"]), + ops.Map: lambda row: dict(zip(row["keys"], row["values"])), + ops.MapGet: lambda row: safe_get(row["arg"], row["key"], row["default"]), + ops.MapContains: lambda row: safe_contains(row["arg"], row["key"]), + ops.MapMerge: lambda row: safe_merge(row["left"], row["right"]), + ops.TimestampRange: timestamp_range_rowwise, + ops.LPad: lambda row: row["arg"].rjust(row["length"], row["pad"]), + ops.RegexExtract: lambda row: re.search(row["pattern"], row["arg"]).group( + row["index"] + ), + ops.RegexReplace: lambda row: re.sub( + row["pattern"], row["replacement"], row["arg"] + ), + ops.RegexSearch: lambda row: re.search(row["pattern"], row["arg"]) is not None, + ops.RPad: lambda row: row["arg"].ljust(row["length"], row["pad"]), + ops.StartsWith: lambda row: row["arg"].startswith(row["start"]), + ops.StringContains: lambda row: row["haystack"].contains(row["needle"]), + ops.StringFind: lambda row: row["arg"].find( + row["substr"], row["start"], row["end"] + ), + ops.StringReplace: lambda row: row["arg"].replace( + row["pattern"], row["replacement"] + ), + ops.StringSplit: lambda row: row["arg"].split(row["delimiter"]), + ops.StrRight: lambda row: row["arg"][-row["nchars"] :], + ops.Translate: lambda row: row["arg"].translate( + str.maketrans(row["from_str"], row["to_str"]) + ), + ops.Substring: substring_rowwise, + ops.ExtractQuery: extract_queryparam_rowwise, + ops.Strftime: lambda row: row["arg"].strftime(row["format_str"]), +} + +_serieswise_impls = { + ops.Between: lambda arg, lower_bound, upper_bound: arg.between( + lower_bound, upper_bound + ), + ops.Capitalize: lambda arg: arg.str.capitalize(), + ops.Date: lambda arg: arg.dt.floor("d"), + ops.DayOfWeekIndex: lambda arg: pd.to_datetime(arg).dt.dayofweek, + ops.DayOfWeekName: lambda arg: pd.to_datetime(arg).dt.day_name(), + ops.EndsWith: lambda arg, end: arg.str.endswith(end), + ops.ExtractDay: lambda arg: arg.dt.day, + ops.ExtractDayOfYear: lambda arg: arg.dt.dayofyear, + ops.ExtractEpochSeconds: lambda arg: arg.astype("datetime64[s]") + .astype("int64") + .astype("int32"), + ops.ExtractHour: lambda arg: arg.dt.hour, + ops.ExtractMicrosecond: lambda arg: arg.dt.microsecond, + ops.ExtractMillisecond: lambda arg: arg.dt.microsecond // 1000, + ops.ExtractMinute: lambda arg: arg.dt.minute, + ops.ExtractMonth: lambda arg: arg.dt.month, + ops.ExtractQuarter: lambda arg: arg.dt.quarter, + ops.ExtractSecond: lambda arg: arg.dt.second, + ops.ExtractWeekOfYear: lambda arg: arg.dt.isocalendar().week.astype("int32"), + ops.ExtractYear: lambda arg: arg.dt.year, + ops.Lowercase: lambda arg: arg.str.lower(), + ops.LPad: lambda arg, length, pad: arg.str.rjust(length, fillchar=pad), + 
ops.LStrip: lambda arg: arg.str.lstrip(), + ops.Repeat: lambda arg, times: arg.str.repeat(times), + ops.Reverse: lambda arg: arg.str[::-1], + ops.Round: round_serieswise, + ops.RPad: lambda arg, length, pad: arg.str.ljust(length, fillchar=pad), + ops.RStrip: lambda arg: arg.str.rstrip(), + ops.StartsWith: lambda arg, start: arg.str.startswith(start), + ops.StringAscii: lambda arg: arg.map(ord, na_action="ignore").astype("int32"), + ops.StringContains: lambda haystack, needle: haystack.str.contains( + needle, regex=False + ), + ops.StringFind: lambda arg, substr, start, end: arg.str.find(substr, start, end), + ops.StringLength: lambda arg: arg.str.len().astype("int32"), + ops.StringReplace: lambda arg, pattern, replacement: arg.str.replace( + pattern, replacement + ), + ops.StringSplit: lambda arg, delimiter: arg.str.split(delimiter), + ops.StringSQLLike: string_sqllike_serieswise, + ops.StringSQLILike: string_sqlilike_serieswise, + ops.Strip: lambda arg: arg.str.strip(), + ops.Strftime: lambda arg, format_str: arg.dt.strftime(format_str), + ops.StrRight: lambda arg, nchars: arg.str[-nchars:], + ops.Substring: substring_serieswise, + ops.Time: lambda arg: arg.dt.time, + ops.TimestampFromUNIX: lambda arg, unit: pd.to_datetime(arg, unit=unit.short), + ops.Translate: lambda arg, from_str, to_str: arg.str.translate( + str.maketrans(from_str, to_str) + ), + ops.Uppercase: lambda arg: arg.str.upper(), +} + +_elementwise_impls = { + ops.ExtractProtocol: lambda x: getattr(urlsplit(x), "scheme", ""), + ops.ExtractAuthority: lambda x: getattr(urlsplit(x), "netloc", ""), + ops.ExtractPath: lambda x: getattr(urlsplit(x), "path", ""), + ops.ExtractFragment: lambda x: getattr(urlsplit(x), "fragment", ""), + ops.ExtractHost: lambda x: getattr(urlsplit(x), "hostname", ""), + ops.ExtractUserInfo: extract_userinfo_elementwise, + ops.StructField: lambda x, field: safe_get(x, field), + ops.ArrayLength: len, + ops.ArrayFlatten: toolz.concat, + ops.ArraySort: sorted, + ops.ArrayDistinct: toolz.unique, + ops.MapLength: safe_len, + ops.MapKeys: safe_keys, + ops.MapValues: safe_values, +} + + +_elementwise_decimal_impls = { + ops.Round: lambda x, digits=0: round(x, digits), + ops.Log10: safe_decimal(lambda x: x.log10()), + ops.Ln: safe_decimal(lambda x: x.ln()), + ops.Exp: safe_decimal(lambda x: x.exp()), + ops.Floor: safe_decimal(math.floor), + ops.Ceil: safe_decimal(math.ceil), + ops.Sqrt: safe_decimal(lambda x: x.sqrt()), + ops.Log2: safe_decimal(lambda x: x.ln() / decimal.Decimal(2).ln()), + ops.Sign: safe_decimal(lambda x: math.copysign(1, x)), + ops.Log: safe_decimal(lambda x, base: x.ln() / decimal.Decimal(base).ln()), +} + + +def pick_kernel(op, operands): + typ = type(op) + + # decimal operations have special implementations + if op.dtype.is_decimal(): + func = _elementwise_decimal_impls[typ] + return elementwise(func, operands) + + # prefer generic implementations if available + if func := _generic_impls.get(typ): + return generic(func, operands) + + first, *rest = operands.values() + is_multi_arg = bool(rest) + is_multi_column = any_of(rest, pd.Series) + + if is_multi_column: + if func := _columnwise_impls.get(typ): + return columnwise(func, operands) + elif func := _rowwise_impls.get(typ): + return rowwise(func, operands) + else: + raise OperationNotDefinedError( + "No columnwise or rowwise implementation found for " + f"multi-column operation {typ}" + ) + elif is_multi_arg: + if func := _columnwise_impls.get(typ): + return columnwise(func, operands) + elif func := _serieswise_impls.get(typ): + return 
serieswise(func, operands) + elif func := _rowwise_impls.get(typ): + return rowwise(func, operands) + elif func := _elementwise_impls.get(typ): + return elementwise(func, operands) + else: + raise OperationNotDefinedError( + "No columnwise, serieswise, rowwise or elementwise " + f"implementation found for multi-argument operation {typ}" + ) + else: # noqa: PLR5501 + if func := _serieswise_impls.get(typ): + return serieswise(func, operands) + elif func := _elementwise_impls.get(typ): + return elementwise(func, operands) + else: + raise OperationNotDefinedError( + "No serieswise or elementwise implementation found for " + f"single-argument operation {typ}" + ) + + +supported_operations = ( + _generic_impls.keys() + | _columnwise_impls.keys() + | _rowwise_impls.keys() + | _serieswise_impls.keys() + | _elementwise_impls.keys() +) diff --git a/ibis/backends/pandas/rewrites.py b/ibis/backends/pandas/rewrites.py new file mode 100644 index 0000000000000..7419f92d498d6 --- /dev/null +++ b/ibis/backends/pandas/rewrites.py @@ -0,0 +1,322 @@ +from __future__ import annotations + +from public import public + +import ibis +import ibis.expr.datashape as ds +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops +from ibis.common.annotations import attribute +from ibis.common.collections import FrozenDict +from ibis.common.patterns import replace +from ibis.common.typing import VarTuple # noqa: TCH001 +from ibis.expr.schema import Schema +from ibis.util import gen_name + + +class PandasRelation(ops.Relation): + pass + + +class PandasValue(ops.Value): + pass + + +@public +class PandasRename(PandasRelation): + parent: ops.Relation + mapping: FrozenDict[str, str] + + @classmethod + def from_prefix(cls, parent, prefix): + mapping = {k: f"{prefix}_{k}" for k in parent.schema} + return cls(parent, mapping) + + @attribute + def values(self): + return FrozenDict( + {to: ops.Field(self.parent, from_) for from_, to in self.mapping.items()} + ) + + @attribute + def schema(self): + return Schema( + {self.mapping[name]: dtype for name, dtype in self.parent.schema.items()} + ) + + +@public +class PandasJoin(PandasRelation): + left: ops.Relation + right: ops.Relation + left_on: VarTuple[ops.Value] + right_on: VarTuple[ops.Value] + how: str + + @attribute + def values(self): + return FrozenDict({**self.left.values, **self.right.values}) + + @attribute + def schema(self): + return self.left.schema | self.right.schema + + +@public +class PandasAsofJoin(PandasJoin): + left_by: VarTuple[ops.Value] + right_by: VarTuple[ops.Value] + operator: type + + +@public +class PandasAggregate(PandasRelation): + parent: ops.Relation + groups: FrozenDict[str, ops.Field] + metrics: FrozenDict[str, ops.Reduction] + + @attribute + def values(self): + return FrozenDict({**self.groups, **self.metrics}) + + @attribute + def schema(self): + return Schema({k: v.dtype for k, v in self.values.items()}) + + +@public +class PandasLimit(PandasRelation): + parent: ops.Relation + n: ops.Relation + offset: ops.Relation + + @attribute + def values(self): + return self.parent.values + + @attribute + def schema(self): + return self.parent.schema + + +@public +class PandasScalarSubquery(PandasValue): + # variant with no integrity checks + rel: ops.Relation + + shape = ds.scalar + + @attribute + def dtype(self): + return self.rel.schema.types[0] + + +def is_columnar(node): + return isinstance(node, ops.Value) and node.shape.is_columnar() + + +@replace(ops.Project) +def rewrite_project(_, **kwargs): + winfuncs = [] + for v in _.values.values(): + 
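# collect the window functions appearing in this projected value expression +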
winfuncs.extend(v.find(ops.WindowFunction, ops.Value)) + + if not winfuncs: + return _ + + selects = {ops.Field(_.parent, k): k for k in _.parent.schema} + for node in winfuncs: + # add computed values from the window function + values = list(node.func.__args__) + # add computed values from the window frame + values += node.frame.group_by + values += [key.expr for key in node.frame.order_by] + if node.frame.start is not None: + values.append(node.frame.start.value) + if node.frame.end is not None: + values.append(node.frame.end.value) + + for v in values: + if is_columnar(v) and v not in selects: + selects[v] = gen_name("value") + + # STEP 1: construct the pre-projection + proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) + subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} + + # STEP 2: construct new window function nodes + metrics = {} + for node in winfuncs: + frame = node.frame + start = None if frame.start is None else frame.start.replace(subs) + end = None if frame.end is None else frame.end.replace(subs) + order_by = [key.replace(subs) for key in frame.order_by] + group_by = [key.replace(subs) for key in frame.group_by] + frame = frame.__class__( + proj, start=start, end=end, group_by=group_by, order_by=order_by + ) + metrics[node] = ops.WindowFunction(node.func.replace(subs), frame) + + # STEP 3: reconstruct the current projection with the window functions + subs.update(metrics) + values = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} + return ops.Project(proj, values) + + +@replace(ops.Aggregate) +def rewrite_aggregate(_, **kwargs): + selects = {ops.Field(_.parent, k): k for k in _.parent.schema} + for v in _.groups.values(): + if v not in selects: + selects[v] = gen_name("group") + + reductions = {} + for v in _.metrics.values(): + for reduction in v.find_topmost(ops.Reduction): + for arg in reduction.__args__: + if is_columnar(arg) and arg not in selects: + selects[arg] = gen_name("value") + if reduction not in reductions: + reductions[reduction] = gen_name("reduction") + + # STEP 1: construct the pre-projection + proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) + + # STEP 2: construct the pandas aggregation + subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} + groups = {name: ops.Field(proj, selects[node]) for name, node in _.groups.items()} + metrics = {name: node.replace(subs) for node, name in reductions.items()} + agg = PandasAggregate(proj, groups, metrics) + + # STEP 3: construct the post-projection + subs = {node: ops.Field(agg, name) for node, name in reductions.items()} + values = {name: ops.Field(agg, name) for name, node in _.groups.items()} + values.update({name: node.replace(subs) for name, node in _.metrics.items()}) + return ops.Project(agg, values) + + +def split_join_predicates(left, right, predicates, only_equality=True): + left_on = [] + right_on = [] + for pred in predicates: + if left not in pred.relations or right not in pred.relations: + # not a usual join predicate, so apply a trick by placing the + # predicate to the left side and adding a literal True to the right + # which the left side must be equal to + left_on.append(pred) + right_on.append(ops.Literal(True, dtype=dt.boolean)) + elif isinstance(pred, ops.Binary): + if only_equality and not isinstance(pred, ops.Equals): + raise TypeError("Only equality join predicates supported with pandas") + if left in pred.left.relations and right in pred.right.relations: + left_on.append(pred.left) + 
right_on.append(pred.right) + elif left in pred.right.relations and right in pred.left.relations: + left_on.append(pred.right) + right_on.append(pred.left) + else: + raise ValueError("Join predicate does not reference both tables") + else: + raise TypeError(f"Unsupported join predicate {pred}") + + return left_on, right_on + + +@replace(ops.JoinChain) +def rewrite_join(_, **kwargs): + prefixes = {} + prefixes[_.first] = prefix = str(len(prefixes)) + left = PandasRename.from_prefix(_.first, prefix) + + for link in _.rest: + prefixes[link.table] = prefix = str(len(prefixes)) + right = PandasRename.from_prefix(link.table, prefix) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + subs.update({v: ops.Field(right, k) for k, v in right.values.items()}) + preds = [pred.replace(subs, filter=ops.Value) for pred in link.predicates] + + # separate ASOF from the rest of the joins + if link.how == "asof": + on, *by = preds + left_on, right_on = split_join_predicates( + left, right, [on], only_equality=False + ) + left_by, right_by = split_join_predicates(left, right, by) + left = PandasAsofJoin( + how="asof", + left=left, + right=right, + left_on=left_on, + right_on=right_on, + left_by=left_by, + right_by=right_by, + operator=type(on), + ) + else: + # need to replace the fields in the predicates + left_on, right_on = split_join_predicates(left, right, preds) + left = PandasJoin( + how=link.how, + left=left, + right=right, + left_on=left_on, + right_on=right_on, + ) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + fields = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} + return ops.Project(left, fields) + + +@replace(ops.Limit) +def rewrite_limit(_, **kwargs): + if isinstance(_.n, ops.Value): + n = _.n.to_expr() + else: + n = ibis.literal(_.n) + + if isinstance(_.offset, ops.Value): + offset = _.offset.to_expr() + else: + offset = ibis.literal(_.offset) + + n = n.as_table().op() + if isinstance(n, ops.Aggregate): + n = rewrite_aggregate.match(n, context={}) + + offset = offset.as_table().op() + if isinstance(offset, ops.Aggregate): + offset = rewrite_aggregate.match(offset, context={}) + + return PandasLimit(_.parent, n, offset) + + +@replace(ops.ScalarSubquery) +def rewrite_scalar_subquery(_, **kwargs): + return PandasScalarSubquery(_.rel) + + +@replace(ops.ScalarParameter) +def replace_parameter(_, params, **kwargs): + return ops.Literal(value=params[_], dtype=_.dtype) + + +@replace(ops.UnboundTable) +def bind_unbound_table(_, backend, **kwargs): + return ops.DatabaseTable(name=_.name, schema=_.schema, source=backend) + + +def plan(node, backend, params): + ctx = {"params": params, "backend": backend} + node = node.replace(rewrite_scalar_subquery) + node = node.replace( + rewrite_project + | rewrite_aggregate + | rewrite_join + | rewrite_limit + | replace_parameter + | bind_unbound_table, + context=ctx, + ) + return node diff --git a/ibis/backends/pandas/tests/conftest.py b/ibis/backends/pandas/tests/conftest.py index 8aa998871d2a8..41fcc924ed2cb 100644 --- a/ibis/backends/pandas/tests/conftest.py +++ b/ibis/backends/pandas/tests/conftest.py @@ -1,9 +1,16 @@ from __future__ import annotations +import decimal from typing import Any +import numpy as np +import pandas as pd +import pytest + import ibis +import ibis.expr.datatypes as dt from ibis.backends.conftest import TEST_TABLES +from ibis.backends.pandas import Backend from ibis.backends.tests.base import BackendTest from ibis.backends.tests.data import array_types, json_types, struct_types, 
win @@ -32,3 +39,282 @@ def _load_data(self, **_: Any) -> None: @staticmethod def connect(*, tmpdir, worker_id, **kw): return ibis.pandas.connect(**kw) + + +@pytest.fixture(scope="module") +def df(): + return pd.DataFrame( + { + "plain_int64": list(range(1, 4)), + "plain_strings": list("abc"), + "plain_float64": [4.0, 5.0, 6.0], + "plain_datetimes_naive": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ), + "plain_datetimes_ny": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).dt.tz_localize("America/New_York"), + "plain_datetimes_utc": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).dt.tz_localize("UTC"), + "plain_uint64": pd.Series(range(1, 4), dtype=np.dtype("uint64")), + "dup_strings": list("dad"), + "dup_ints": [1, 2, 1], + "float64_as_strings": ["100.01", "234.23", "-999.34"], + "int64_as_strings": list(map(str, range(1, 4))), + "strings_with_space": [" ", "abab", "ddeeffgg"], + "translate_from_strings": ["rmz", "abc", "ghj"], + "translate_to_strings": ["lns", "ovk", "jfr"], + "int64_with_zeros": [0, 1, 0], + "float64_with_zeros": [1.0, 0.0, 1.0], + "float64_positive": [1.0, 2.0, 1.0], + "strings_with_nulls": ["a", None, "b"], + "datetime_strings_naive": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).astype(str), + "datetime_strings_ny": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ) + .dt.tz_localize("America/New_York") + .astype(str), + "datetime_strings_utc": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ) + .dt.tz_localize("UTC") + .astype(str), + "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), + "array_of_float64": [ + np.array([1.0, 2.0], dtype="float64"), + np.array([3.0], dtype="float64"), + np.array([], dtype="float64"), + ], + "array_of_int64": [ + np.array([1, 2], dtype="int64"), + np.array([], dtype="int64"), + np.array([3], dtype="int64"), + ], + "array_of_strings": [ + np.array(["a", "b"], dtype="object"), + np.array([], dtype="object"), + np.array(["c"], dtype="object"), + ], + "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], + "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], + "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], + } + ) + + +@pytest.fixture(scope="module") +def batting_df(data_dir): + num_rows = 1000 + start_index = 30 + df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ + start_index : start_index + num_rows + ] + return df.reset_index(drop=True) + + +@pytest.fixture(scope="module") +def awards_players_df(data_dir): + return pd.read_parquet(data_dir / "parquet" / "awards_players.parquet") + + +@pytest.fixture(scope="module") +def df1(): + return pd.DataFrame( + {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} + ) + + +@pytest.fixture(scope="module") +def df2(): + return pd.DataFrame( + {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} + ) + + +@pytest.fixture(scope="module") +def intersect_df2(): + return pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) + + +@pytest.fixture(scope="module") +def time_df1(): + return pd.DataFrame( + {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} + ) + + +@pytest.fixture(scope="module") +def time_df2(): + return pd.DataFrame({"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]}) + + +@pytest.fixture(scope="module") +def time_df3(): + return pd.DataFrame( + { + "time": pd.Series( 
+ pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values + ), + "id": list(range(1, 5)) * 2, + "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], + } + ) + + +@pytest.fixture(scope="module") +def time_keyed_df1(): + return pd.DataFrame( + { + "time": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values + ), + "key": [1, 2, 3, 1, 2, 3], + "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], + } + ) + + +@pytest.fixture(scope="module") +def time_keyed_df2(): + return pd.DataFrame( + { + "time": pd.Series( + pd.date_range( + start="2017-01-02 01:02:03.234", freq="3D", periods=3 + ).values + ), + "key": [1, 2, 3], + "other_value": [1.1, 1.2, 2.2], + } + ) + + +@pytest.fixture(scope="module") +def client( + df, + df1, + df2, + df3, + time_df1, + time_df2, + time_df3, + time_keyed_df1, + time_keyed_df2, + intersect_df2, +): + return Backend().connect( + { + "df": df, + "df1": df1, + "df2": df2, + "df3": df3, + "left": df1, + "right": df2, + "time_df1": time_df1, + "time_df2": time_df2, + "time_df3": time_df3, + "time_keyed_df1": time_keyed_df1, + "time_keyed_df2": time_keyed_df2, + "intersect_df2": intersect_df2, + } + ) + + +@pytest.fixture(scope="module") +def df3(): + return pd.DataFrame( + { + "key": list("ac"), + "other_value": [4.0, 6.0], + "key2": list("ae"), + "key3": list("fe"), + } + ) + + +t_schema = { + "decimal": dt.Decimal(4, 3), + "array_of_float64": dt.Array(dt.double), + "array_of_int64": dt.Array(dt.int64), + "array_of_strings": dt.Array(dt.string), + "map_of_strings_integers": dt.Map(dt.string, dt.int64), + "map_of_integers_strings": dt.Map(dt.int64, dt.string), + "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), +} + + +@pytest.fixture(scope="module") +def t(client): + return client.table("df", schema=t_schema) + + +@pytest.fixture(scope="module") +def lahman(batting_df, awards_players_df): + return Backend().connect( + {"batting": batting_df, "awards_players": awards_players_df} + ) + + +@pytest.fixture(scope="module") +def left(client): + return client.table("left") + + +@pytest.fixture(scope="module") +def right(client): + return client.table("right") + + +@pytest.fixture(scope="module") +def time_left(client): + return client.table("time_df1") + + +@pytest.fixture(scope="module") +def time_right(client): + return client.table("time_df2") + + +@pytest.fixture(scope="module") +def time_table(client): + return client.table("time_df3") + + +@pytest.fixture(scope="module") +def time_keyed_left(client): + return client.table("time_keyed_df1") + + +@pytest.fixture(scope="module") +def time_keyed_right(client): + return client.table("time_keyed_df2") + + +@pytest.fixture(scope="module") +def batting(lahman): + return lahman.table("batting") + + +@pytest.fixture(scope="module") +def sel_cols(batting): + cols = batting.columns + start, end = cols.index("AB"), cols.index("H") + 1 + return ["playerID", "yearID", "teamID", "G"] + cols[start:end] + + +@pytest.fixture(scope="module") +def players_base(batting, sel_cols): + return batting[sel_cols].order_by(sel_cols[:3]) + + +@pytest.fixture(scope="module") +def players(players_base): + return players_base.group_by("playerID") + + +@pytest.fixture(scope="module") +def players_df(players_base): + return players_base.execute().reset_index(drop=True) diff --git a/ibis/backends/pandas/tests/execution/__init__.py b/ibis/backends/pandas/tests/execution/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/ibis/backends/pandas/tests/execution/conftest.py 
b/ibis/backends/pandas/tests/execution/conftest.py deleted file mode 100644 index 32d5efad67d2c..0000000000000 --- a/ibis/backends/pandas/tests/execution/conftest.py +++ /dev/null @@ -1,289 +0,0 @@ -from __future__ import annotations - -import decimal - -import numpy as np -import pandas as pd -import pytest - -import ibis.expr.datatypes as dt -from ibis.backends.pandas import Backend - - -@pytest.fixture(scope="module") -def df(): - return pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "plain_float64": [4.0, 5.0, 6.0], - "plain_datetimes_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ), - "plain_datetimes_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("America/New_York"), - "plain_datetimes_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("UTC"), - "plain_uint64": pd.Series(range(1, 4), dtype=np.dtype("uint64")), - "dup_strings": list("dad"), - "dup_ints": [1, 2, 1], - "float64_as_strings": ["100.01", "234.23", "-999.34"], - "int64_as_strings": list(map(str, range(1, 4))), - "strings_with_space": [" ", "abab", "ddeeffgg"], - "translate_from_strings": ["rmz", "abc", "ghj"], - "translate_to_strings": ["lns", "ovk", "jfr"], - "int64_with_zeros": [0, 1, 0], - "float64_with_zeros": [1.0, 0.0, 1.0], - "float64_positive": [1.0, 2.0, 1.0], - "strings_with_nulls": ["a", None, "b"], - "datetime_strings_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).astype(str), - "datetime_strings_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("America/New_York") - .astype(str), - "datetime_strings_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("UTC") - .astype(str), - "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), - "array_of_float64": [ - np.array([1.0, 2.0], dtype="float64"), - np.array([3.0], dtype="float64"), - np.array([], dtype="float64"), - ], - "array_of_int64": [ - np.array([1, 2], dtype="int64"), - np.array([], dtype="int64"), - np.array([3], dtype="int64"), - ], - "array_of_strings": [ - np.array(["a", "b"], dtype="object"), - np.array([], dtype="object"), - np.array(["c"], dtype="object"), - ], - "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], - "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], - "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], - } - ) - - -@pytest.fixture(scope="module") -def batting_df(data_dir): - num_rows = 1000 - start_index = 30 - df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ - start_index : start_index + num_rows - ] - return df.reset_index(drop=True) - - -@pytest.fixture(scope="module") -def awards_players_df(data_dir): - return pd.read_parquet(data_dir / "parquet" / "awards_players.parquet") - - -@pytest.fixture(scope="module") -def df1(): - return pd.DataFrame( - {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} - ) - - -@pytest.fixture(scope="module") -def df2(): - return pd.DataFrame( - {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} - ) - - -@pytest.fixture(scope="module") -def intersect_df2(): - return pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) - - -@pytest.fixture(scope="module") -def time_df1(): - return pd.DataFrame( - {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} - ) - 
- -@pytest.fixture(scope="module") -def time_df2(): - return pd.DataFrame({"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]}) - - -@pytest.fixture(scope="module") -def time_df3(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values - ), - "id": list(range(1, 5)) * 2, - "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df1(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values - ), - "key": [1, 2, 3, 1, 2, 3], - "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df2(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range( - start="2017-01-02 01:02:03.234", freq="3D", periods=3 - ).values - ), - "key": [1, 2, 3], - "other_value": [1.1, 1.2, 2.2], - } - ) - - -@pytest.fixture(scope="module") -def client( - df, - df1, - df2, - df3, - time_df1, - time_df2, - time_df3, - time_keyed_df1, - time_keyed_df2, - intersect_df2, -): - return Backend().connect( - { - "df": df, - "df1": df1, - "df2": df2, - "df3": df3, - "left": df1, - "right": df2, - "time_df1": time_df1, - "time_df2": time_df2, - "time_df3": time_df3, - "time_keyed_df1": time_keyed_df1, - "time_keyed_df2": time_keyed_df2, - "intersect_df2": intersect_df2, - } - ) - - -@pytest.fixture(scope="module") -def df3(): - return pd.DataFrame( - { - "key": list("ac"), - "other_value": [4.0, 6.0], - "key2": list("ae"), - "key3": list("fe"), - } - ) - - -t_schema = { - "decimal": dt.Decimal(4, 3), - "array_of_float64": dt.Array(dt.double), - "array_of_int64": dt.Array(dt.int64), - "array_of_strings": dt.Array(dt.string), - "map_of_strings_integers": dt.Map(dt.string, dt.int64), - "map_of_integers_strings": dt.Map(dt.int64, dt.string), - "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), -} - - -@pytest.fixture(scope="module") -def t(client): - return client.table("df", schema=t_schema) - - -@pytest.fixture(scope="module") -def lahman(batting_df, awards_players_df): - return Backend().connect( - {"batting": batting_df, "awards_players": awards_players_df} - ) - - -@pytest.fixture(scope="module") -def left(client): - return client.table("left") - - -@pytest.fixture(scope="module") -def right(client): - return client.table("right") - - -@pytest.fixture(scope="module") -def time_left(client): - return client.table("time_df1") - - -@pytest.fixture(scope="module") -def time_right(client): - return client.table("time_df2") - - -@pytest.fixture(scope="module") -def time_table(client): - return client.table("time_df3") - - -@pytest.fixture(scope="module") -def time_keyed_left(client): - return client.table("time_keyed_df1") - - -@pytest.fixture(scope="module") -def time_keyed_right(client): - return client.table("time_keyed_df2") - - -@pytest.fixture(scope="module") -def batting(lahman): - return lahman.table("batting") - - -@pytest.fixture(scope="module") -def sel_cols(batting): - cols = batting.columns - start, end = cols.index("AB"), cols.index("H") + 1 - return ["playerID", "yearID", "teamID", "G"] + cols[start:end] - - -@pytest.fixture(scope="module") -def players_base(batting, sel_cols): - return batting[sel_cols].order_by(sel_cols[:3]) - - -@pytest.fixture(scope="module") -def players(players_base): - return players_base.group_by("playerID") - - -@pytest.fixture(scope="module") -def players_df(players_base): - return players_base.execute().reset_index(drop=True) diff --git 
a/ibis/backends/pandas/tests/execution/test_timecontext.py b/ibis/backends/pandas/tests/execution/test_timecontext.py deleted file mode 100644 index 5a96cf33888fe..0000000000000 --- a/ibis/backends/pandas/tests/execution/test_timecontext.py +++ /dev/null @@ -1,399 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from packaging.version import parse as vparse - -import ibis -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import ( - TimeContext, - TimeContextRelation, - adjust_context, - compare_timecontext, - construct_time_context_aware_series, -) -from ibis.backends.pandas.execution import execute -from ibis.backends.pandas.execution.window import trim_window_result -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -class CustomAsOfJoin(ops.AsOfJoin): - pass - - -def test_execute_with_timecontext(time_table): - expr = time_table - # define a time context for time-series data - context = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - - # without time context, execute produces every row - df_all = expr.execute() - assert len(df_all["time"]) == 8 - - # with context set, execute produces only rows within context - df_within_context = expr.execute(timecontext=context) - assert len(df_within_context["time"]) == 1 - - -def test_bad_timecontext(time_table, t): - expr = time_table - - # define context with illegal string - with pytest.raises(com.IbisError, match=r".*type pd.Timestamp.*"): - context = ("bad", "context") - expr.execute(timecontext=context) - - # define context with unsupported type int - with pytest.raises(com.IbisError, match=r".*type pd.Timestamp.*"): - context = (20091010, 20100101) - expr.execute(timecontext=context) - - # define context with too few values - with pytest.raises(com.IbisError, match=r".*should specify.*"): - context = pd.Timestamp("20101010") - expr.execute(timecontext=context) - - # define context with begin value later than end - with pytest.raises(com.IbisError, match=r".*before or equal.*"): - context = (pd.Timestamp("20101010"), pd.Timestamp("20090101")) - expr.execute(timecontext=context) - - # execute context with a table without TIME_COL - with pytest.raises(com.IbisError, match=r".*must have a time column.*"): - context = (pd.Timestamp("20090101"), pd.Timestamp("20100101")) - t.execute(timecontext=context) - - -def test_bad_call_to_adjust_context(): - op = "not_a_node" - context = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - scope = Scope() - with pytest.raises( - com.IbisError, match=r".*Unsupported input type for adjust context.*" - ): - adjust_context(op, scope, context) - - -def test_compare_timecontext(): - c1 = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - c2 = (pd.Timestamp("20170101"), pd.Timestamp("20170111")) - c3 = (pd.Timestamp("20160101"), pd.Timestamp("20160103")) - c4 = (pd.Timestamp("20161215"), pd.Timestamp("20170102")) - assert compare_timecontext(c1, c2) == TimeContextRelation.SUBSET - assert compare_timecontext(c2, c1) == TimeContextRelation.SUPERSET - assert compare_timecontext(c1, c4) == TimeContextRelation.OVERLAP - assert compare_timecontext(c1, c3) == TimeContextRelation.NONOVERLAP - - -def test_context_adjustment_asof_join( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - expr = time_keyed_left.asof_join( - time_keyed_right, "time", by="key", tolerance=4 * ibis.interval(days=1) - )[time_keyed_left, 
time_keyed_right.other_value] - context = (pd.Timestamp("20170105"), pd.Timestamp("20170111")) - result = expr.execute(timecontext=context) - - # compare with asof_join of manually trimmed tables - trimmed_df1 = time_keyed_df1[time_keyed_df1["time"] >= context[0]][ - time_keyed_df1["time"] < context[1] - ] - trimmed_df2 = time_keyed_df2[ - time_keyed_df2["time"] >= context[0] - pd.Timedelta(days=4) - ][time_keyed_df2["time"] < context[1]] - expected = pd.merge_asof( - trimmed_df1, - trimmed_df2, - on="time", - by="key", - tolerance=pd.Timedelta("4D"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ["interval_ibis", "interval_pd"], - [ - (ibis.interval(days=1), "1d"), - (3 * ibis.interval(days=1), "3d"), - (5 * ibis.interval(days=1), "5d"), - ], -) -def test_context_adjustment_window(time_table, time_df3, interval_ibis, interval_pd): - # trim data manually - expected = ( - time_df3.set_index("time").value.rolling(interval_pd, closed="both").mean() - ) - expected = expected[expected.index >= pd.Timestamp("20170105")].reset_index( - drop=True - ) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - window = ibis.trailing_window(interval_ibis, order_by=time_table.time) - expr = time_table["value"].mean().over(window) - # result should adjust time context accordingly - result = expr.execute(timecontext=context) - tm.assert_series_equal(result, expected) - - -def test_trim_window_result(time_df3): - """Unit test `trim_window_result` in Window execution.""" - df = time_df3.copy() - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - # trim_window_result takes a MultiIndex Series as input - series = df["value"] - time_index = df.set_index("time").index - series.index = pd.MultiIndex.from_arrays( - [series.index, time_index], - names=series.index.names + ["time"], - ) - result = trim_window_result(series, context) - expected = df["time"][df["time"] >= pd.Timestamp("20170105")].reset_index(drop=True) - - # result should adjust time context accordingly - tm.assert_series_equal(result.reset_index()["time"], expected) - - # trim with a non-datetime type of 'time' throws Exception - wrong_series = df["id"] - df["time"] = df["time"].astype(str) - time_index = df.set_index("time").index - wrong_series.index = pd.MultiIndex.from_arrays( - [wrong_series.index, time_index], - names=wrong_series.index.names + ["time"], - ) - with pytest.raises(TypeError, match=r".*not supported between instances.*"): - trim_window_result(wrong_series, context) - - # column is ignored and series is not trimmed - no_context_result = trim_window_result(series, None) - tm.assert_series_equal(no_context_result, series) - - -def test_setting_timecontext_in_scope(time_table, time_df3): - expected_win_1 = ( - time_df3.set_index("time").value.rolling("3d", closed="both").mean() - ) - expected_win_1 = expected_win_1[ - expected_win_1.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - window1 = ibis.trailing_window(3 * ibis.interval(days=1), order_by=time_table.time) - """In the following expression, Selection node will be executed first and - get table in context ('20170105', '20170101'). - - Then in window execution table will be executed again with a larger - context adjusted by window preceding days ('20170102', '20170111'). - To get the correct result, the cached table result with a smaller - context must be discard and updated to a larger time range. 
- """ - expr = time_table.mutate(value=time_table["value"].mean().over(window1)) - result = expr.execute(timecontext=context) - tm.assert_series_equal(result["value"], expected_win_1) - - -def test_context_adjustment_multi_window(time_table, time_df3): - expected_win_1 = ( - time_df3.set_index("time") - .rename(columns={"value": "v1"})["v1"] - .rolling("3d", closed="both") - .mean() - ) - expected_win_1 = expected_win_1[ - expected_win_1.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - expected_win_2 = ( - time_df3.set_index("time") - .rename(columns={"value": "v2"})["v2"] - .rolling("2d", closed="both") - .mean() - ) - expected_win_2 = expected_win_2[ - expected_win_2.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - window1 = ibis.trailing_window(3 * ibis.interval(days=1), order_by=time_table.time) - window2 = ibis.trailing_window(2 * ibis.interval(days=1), order_by=time_table.time) - expr = time_table.mutate( - v1=time_table["value"].mean().over(window1), - v2=time_table["value"].mean().over(window2), - ) - result = expr.execute(timecontext=context) - - tm.assert_series_equal(result["v1"], expected_win_1) - tm.assert_series_equal(result["v2"], expected_win_2) - - -@pytest.mark.xfail( - condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"), - raises=ValueError, - reason="https://github.com/pandas-dev/pandas/pull/44068", -) -def test_context_adjustment_window_groupby_id(time_table, time_df3): - """This test case is meant to test trim_window_result method in - pandas/execution/window.py to see if it could trim Series correctly with - groupby params.""" - expected = ( - time_df3.set_index("time") - .groupby("id") - .value.rolling("3d", closed="both") - .mean() - ) - # This is a MultiIndexed Series - expected = expected.reset_index() - expected = expected[expected.time >= pd.Timestamp("20170105")].reset_index( - drop=True - )["value"] - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - # expected.index.name = None - window = ibis.trailing_window( - 3 * ibis.interval(days=1), group_by="id", order_by=time_table.time - ) - expr = time_table["value"].mean().over(window) - # result should adjust time context accordingly - result = expr.execute(timecontext=context) - tm.assert_series_equal(result, expected) - - -def test_adjust_context_scope(time_keyed_left, time_keyed_right): - """Test that `adjust_context` has access to `scope` by default.""" - - @adjust_context.register(CustomAsOfJoin) - def adjust_context_custom_asof_join( - op: ops.AsOfJoin, - scope: Scope, - timecontext: TimeContext, - ) -> TimeContext: - """Confirms that `scope` is passed in.""" - assert scope is not None - return timecontext - - expr = CustomAsOfJoin( - left=time_keyed_left, - right=time_keyed_right, - predicates="time", - by="key", - tolerance=ibis.interval(days=4), - ).to_expr() - expr = expr[time_keyed_left, time_keyed_right.other_value] - context = (pd.Timestamp("20170105"), pd.Timestamp("20170111")) - expr.execute(timecontext=context) - - -def test_adjust_context_complete_shift( - time_keyed_left, - time_keyed_right, - time_keyed_df1, - time_keyed_df2, -): - """Test `adjust_context` function that completely shifts the context. - - This results in an adjusted context that is NOT a subset of the - original context. This is unlike an `adjust_context` function - that only expands the context. - - See #3104 - """ - - # Create a contrived `adjust_context` function for - # CustomAsOfJoin to mock this. 
- - @adjust_context.register(CustomAsOfJoin) - def adjust_context_custom_asof_join( - op: ops.AsOfJoin, - scope: Scope, - timecontext: TimeContext, - ) -> TimeContext: - """Shifts both the begin and end in the same direction.""" - - begin, end = timecontext - timedelta = execute(op.tolerance) - return (begin - timedelta, end - timedelta) - - expr = CustomAsOfJoin( - left=time_keyed_left, - right=time_keyed_right, - predicates="time", - by="key", - tolerance=ibis.interval(days=4), - ).to_expr() - expr = expr[time_keyed_left, time_keyed_right.other_value] - context = (pd.Timestamp("20170101"), pd.Timestamp("20170111")) - result = expr.execute(timecontext=context) - - # Compare with asof_join of manually trimmed tables - # Left table: No shift for context - # Right table: Shift both begin and end of context by 4 days - trimmed_df1 = time_keyed_df1[time_keyed_df1["time"] >= context[0]][ - time_keyed_df1["time"] < context[1] - ] - trimmed_df2 = time_keyed_df2[ - time_keyed_df2["time"] >= context[0] - pd.Timedelta(days=4) - ][time_keyed_df2["time"] < context[1] - pd.Timedelta(days=4)] - expected = pd.merge_asof( - trimmed_df1, - trimmed_df2, - on="time", - by="key", - tolerance=pd.Timedelta("4D"), - ) - - tm.assert_frame_equal(result, expected) - - -def test_construct_time_context_aware_series(time_df3): - """Unit test for `construct_time_context_aware_series`""" - # Series without 'time' index will result in a MultiIndex with 'time' - df = time_df3 - expected = df["value"] - time_index = pd.Index(df["time"]) - expected.index = pd.MultiIndex.from_arrays( - [expected.index, time_index], - names=expected.index.names + ["time"], - ) - result = construct_time_context_aware_series(df["value"], df) - tm.assert_series_equal(result, expected) - - # Series with 'time' as index will not change - time_indexed_df = time_df3.set_index("time") - expected_time_aware = time_indexed_df["value"] - result_time_aware = construct_time_context_aware_series( - time_indexed_df["value"], time_indexed_df - ) - tm.assert_series_equal(result_time_aware, expected_time_aware) - - # Series with a MultiIndex, where 'time' is in the MultiIndex, - # will not change - multi_index_time_aware_series = result_time_aware - expected_multi_index_time_aware = result_time_aware - result_multi_index_time_aware = construct_time_context_aware_series( - multi_index_time_aware_series, time_indexed_df - ) - tm.assert_series_equal( - result_multi_index_time_aware, expected_multi_index_time_aware - ) - - # Series with a MultiIndex, where 'time' is NOT in the MultiIndex, - # 'time' will be added into the MultiIndex - multi_index_series = df["id"] - expected_multi_index = df["id"].copy() - other_index = pd.Index(df["value"]) - expected_multi_index.index = pd.MultiIndex.from_arrays( - [expected_multi_index.index, other_index, time_index], - names=expected_multi_index.index.names + ["value", "time"], - ) - multi_index_series.index = pd.MultiIndex.from_arrays( - [multi_index_series.index, other_index], - names=multi_index_series.index.names + ["value"], - ) - result_multi_index = construct_time_context_aware_series(multi_index_series, df) - tm.assert_series_equal(result_multi_index, expected_multi_index) diff --git a/ibis/backends/pandas/tests/test_aggcontext.py b/ibis/backends/pandas/tests/test_aggcontext.py deleted file mode 100644 index 8fc7281a6fa7e..0000000000000 --- a/ibis/backends/pandas/tests/test_aggcontext.py +++ /dev/null @@ -1,167 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest -from 
pandas import testing as tm -from pytest import param - -from ibis.backends.pandas.aggcontext import Summarize, window_agg_udf - -df = pd.DataFrame( - { - "id": [1, 2, 1, 2], - "v1": [1.0, 2.0, 3.0, 4.0], - "v2": [10.0, 20.0, 30.0, 40.0], - } -) - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1: v1.mean(), - lambda df: df["v1"].mean(), - id="udf", - ), - param( - "mean", - lambda df: df["v1"].mean(), - id="string", - ), - ], -) -def test_summarize_single_series(agg_fn, expected_fn): - """Test Summarize.agg operating on a single Series.""" - - aggcontext = Summarize() - - result = aggcontext.agg(df["v1"], agg_fn) - expected = expected_fn(df) - - assert result == expected - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1: v1.mean(), - lambda df: df["v1"].mean(), - id="udf", - ), - param( - "mean", - lambda df: df["v1"].mean(), - id="string", - ), - ], -) -def test_summarize_single_seriesgroupby(agg_fn, expected_fn): - """Test Summarize.agg operating on a single SeriesGroupBy.""" - - aggcontext = Summarize() - - df_grouped = df.sort_values("id").groupby("id") - result = aggcontext.agg(df_grouped["v1"], agg_fn) - - expected = expected_fn(df_grouped) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1, v2: v1.mean() - v2.mean(), - lambda df: df["v1"].mean() - df["v2"].mean(), - id="two-column", - ), - # Two columns, but only the second one is actually used in UDF - param( - lambda v1, v2: v2.mean(), - lambda df: df["v2"].mean(), - id="redundant-column", - ), - ], -) -def test_summarize_multiple_series(agg_fn, expected_fn): - """Test Summarize.agg operating on many Series.""" - - aggcontext = Summarize() - - args = [df["v1"], df["v2"]] - result = aggcontext.agg(args[0], agg_fn, *args[1:]) - - expected = expected_fn(df) - - assert result == expected - - -@pytest.mark.parametrize( - "param", - [ - ( - pd.Series([True, True, True, True]), - pd.Series([1.0, 2.0, 2.0, 3.0]), - ), - ( - pd.Series([False, True, True, False]), - pd.Series([np.NaN, 2.0, 2.0, np.NaN]), - ), - ], -) -def test_window_agg_udf(param): - """Test passing custom window indices for window aggregation.""" - - mask, expected = param - - grouped_data = df.sort_values("id").groupby("id")["v1"] - result_index = grouped_data.obj.index - - window_lower_indices = pd.Series([0, 0, 2, 2]) - window_upper_indices = pd.Series([1, 2, 3, 4]) - - result = window_agg_udf( - grouped_data, - lambda s: s.mean(), - window_lower_indices, - window_upper_indices, - mask, - result_index, - dtype="float", - max_lookback=None, - ) - - expected.index = grouped_data.obj.index - - tm.assert_series_equal(result, expected) - - -def test_window_agg_udf_different_freq(): - """Test that window_agg_udf works when the window series and data series - have different frequencies.""" - - time = pd.Series([pd.Timestamp("20200101"), pd.Timestamp("20200201")]) - data = pd.Series([1, 2, 3, 4, 5, 6]) - window_lower_indices = pd.Series([0, 4]) - window_upper_indices = pd.Series([5, 7]) - mask = pd.Series([True, True]) - result_index = time.index - - result = window_agg_udf( - data, - lambda s: s.mean(), - window_lower_indices, - window_upper_indices, - mask, - result_index, - "float", - None, - ) - - expected = pd.Series([data.iloc[0:5].mean(), data.iloc[4:7].mean()]) - - tm.assert_series_equal(result, expected) diff --git a/ibis/backends/pandas/tests/execution/test_arrays.py b/ibis/backends/pandas/tests/test_arrays.py similarity 
index 96% rename from ibis/backends/pandas/tests/execution/test_arrays.py rename to ibis/backends/pandas/tests/test_arrays.py index 00e873715224a..98d1bb6fcd8df 100644 --- a/ibis/backends/pandas/tests/execution/test_arrays.py +++ b/ibis/backends/pandas/tests/test_arrays.py @@ -36,6 +36,13 @@ def test_array_length(t): tm.assert_frame_equal(result, expected) +def test_array_slice_using_column(t): + expr = t.array_of_int64[t.plain_int64 :] + result = expr.execute() + expected = pd.Series([[2], [], []]) + tm.assert_series_equal(result, expected) + + def test_array_length_scalar(client): raw_value = np.array([1, 2, 4]) value = ibis.array(raw_value) diff --git a/ibis/backends/pandas/tests/execution/test_cast.py b/ibis/backends/pandas/tests/test_cast.py similarity index 80% rename from ibis/backends/pandas/tests/execution/test_cast.py rename to ibis/backends/pandas/tests/test_cast.py index bc2d8a60f9745..7ca38a6752614 100644 --- a/ibis/backends/pandas/tests/execution/test_cast.py +++ b/ibis/backends/pandas/tests/test_cast.py @@ -5,14 +5,10 @@ import numpy as np import pandas as pd import pytest -import pytz -from pytest import param import ibis import ibis.expr.datatypes as dt -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.common.exceptions import OperationNotDefinedError TIMESTAMP = "2022-03-13 06:59:10.467417" @@ -63,7 +59,9 @@ def test_cast_array(t, from_, to, expected): # One of the arrays in the Series res = result[0] assert isinstance(res, list) - assert [ibis.literal(v).type() for v in res] == [expected] * len(res) + + for v in result: + assert v == [dt.normalize(expected, x) for x in v] @pytest.mark.parametrize( @@ -71,7 +69,7 @@ def test_cast_array(t, from_, to, expected): [ ("string", "object"), ("int64", "int64"), - param("double", "float64", marks=pytest.mark.xfail(raises=TypeError)), + ("double", "float64"), ( dt.Timestamp("America/Los_Angeles"), "datetime64[ns, America/Los_Angeles]", @@ -97,22 +95,18 @@ def test_cast_timestamp_column(t, df, column, to, expected): [ ("string", str), ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - param( - "double", - float, - marks=pytest.mark.xfail(raises=OperationNotDefinedError), - ), + ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), ( dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz=pytz.timezone("America/Los_Angeles")), + lambda x: x.tz_localize(tz="America/Los_Angeles"), ), ], ) -def test_cast_timestamp_scalar_naive(to, expected): +def test_cast_timestamp_scalar_naive(client, to, expected): literal_expr = ibis.literal(pd.Timestamp(TIMESTAMP)) value = literal_expr.cast(to) - result = execute(value.op()) - raw = execute(literal_expr.op()) + result = client.execute(value) + raw = client.execute(literal_expr) assert result == expected(raw) @@ -121,23 +115,19 @@ def test_cast_timestamp_scalar_naive(to, expected): [ ("string", str), ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - param( - "double", - float, - marks=pytest.mark.xfail(raises=OperationNotDefinedError), - ), + ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), ( dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz=pytz.timezone("America/Los_Angeles")), + lambda x: x.astimezone(tz="America/Los_Angeles"), ), ], ) @pytest.mark.parametrize("tz", ["UTC", "America/New_York"]) -def test_cast_timestamp_scalar(to, expected, tz): +def test_cast_timestamp_scalar(client, to, expected, tz): literal_expr = 
ibis.literal(pd.Timestamp(TIMESTAMP).tz_localize(tz)) value = literal_expr.cast(to) - result = execute(value.op()) - raw = execute(literal_expr.op()) + result = client.execute(value) + raw = client.execute(literal_expr) assert result == expected(raw) @@ -158,7 +148,7 @@ def test_cast_date(t, df, column): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("type", [dt.Decimal(9, 0), dt.Decimal(12, 3)]) +@pytest.mark.parametrize("type", [dt.Decimal(9, 2), dt.Decimal(12, 3)]) def test_cast_to_decimal(t, df, type): expr = t.float64_as_strings.cast(type) result = expr.execute() diff --git a/ibis/backends/pandas/tests/test_core.py b/ibis/backends/pandas/tests/test_core.py index eb980c6cf7e93..45e3a3a02b943 100644 --- a/ibis/backends/pandas/tests/test_core.py +++ b/ibis/backends/pandas/tests/test_core.py @@ -6,11 +6,7 @@ import ibis import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope from ibis.backends.pandas import Backend -from ibis.backends.pandas.dispatch import post_execute, pre_execute -from ibis.backends.pandas.execution import execute @pytest.fixture @@ -50,59 +46,24 @@ def test_from_dataframe(dataframe, ibis_table, core_client): tm.assert_frame_equal(result, expected) -def test_pre_execute_basic(): - """Test that pre_execute has intercepted execution and provided its own - scope dict.""" - - @pre_execute.register(ops.Add) - def pre_execute_test(op, *clients, scope=None, **kwargs): - return Scope({op: 4}, None) - - one = ibis.literal(1) - expr = one + one - result = execute(expr.op()) - assert result == 4 - - del pre_execute.funcs[(ops.Add,)] - pre_execute.reorder() - pre_execute._cache.clear() - - def test_execute_parameter_only(): param = ibis.param("int64") - result = execute(param.op(), params={param.op(): 42}) + con = ibis.pandas.connect() + result = con.execute(param, params={param.op(): 42}) assert result == 42 def test_missing_data_sources(): - t = ibis.table([("a", "string")]) + t = ibis.table([("a", "string")], name="t") expr = t.a.length() + con = ibis.pandas.connect() with pytest.raises(com.UnboundExpressionError): - execute(expr.op()) - - -def test_post_execute_called_on_joins(dataframe, core_client, ibis_table): - count = [0] - - @post_execute.register(ops.InnerJoin, pd.DataFrame) - def tmp_left_join_exe(op, lhs, **kwargs): - count[0] += 1 - return lhs - - left = ibis_table - right = left.view() - join = left.join(right, "plain_strings")[left.plain_int64] - result = join.execute() - assert result is not None - assert not result.empty - assert count[0] == 1 - - -def test_scope_look_up(): - # test if scope could lookup items properly - scope = Scope() - one_day = ibis.interval(days=1).op() - one_hour = ibis.interval(hours=1).op() - scope = scope.merge_scope(Scope({one_day: 1}, None)) - assert scope.get_value(one_hour) is None - assert scope.get_value(one_day) is not None + con.execute(expr) + + +def test_unbound_table_execution(): + t = ibis.table([("a", "string")], name="t") + expr = t.a.length() + con = ibis.pandas.connect({"t": pd.DataFrame({"a": ["a", "ab", "abc"]})}) + result = con.execute(expr) + assert result.tolist() == [1, 2, 3] diff --git a/ibis/backends/pandas/tests/test_dispatcher.py b/ibis/backends/pandas/tests/test_dispatcher.py deleted file mode 100644 index 27916fd112e08..0000000000000 --- a/ibis/backends/pandas/tests/test_dispatcher.py +++ /dev/null @@ -1,143 +0,0 @@ -from __future__ import annotations - -import pytest -from multipledispatch import Dispatcher - -from 
ibis.backends.pandas.dispatcher import TwoLevelDispatcher - - -class A1: - pass - - -class A2(A1): - pass - - -class A3(A2): - pass - - -class B1: - pass - - -class B2(B1): - pass - - -class B3(B2): - pass - - -@pytest.fixture -def foo_dispatchers(): - foo = TwoLevelDispatcher("foo", doc="Test dispatcher foo") - foo_m = Dispatcher("foo_m", doc="Control dispatcher foo_m") - - @foo.register(A1, B1) - @foo_m.register(A1, B1) - def foo0(x, y): - return 0 - - @foo.register(A1, B2) - @foo_m.register(A1, B2) - def foo1(x, y): - return 1 - - @foo.register(A2, B1) - @foo_m.register(A2, B1) - def foo2(x, y): - return 2 - - @foo.register(A2, B2) - @foo_m.register(A2, B2) - def foo3(x, y): - return 3 - - @foo.register( - (A1, A2), - ) - @foo_m.register( - (A1, A2), - ) - def foo4(x): - return 4 - - return foo, foo_m - - -@pytest.fixture -def foo(foo_dispatchers): - return foo_dispatchers[0] - - -@pytest.fixture -def foo_m(foo_dispatchers): - return foo_dispatchers[1] - - -def test_cache(foo, mocker): - """Test that cache is properly set after calling with args.""" - - spy = mocker.spy(foo, "dispatch") - a1, b1 = A1(), B1() - - assert (A1, B1) not in foo._cache - foo(a1, b1) - assert (A1, B1) in foo._cache - foo(a1, b1) - spy.assert_called_once_with(A1, B1) - - -def test_dispatch(foo, mocker): - """Test that calling dispatcher with a signature that is registered does - not trigger a linear search through dispatch_iter.""" - - spy = mocker.spy(foo, "dispatch_iter") - - # This should not trigger a linear search - foo(A1(), B1()) - assert not spy.called, ( - "Calling dispatcher with registered signature should " - "not trigger linear search" - ) - - foo(A3(), B3()) - spy.assert_called_once_with(A3, B3) - - -@pytest.mark.parametrize( - "args", - [ - (A1(), B1()), - (A1(), B2()), - (A1(), B3()), - (A2(), B1()), - (A2(), B2()), - (A2(), B3()), - (A3(), B1()), - (A3(), B2()), - (A3(), B3()), - (A1(),), - (A2(),), - (A3(),), - ], -) -def test_registered(foo_dispatchers, args): - foo, foo_m = foo_dispatchers - assert foo(*args) == foo_m(*args) - - -def test_ordering(foo, foo_m): - assert foo.ordering == foo_m.ordering - - -def test_funcs(foo, foo_m): - assert foo.funcs == foo_m.funcs - - -@pytest.mark.parametrize("args", [(B1(),), (B2(),), (A1(), A1()), (A1(), A2(), A3())]) -def test_unregistered(foo, args): - with pytest.raises(NotImplementedError, match="Could not find signature for foo.*"): - foo(*args) diff --git a/ibis/backends/pandas/tests/execution/test_functions.py b/ibis/backends/pandas/tests/test_functions.py similarity index 92% rename from ibis/backends/pandas/tests/execution/test_functions.py rename to ibis/backends/pandas/tests/test_functions.py index 2b3851675858e..9ef36b23ffb69 100644 --- a/ibis/backends/pandas/tests/execution/test_functions.py +++ b/ibis/backends/pandas/tests/test_functions.py @@ -13,7 +13,6 @@ import ibis import ibis.expr.datatypes as dt -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.backends.pandas.udf import udf @@ -74,7 +73,6 @@ def wrapper(*args, **kwargs): param( methodcaller("floor"), lambda x: decimal.Decimal(math.floor(x)), id="floor" ), - param(methodcaller("exp"), methodcaller("exp"), id="exp"), param( methodcaller("sign"), lambda x: x if not x else decimal.Decimal(1).copy_sign(x), @@ -97,19 +95,21 @@ def wrapper(*args, **kwargs): ) def test_math_functions_decimal(t, df, ibis_func, pandas_func): dtype = dt.Decimal(12, 3) - expr = ibis_func(t.float64_as_strings.cast(dtype)) - result = 
expr.execute() context = decimal.Context(prec=dtype.precision) - expected = df.float64_as_strings.apply( - lambda x: context.create_decimal(x).quantize( - decimal.Decimal( - f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}" - ) + + def normalize(x): + x = context.create_decimal(x) + p = decimal.Decimal( + f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}" ) - ).apply(pandas_func) + return x.quantize(p) + + expr = ibis_func(t.float64_as_strings.cast(dtype)) + result = expr.execute() - result[result.apply(math.isnan)] = -99999 - expected[expected.apply(math.isnan)] = -99999 + expected = ( + df.float64_as_strings.apply(normalize).apply(pandas_func).apply(normalize) + ) tm.assert_series_equal(result, expected.astype(expr.type().to_pandas())) @@ -221,10 +221,11 @@ def my_func(x, _): return x df = pd.DataFrame({"left": [left], "right": [right]}) - table = ibis.pandas.connect().from_dataframe(df) + con = ibis.pandas.connect() + table = con.from_dataframe(df) expr = my_func(table.left, table.right) - result = execute(expr.op()) + result = con.execute(expr) assert isinstance(result, pd.Series) result = result.tolist() @@ -238,8 +239,8 @@ def test_ifelse_returning_bool(): true = ibis.literal(True) false = ibis.literal(False) expr = ibis.ifelse(one + one == two, true, false) - result = execute(expr.op()) - assert result is True + result = ibis.pandas.connect().execute(expr) + assert result is True or result is np.True_ @pytest.mark.parametrize( @@ -261,7 +262,7 @@ def func(x): df = pd.DataFrame({"col": [value]}) table = ibis.pandas.connect().from_dataframe(df) - result = execute(table.col.op()) + result = table.col.execute() assert isinstance(result, pd.Series) result = result.tolist() diff --git a/ibis/backends/pandas/tests/test_helpers.py b/ibis/backends/pandas/tests/test_helpers.py new file mode 100644 index 0000000000000..4814a0d853763 --- /dev/null +++ b/ibis/backends/pandas/tests/test_helpers.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import pytest + +from ibis.backends.pandas.helpers import RowsFrame + +lst = list(range(10)) + + +@pytest.mark.parametrize( + ("ix", "start", "end", "expected"), + [ + (0, None, None, lst), + (0, 0, None, lst), + (0, None, 0, [0]), + (0, 0, 0, [0]), + (0, 0, 1, [0, 1]), + (0, 1, 1, [1]), + (0, 1, 2, [1, 2]), + (0, 1, None, lst[1:]), + (0, None, 1, [0, 1]), + (0, -1, None, lst), + (0, None, -1, []), + (0, -1, -1, []), + (0, -2, -1, []), + (0, -2, None, lst), + (0, None, -2, []), + (0, -1, 1, [0, 1]), + (0, 1, -1, []), + (0, -1, 2, [0, 1, 2]), + (1, None, None, lst), + (1, 0, None, lst[1:]), + (1, None, 0, [0, 1]), + (1, 0, 0, [1]), + (1, 0, 1, [1, 2]), + (1, 1, 1, [2]), + (1, 1, 2, [2, 3]), + (1, 1, None, lst[2:]), + (1, None, 1, [0, 1, 2]), + (1, -1, None, lst), + (1, None, -1, [0]), + (1, -1, -1, [0]), + (1, -2, -1, [0]), + (1, -2, None, lst), + (1, None, -2, []), + (1, -1, 1, [0, 1, 2]), + (1, 1, -1, []), + (1, -1, 2, [0, 1, 2, 3]), + (2, None, None, lst), + (2, 0, None, lst[2:]), + (2, None, 0, [0, 1, 2]), + (2, 0, 0, [2]), + (2, 0, 1, [2, 3]), + (2, 1, 1, [3]), + (2, 1, 2, [3, 4]), + (2, 1, None, lst[3:]), + (2, None, 1, [0, 1, 2, 3]), + (2, -1, None, lst[1:]), + (2, None, -1, [0, 1]), + (2, -1, -1, [1]), + (2, -2, -1, [0, 1]), + (2, -2, None, lst), + (2, None, -2, [0]), + (2, -1, 1, [1, 2, 3]), + (2, 1, -1, []), + (2, -1, 2, [1, 2, 3, 4]), + (3, None, None, lst), + ], +) +def test_rows_frame_adjustment(ix, start, end, expected): + start_index, end_index = RowsFrame.adjust(len(lst), ix, start, end) + assert 
lst[start_index:end_index] == expected diff --git a/ibis/backends/pandas/tests/execution/test_join.py b/ibis/backends/pandas/tests/test_join.py similarity index 89% rename from ibis/backends/pandas/tests/execution/test_join.py rename to ibis/backends/pandas/tests/test_join.py index 8fd990ea86e17..a9acaad3ed6ea 100644 --- a/ibis/backends/pandas/tests/execution/test_join.py +++ b/ibis/backends/pandas/tests/test_join.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pandas as pd import pandas.testing as tm import pytest @@ -57,9 +58,26 @@ def test_join_with_multiple_predicates(how, left, right, df1, df2): ] result = expr.execute() expected = pd.merge( - df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] + df1, + df2, + how=how, + left_on=["key", "key2"], + right_on=["key", "key3"], + suffixes=("_left", "_right"), ).reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) + + expected_columns = ["key", "value", "key2", "key3", "other_value"] + expected = expected[expected_columns] + if how == "right": + # the ibis expression references the `key` column from the left table + # which is not present in the result of the right join, but pandas + # includes the column from the right table + expected["key"] = pd.Series([np.nan, np.nan, np.nan], dtype=object) + elif how == "outer": + expected["key"] = pd.Series(["a", "b", "c", "d", np.nan, np.nan], dtype=object) + + assert list(result.columns) == expected_columns + tm.assert_frame_equal(result, expected) @mutating_join_type @@ -70,6 +88,12 @@ def test_join_with_multiple_predicates_written_as_one(how, left, right, df1, df2 expected = pd.merge( df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] ).reset_index(drop=True) + + if how == "right": + expected["key"] = pd.Series([np.nan, np.nan], dtype=object) + elif how == "outer": + expected["key"] = pd.Series(["a", "b", "c", "d", np.nan, np.nan], dtype=object) + tm.assert_frame_equal(result[expected.columns], expected) @@ -270,7 +294,9 @@ def test_asof_join(time_left, time_right, time_df1, time_df2): def test_asof_join_predicate(time_left, time_right, time_df1, time_df2): expr = time_left.asof_join(time_right, time_left.time == time_right.time) result = expr.execute() - expected = pd.merge_asof(time_df1, time_df2, on="time") + expected = pd.merge_asof( + time_df1, time_df2, on="time", direction="nearest", allow_exact_matches=True + ) tm.assert_frame_equal(result[expected.columns], expected) with pytest.raises(AssertionError): tm.assert_series_equal(result["time"], result["time_right"]) @@ -281,13 +307,10 @@ def test_keyed_asof_join( time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 ): expr = time_keyed_left.asof_join(time_keyed_right, "time", by="key") + expr = expr.select(time_keyed_left, time_keyed_right.other_value) result = expr.execute() expected = pd.merge_asof(time_keyed_df1, time_keyed_df2, on="time", by="key") tm.assert_frame_equal(result[expected.columns], expected) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["time"], result["time_right"]) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["key"], result["key_right"]) @merge_asof_minversion @@ -327,7 +350,7 @@ def test_asof_join_overlapping_non_predicate( time_keyed_df2.assign(collide=time_keyed_df2["key"] + time_keyed_df2["other_value"]) expr = time_keyed_left.asof_join( - time_keyed_right, predicates=[("time", "time")], by=[("key", "key")] + time_keyed_right, on=("time", "time"), 
predicates=[("key", "key")] ) result = expr.execute() expected = pd.merge_asof( @@ -595,3 +618,33 @@ def test_multijoin(tracts_df, fields_df, harvest_df): ) tm.assert_frame_equal(result, expected) + + +def test_chain_join(): + test_df1 = pd.DataFrame({"id": ["1", "1"], "value": ["a", "a"]}) + test_df2 = pd.DataFrame({"id": ["1", "1"], "value": ["z", "z"]}) + test_df3 = pd.DataFrame({"id": ["1", "1"], "value": ["z1", "z1"]}) + + conn = ibis.pandas.connect({"df1": test_df1, "df2": test_df2, "df3": test_df3}) + + t1 = conn.table("df1") + t2 = conn.table("df2") + t3 = conn.table("df3") + + expr = ( + t1.join(t2, t1.id == t2.id) + .join(t3, t1.id == t3.id) + .select(t1.id, t1.value, t2.value.name("value2"), t3.value.name("value3")) + ) + result = expr.execute() + + n = len(test_df1) * len(test_df2) * len(test_df3) + expected = pd.DataFrame( + { + "id": ["1"] * n, + "value": ["a"] * n, + "value2": ["z"] * n, + "value3": ["z1"] * n, + } + ) + tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/pandas/tests/execution/test_maps.py b/ibis/backends/pandas/tests/test_maps.py similarity index 100% rename from ibis/backends/pandas/tests/execution/test_maps.py rename to ibis/backends/pandas/tests/test_maps.py diff --git a/ibis/backends/pandas/tests/execution/test_operations.py b/ibis/backends/pandas/tests/test_operations.py similarity index 99% rename from ibis/backends/pandas/tests/execution/test_operations.py rename to ibis/backends/pandas/tests/test_operations.py index 54877d1ce4d0a..3d6e78d9d2c69 100644 --- a/ibis/backends/pandas/tests/execution/test_operations.py +++ b/ibis/backends/pandas/tests/test_operations.py @@ -13,7 +13,6 @@ import ibis.expr.datatypes as dt from ibis import _ from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm @@ -183,7 +182,6 @@ def test_group_by_rename_key(t, df): expr = t.group_by(t.dup_strings.name("foo")).aggregate( dup_string_count=t.dup_strings.count() ) - assert "foo" in expr.schema() result = expr.execute() assert "foo" in result.columns @@ -281,7 +279,7 @@ def test_nullif_zero(t, df, column): param( lambda t: ibis.literal("a"), lambda t: t.dup_strings, - lambda _: pd.Series(["d", np.nan, "d"], name="dup_strings"), + lambda _: pd.Series(["a", np.nan, "a"], name="dup_strings"), tm.assert_series_equal, id="literal_series", ), @@ -289,7 +287,7 @@ def test_nullif_zero(t, df, column): ) def test_nullif(t, df, left, right, expected, compare): expr = left(t).nullif(right(t)) - result = execute(expr.op()) + result = Backend().execute(expr) compare(result, expected(df)) diff --git a/ibis/backends/pandas/tests/execution/test_strings.py b/ibis/backends/pandas/tests/test_strings.py similarity index 89% rename from ibis/backends/pandas/tests/execution/test_strings.py rename to ibis/backends/pandas/tests/test_strings.py index 27f603903cd61..e583cb53437ec 100644 --- a/ibis/backends/pandas/tests/execution/test_strings.py +++ b/ibis/backends/pandas/tests/test_strings.py @@ -7,7 +7,9 @@ import pytest from pytest import param -from ibis.backends.pandas.execution.strings import sql_like_to_regex +import ibis +from ibis.backends.pandas import Backend +from ibis.backends.pandas.kernels import sql_like_to_regex @pytest.mark.parametrize( @@ -165,3 +167,23 @@ def test_translate( table = str.maketrans(from_str, to_str) series = df.strings_with_space.str.translate(table) tm.assert_series_equal(result, series, check_names=False) + + +def test_string_repeat(t): + int_col = 
t.plain_int64 + int_lit = ibis.literal(3) + string_col = t.strings_with_space + string_lit = ibis.literal("abc") + + expr1 = string_col.repeat(int_col) + expr2 = string_col.repeat(int_lit) + expr3 = string_lit.repeat(int_col) + expr4 = string_lit.repeat(int_lit) + + con = Backend() + con.execute(expr1) + con.execute(expr2) + con.execute(expr3) + con.execute(expr4) + + # TODO(kszucs): add assertions or rather parametrize the tests above diff --git a/ibis/backends/pandas/tests/execution/test_structs.py b/ibis/backends/pandas/tests/test_structs.py similarity index 95% rename from ibis/backends/pandas/tests/execution/test_structs.py rename to ibis/backends/pandas/tests/test_structs.py index 203d3e961b196..bf9647f73a62d 100644 --- a/ibis/backends/pandas/tests/execution/test_structs.py +++ b/ibis/backends/pandas/tests/test_structs.py @@ -8,7 +8,6 @@ import ibis import ibis.expr.datatypes as dt from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm @@ -48,13 +47,14 @@ def test_struct_field_literal(value): assert struct.type() == dt.Struct.from_tuples( [("fruit", dt.string), ("weight", dt.int8)] ) + con = ibis.pandas.connect() expr = struct["fruit"] - result = execute(expr.op()) + result = con.execute(expr) assert result == "pear" expr = struct["weight"] - result = execute(expr.op()) + result = con.execute(expr) assert result == 0 diff --git a/ibis/backends/pandas/tests/execution/test_temporal.py b/ibis/backends/pandas/tests/test_temporal.py similarity index 98% rename from ibis/backends/pandas/tests/execution/test_temporal.py rename to ibis/backends/pandas/tests/test_temporal.py index cd9a1e98384b1..f8cf670e99f14 100644 --- a/ibis/backends/pandas/tests/execution/test_temporal.py +++ b/ibis/backends/pandas/tests/test_temporal.py @@ -9,9 +9,9 @@ from packaging.version import parse as parse_version from pytest import param +import ibis from ibis import literal as L from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.expr import datatypes as dt @@ -44,6 +44,7 @@ ], ) def test_timestamp_functions(case_func, expected_func): + con = ibis.pandas.connect() v = L("2015-09-01 14:48:05.359").cast("timestamp") vt = datetime.datetime( year=2015, @@ -56,7 +57,7 @@ def test_timestamp_functions(case_func, expected_func): ) result = case_func(v) expected = expected_func(vt) - assert execute(result.op()) == expected + assert con.execute(result) == expected @pytest.mark.parametrize( diff --git a/ibis/backends/pandas/tests/test_udf.py b/ibis/backends/pandas/tests/test_udf.py index f310db2174133..df6917aa2b25b 100644 --- a/ibis/backends/pandas/tests/test_udf.py +++ b/ibis/backends/pandas/tests/test_udf.py @@ -364,26 +364,28 @@ def my_wm(v, w): tm.assert_frame_equal(result, expected) -def test_udaf_window_nan(): - df = pd.DataFrame( - { - "a": np.arange(10, dtype=float), - "b": [3.0, np.NaN] * 5, - "key": list("ddeefffggh"), - } - ) - con = Backend().connect({"df": df}) - t = con.table("df") - window = ibis.trailing_window(2, order_by="a", group_by="key") - expr = t.mutate(rolled=my_mean(t.b).over(window)) - result = expr.execute().sort_values(["key", "a"]) - expected = df.sort_values(["key", "a"]).assign( - rolled=lambda d: d.groupby("key") - .b.rolling(3, min_periods=1) - .apply(lambda x: x.mean(), raw=True) - .reset_index(level=0, drop=True) - ) - tm.assert_frame_equal(result, expected) +# TODO(kszucs): 
revisit this, duckdb produces the same result as the pandas +# backend, but the expected result is different +# def test_udaf_window_nan(): +# df = pd.DataFrame( +# { +# "a": np.arange(10, dtype=float), +# "b": [3.0, np.NaN] * 5, +# "key": list("ddeefffggh"), +# } +# ) +# con = Backend().connect({"df": df}) +# t = con.table("df") +# window = ibis.trailing_window(2, order_by="a", group_by="key") +# expr = t.mutate(rolled=my_mean(t.b).over(window)) +# result = expr.execute().sort_values(["key", "a"]) +# expected = df.sort_values(["key", "a"]).assign( +# rolled=lambda d: d.groupby("key") +# .b.rolling(3, min_periods=1) +# .apply(lambda x: x.mean(), raw=True) +# .reset_index(level=0, drop=True) +# ) +# tm.assert_frame_equal(result, expected) @pytest.fixture(params=[[0.25, 0.75], [0.01, 0.99]]) diff --git a/ibis/backends/pandas/tests/execution/test_window.py b/ibis/backends/pandas/tests/test_window.py similarity index 93% rename from ibis/backends/pandas/tests/execution/test_window.py rename to ibis/backends/pandas/tests/test_window.py index 905dd833c7755..0f46a4a987b43 100644 --- a/ibis/backends/pandas/tests/execution/test_window.py +++ b/ibis/backends/pandas/tests/test_window.py @@ -11,11 +11,7 @@ import ibis import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope from ibis.backends.pandas import Backend -from ibis.backends.pandas.dispatch import pre_execute -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.common.annotations import ValidationError from ibis.legacy.udf.vectorized import reduction @@ -51,58 +47,63 @@ def range_window(): @default @row_offset def test_lead(t, df, row_offset, default, row_window): + con = ibis.pandas.connect() expr = t.dup_strings.lead(row_offset, default=default).over(row_window) result = expr.execute() - expected = df.dup_strings.shift(execute((-row_offset).op())) + expected = df.dup_strings.shift(con.execute(-row_offset)) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @row_offset def test_lag(t, df, row_offset, default, row_window): + con = ibis.pandas.connect() expr = t.dup_strings.lag(row_offset, default=default).over(row_window) result = expr.execute() - expected = df.dup_strings.shift(execute(row_offset.op())) + expected = df.dup_strings.shift(con.execute(row_offset)) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @range_offset def test_lead_delta(t, df, range_offset, default, range_window): + con = ibis.pandas.connect() expr = t.dup_strings.lead(range_offset, default=default).over(range_window) result = expr.execute() expected = ( df[["plain_datetimes_naive", "dup_strings"]] .set_index("plain_datetimes_naive") .squeeze() - .shift(freq=execute((-range_offset).op())) + .shift(freq=con.execute(-range_offset)) .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @range_offset def test_lag_delta(t, df, range_offset, default, range_window): + con = ibis.pandas.connect() expr = t.dup_strings.lag(range_offset, 
default=default).over(range_window) result = expr.execute() + expected = ( df[["plain_datetimes_naive", "dup_strings"]] .set_index("plain_datetimes_naive") .squeeze() - .shift(freq=execute(range_offset.op())) + .shift(freq=con.execute(range_offset)) .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @@ -510,29 +511,6 @@ def test_window_with_mlb(): ) -def test_window_has_pre_execute_scope(): - called = [0] - - @pre_execute.register(ops.Lag, Backend) - def test_pre_execute(op, client, **kwargs): - called[0] += 1 - return Scope() - - data = {"key": list("abc"), "value": [1, 2, 3], "dup": list("ggh")} - df = pd.DataFrame(data, columns=["key", "value", "dup"]) - client = ibis.pandas.connect({"df": df}) - t = client.table("df") - window = ibis.window(order_by="value") - expr = t.key.lag(1).over(window).name("foo") - result = expr.execute() - assert result is not None - - # once in window op at the top to pickup any scope changes before computing - # twice in window op when calling execute on the ops.Lag node at the - # beginning of execute and once before the actual computation - assert called[0] == 3 - - def test_window_grouping_key_has_scope(t, df): param = ibis.param(dt.string) window = ibis.window(group_by=t.dup_strings + param) diff --git a/ibis/backends/pandas/trace.py b/ibis/backends/pandas/trace.py deleted file mode 100644 index 2350e89579302..0000000000000 --- a/ibis/backends/pandas/trace.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Module that adds tracing to pandas execution. - -With tracing enabled, this module will log time and call stack information of -the executed expression. Call stack information is presented with indentation -level. 
- -For example: - -import pandas as pd -import logging - -import ibis.expr.datatypes as dt -import ibis.backends.pandas -from ibis.legacy.udf.vectorized import elementwise -from ibis.backends.pandas import trace - -logging.basicConfig() -trace.enable() - -df = pd.DataFrame( - { - 'a': [1, 2, 3] - } -) - -con = ibis.pandas.connect({"table1": df}) - -@elementwise( - input_type=[dt.double], - output_type=dt.double -) -def add_one(v): - import time - time.sleep(5) - return v + 1 - -table = con.table("table1") -table = table.mutate(b=add_one(table['a'])) -table.execute() - -Output: - -DEBUG:ibis.backends.pandas.trace: main_execute Selection -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope Selection -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable -DEBUG:ibis.backends.pandas.trace: execute_database_table_client PandasTable -DEBUG:ibis.backends.pandas.trace: execute_database_table_client PandasTable 0:00:00.000085 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable 0:00:00.000362 -DEBUG:ibis.backends.pandas.trace: execute_selection_dataframe Selection -DEBUG:ibis.backends.pandas.trace: main_execute ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope TableColumn -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable 0:00:00.000061 -DEBUG:ibis.backends.pandas.trace: execute_table_column_df_or_df_groupby TableColumn -DEBUG:ibis.backends.pandas.trace: execute_table_column_df_or_df_groupby TableColumn 0:00:00.000304 # noqa: E501 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope TableColumn 0:00:00.000584 -DEBUG:ibis.backends.pandas.trace: execute_udf_node ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_udf_node ElementWiseVectorizedUDF 0:00:05.019173 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope ElementWiseVectorizedUDF 0:00:05.052604 # noqa: E501 -DEBUG:ibis.backends.pandas.trace: main_execute ElementWiseVectorizedUDF 0:00:05.052819 -DEBUG:ibis.backends.pandas.trace: execute_selection_dataframe Selection 0:00:05.054894 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope Selection 0:00:05.055662 -DEBUG:ibis.backends.pandas.trace: main_execute Selection 0:00:05.056556 -""" - -from __future__ import annotations - -import functools -import logging -import traceback -from datetime import datetime - -import ibis -from ibis.backends.pandas.dispatcher import TwoLevelDispatcher -from ibis.config import options -from ibis.expr import types as ir - -_logger = logging.getLogger("ibis.backends.pandas.trace") - -# A list of funcs that is traced -_trace_funcs = set() - - -def enable(): - """Enable tracing.""" - if options.pandas is None: - # pandas options haven't been registered yet - force module __getattr__ - ibis.pandas # noqa: B018 - options.pandas.enable_trace = True - logging.getLogger("ibis.backends.pandas.trace").setLevel(logging.DEBUG) - - -def _log_trace(func, start=None): - level = 0 - current_frame = None - - # Increase the current level for each traced function in the stackframe - # This way we can visualize the call stack. 
- for frame, _ in traceback.walk_stack(None): - current_frame = current_frame if current_frame is not None else frame - func_name = frame.f_code.co_name - if func_name in _trace_funcs: - level += 1 - - # We can assume we have 'args' because we only call _log_trace inside - # trace or TraceDispatcher.register - current_op = current_frame.f_locals["args"][0] - - # If the first argument is a Expr, we print its op because it's more - # informative. - if isinstance(current_op, ir.Expr): - current_op = current_op.op() - - _logger.debug( - "%s %s %s %s", - " " * level, - func.__name__, - type(current_op).__qualname__, - f"{datetime.now() - start}" if start else "", - ) - - -def trace(func): - """Return a function decorator that wraps `func` with tracing.""" - _trace_funcs.add(func.__name__) - - @functools.wraps(func) - def traced_func(*args, **kwargs): - # Unfortunately, this function can be called before the `ibis.pandas` - # attribute has ever been accessed, which means the trace configuration - # option might never get registered and will raise an error. Accessing - # the pandas attribute here forces the option initialization - import ibis - - ibis.pandas # noqa: B018 - - if not options.pandas.enable_trace: - return func(*args, **kwargs) - else: - start = datetime.now() - _log_trace(func) - res = func(*args, **kwargs) - _log_trace(func, start) - return res - - return traced_func - - -class TraceTwoLevelDispatcher(TwoLevelDispatcher): - """A Dispatcher that also wraps the registered function with tracing.""" - - def __init__(self, name, doc=None): - super().__init__(name, doc) - - def register(self, *types, **kwargs): - """Register a function with this Dispatcher. - - The function will also be wrapped with tracing information. - """ - - def _(func): - trace_func = trace(func) - TwoLevelDispatcher.register(self, *types, **kwargs)(trace_func) - # return func instead trace_func here so that - # chained register didn't get wrapped multiple - # times - return func - - return _ diff --git a/ibis/backends/pandas/udf.py b/ibis/backends/pandas/udf.py index 561aca6987d62..3168d348f67d5 100644 --- a/ibis/backends/pandas/udf.py +++ b/ibis/backends/pandas/udf.py @@ -2,35 +2,7 @@ from __future__ import annotations -import itertools - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops import ibis.legacy.udf.vectorized -from ibis.backends.base import BaseBackend -from ibis.backends.pandas.aggcontext import Transform -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution.util import get_grouping - - -def create_gens_from_args_groupby(*args: tuple[SeriesGroupBy, ...]): - """Create generators for each of `args` for groupby UDAF. - - Returns a generator that outputs each group. 
- - Parameters - ---------- - *args - A tuple of group by objects - - Returns - ------- - Tuple[Generator] - Generators of group by data - """ - return ((data for _, data in arg) for arg in args) class udf: @@ -49,120 +21,3 @@ def reduction(input_type, output_type): def analytic(input_type, output_type): """Alias for ibis.legacy.udf.vectorized.analytic.""" return ibis.legacy.udf.vectorized.analytic(input_type, output_type) - - -@pre_execute.register(ops.ElementWiseVectorizedUDF) -@pre_execute.register(ops.ElementWiseVectorizedUDF, BaseBackend) -def pre_execute_elementwise_udf(op, *clients, scope=None, **kwargs): - """Register execution rules for elementwise UDFs.""" - input_type = op.input_type - - # definitions - - # Define an execution rule for elementwise operations on a - # grouped Series - nargs = len(input_type) - - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(SeriesGroupBy, nargs)) - ) - def execute_udf_node_groupby(op, *args, **kwargs): - func = op.func - - groupers = [ - grouper - for grouper in (getattr(arg, "grouper", None) for arg in args) - if grouper is not None - ] - - # all grouping keys must be identical - assert all(groupers[0] == grouper for grouper in groupers[1:]) - - # we're performing a scalar operation on grouped column, so - # perform the operation directly on the underlying Series - # and regroup after it's finished - args = [getattr(arg, "obj", arg) for arg in args] - groupings = get_grouping(groupers[0].groupings) - return func(*args).groupby(groupings, group_keys=False) - - # Define an execution rule for a simple elementwise Series - # function - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(pd.Series, nargs)) - ) - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(object, nargs)) - ) - def execute_udf_node(op, *args, cache=None, timecontext=None, **kwargs): - # We have rewritten op.func to be a closure enclosing - # the kwargs, and therefore, we do not need to pass - # kwargs here. This is true for all udf execution in this - # file. 
- # See ibis.legacy.udf.vectorized.UserDefinedFunction - - # prevent executing UDFs multiple times on different execution branches - try: - result = cache[(op, timecontext)] - except KeyError: - result = cache[(op, timecontext)] = op.func(*args) - - return result - - return scope - - -@pre_execute.register(ops.AnalyticVectorizedUDF) -@pre_execute.register(ops.AnalyticVectorizedUDF, BaseBackend) -@pre_execute.register(ops.ReductionVectorizedUDF) -@pre_execute.register(ops.ReductionVectorizedUDF, BaseBackend) -def pre_execute_analytic_and_reduction_udf(op, *clients, scope=None, **kwargs): - input_type = op.input_type - nargs = len(input_type) - - # An execution rule to handle analytic and reduction UDFs over - # 1) an ungrouped window, - # 2) an ungrouped Aggregate node, or - # 3) an ungrouped custom aggregation context - @execute_node.register(type(op), *(itertools.repeat(pd.Series, nargs))) - def execute_udaf_node_no_groupby(op, *args, aggcontext, **kwargs): - func = op.func - return aggcontext.agg(args[0], func, *args[1:]) - - # An execution rule to handle analytic and reduction UDFs over - # 1) a grouped window, - # 2) a grouped Aggregate node, or - # 3) a grouped custom aggregation context - @execute_node.register(type(op), *(itertools.repeat(SeriesGroupBy, nargs))) - def execute_udaf_node_groupby(op, *args, aggcontext, **kwargs): - func = op.func - if isinstance(aggcontext, Transform): - # We are aggregating over an unbounded (and GROUPED) window, - # which uses a Transform aggregation context. - # We need to do some pre-processing to func and args so that - # Transform can pull data out of the SeriesGroupBys in args. - - # Construct a generator that yields the next group of data - # for every argument excluding the first (pandas performs - # the iteration for the first argument) for each argument - # that is a SeriesGroupBy. - iters = create_gens_from_args_groupby(*args[1:]) - - # TODO: Unify calling convention here to be more like - # window - def aggregator(first, *rest): - # map(next, *rest) gets the inputs for the next group - # TODO: might be inefficient to do this on every call - return func(first, *map(next, rest)) - - return aggcontext.agg(args[0], aggregator, *iters) - else: - # We are either: - # 1) Aggregating over a bounded window, which uses a Window - # aggregation context - # 2) Aggregating over a custom aggregation context - # 3) Aggregating using an Aggregate node (with GROUPING), which - # uses a Summarize aggregation context - # No pre-processing to be done for any case. 
- return aggcontext.agg(args[0], func, *args[1:]) - - return scope diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 4b0a4f7cc056b..be97ad419d924 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -1022,7 +1022,7 @@ def test_quantile( id="covar_pop", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1042,7 +1042,7 @@ def test_quantile( id="covar_samp", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1062,7 +1062,7 @@ def test_quantile( id="corr_pop", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1092,7 +1092,7 @@ def test_quantile( id="corr_samp", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1132,7 +1132,7 @@ def test_quantile( id="covar_pop_bool", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1156,7 +1156,7 @@ def test_quantile( id="corr_pop_bool", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1325,9 +1325,6 @@ def test_string_quantile(alltypes, func): @pytest.mark.notimpl(["dask"], raises=(AssertionError, NotImplementedError, TypeError)) @pytest.mark.notyet(["polars"], raises=PolarsInvalidOperationError) @pytest.mark.notyet(["datafusion"], raises=Exception, reason="not supported upstream") -@pytest.mark.broken( - ["pandas"], raises=AssertionError, reason="possibly incorrect results" -) @pytest.mark.parametrize( "func", [ @@ -1686,8 +1683,8 @@ def test_grouped_case(backend, con): ["datafusion", "mssql", "polars", "exasol"], raises=com.OperationNotDefinedError ) @pytest.mark.broken( - ["dask", "pandas"], - reason="Dask and Pandas do not windowize this operation correctly", + ["dask"], + reason="Dask does not windowize this operation correctly", raises=AssertionError, ) @pytest.mark.notyet(["impala", "flink"], raises=com.UnsupportedOperationError) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index ea41cbb89956d..e2063b94354d4 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -342,6 +342,11 @@ def test_unnest_no_nulls(backend): raises=ValueError, reason="ValueError: Do not nest ARRAY types; ARRAY(basetype) handles multi-dimensional arrays of basetype", ) +@pytest.mark.broken( + ["pandas"], + raises=ValueError, + reason="all the input arrays must have same number of dimensions", +) def test_unnest_default_name(backend): array_types = backend.array_types df = array_types.execute() @@ -531,7 +536,7 @@ def test_array_filter(con, input, output): @builtin_array @pytest.mark.notimpl( - ["mssql", "pandas", "polars", "postgres"], + ["mssql", "polars", "postgres"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl(["dask"], raises=com.OperationNotDefinedError) @@ -588,7 +593,7 @@ def test_array_contains(backend, con): ) @builtin_array @pytest.mark.notimpl( - ["dask", "impala", "mssql", "pandas", "polars"], + ["dask", "impala", "mssql", "polars"], 
raises=com.OperationNotDefinedError, ) def test_array_position(backend, con, a, expected_array): @@ -602,7 +607,7 @@ def test_array_position(backend, con, a, expected_array): @builtin_array @pytest.mark.notimpl( - ["dask", "impala", "mssql", "pandas", "polars"], + ["dask", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( @@ -639,7 +644,7 @@ def test_array_remove(con, a): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "impala", "mssql", "pandas", "polars", "mysql"], + ["dask", "datafusion", "impala", "mssql", "polars", "mysql"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( @@ -693,7 +698,7 @@ def test_array_unique(con, input, expected): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "flink", "impala", "mssql", "pandas", "polars"], + ["dask", "datafusion", "flink", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( @@ -714,7 +719,7 @@ def test_array_sort(con): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "impala", "mssql", "pandas", "polars"], + ["dask", "datafusion", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize( @@ -978,7 +983,7 @@ def test_array_flatten(backend, flatten_data, column, expected): reason="range isn't implemented upstream", raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.parametrize("n", [-2, 0, 2]) def test_range_single_argument(con, n): expr = ibis.range(n) @@ -992,9 +997,7 @@ def test_range_single_argument(con, n): raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize("n", [-2, 0, 2]) -@pytest.mark.notimpl( - ["polars", "flink", "pandas", "dask"], raises=com.OperationNotDefinedError -) +@pytest.mark.notimpl(["polars", "flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.skip("risingwave") def test_range_single_argument_unnest(backend, con, n): expr = ibis.range(n).unnest() @@ -1026,7 +1029,7 @@ def test_range_single_argument_unnest(backend, con, n): reason="range and unnest aren't implemented upstream", raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) def test_range_start_stop_step(con, start, stop, step): expr = ibis.range(start, stop, step) result = con.execute(expr) @@ -1041,7 +1044,7 @@ def test_range_start_stop_step(con, start, stop, step): @pytest.mark.notyet( ["datafusion"], raises=com.OperationNotDefinedError, reason="not supported upstream" ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.never( ["risingwave"], raises=sa.exc.InternalError, @@ -1222,7 +1225,7 @@ def swap(token): ) @timestamp_range_tzinfos @pytest.mark.notimpl( - ["pandas", "dask", "flink", "datafusion"], raises=com.OperationNotDefinedError + ["dask", "flink", "datafusion"], raises=com.OperationNotDefinedError ) def test_timestamp_range(con, start, stop, step, freq, tzinfo): start = start.replace(tzinfo=tzinfo) @@ -1273,7 +1276,7 @@ def test_timestamp_range(con, start, stop, step, freq, tzinfo): ) @timestamp_range_tzinfos @pytest.mark.notimpl( - ["pandas", "dask", "flink", "datafusion"], raises=com.OperationNotDefinedError + ["dask", 
"flink", "datafusion"], raises=com.OperationNotDefinedError ) def test_timestamp_range_zero_step(con, start, stop, step, tzinfo): start = start.replace(tzinfo=tzinfo) @@ -1293,7 +1296,7 @@ def test_repr_timestamp_array(con, monkeypatch): @pytest.mark.notyet( - ["dask", "datafusion", "flink", "pandas", "polars"], + ["dask", "datafusion", "flink", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index e84a5eb97f02a..0e2d41fabdf1b 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -313,12 +313,14 @@ def test_filter(backend, alltypes, sorted_df, predicate_fn, expected_fn): "druid", "oracle", "exasol", + "pandas", ] ) @pytest.mark.never( ["flink"], reason="Flink engine does not support generic window clause with no order by", ) +# TODO(kszucs): this is not supported at the expression level def test_filter_with_window_op(backend, alltypes, sorted_df): sorted_alltypes = alltypes.order_by("id") table = sorted_alltypes @@ -1154,7 +1156,7 @@ def test_pivot_wider(backend): reason="backend doesn't implement window functions", ) @pytest.mark.notimpl( - ["pandas", "polars"], + ["polars"], raises=com.OperationNotDefinedError, reason="backend doesn't implement ops.WindowFunction", ) @@ -1232,7 +1234,7 @@ def test_distinct_on_keep(backend, on, keep): reason="backend doesn't implement window functions", ) @pytest.mark.notimpl( - ["pandas", "polars"], + ["polars"], raises=com.OperationNotDefinedError, reason="backend doesn't implement ops.WindowFunction", ) diff --git a/ibis/backends/tests/test_interactive.py b/ibis/backends/tests/test_interactive.py index bfa3f6adffe12..704e17019c6eb 100644 --- a/ibis/backends/tests/test_interactive.py +++ b/ibis/backends/tests/test_interactive.py @@ -33,6 +33,7 @@ def table(backend): return backend.functional_alltypes +@pytest.mark.notimpl(["pandas"]) def test_interactive_execute_on_repr(table, queries, snapshot): repr(table.bigint_col.sum()) snapshot.assert_match(queries[0], "out.sql") @@ -52,18 +53,21 @@ def test_repr_png_is_not_none_in_not_interactive(table): assert table._repr_png_() is not None +@pytest.mark.notimpl(["pandas"]) def test_default_limit(table, snapshot, queries): repr(table.select("id", "bool_col")) snapshot.assert_match(queries[0], "out.sql") +@pytest.mark.notimpl(["pandas"]) def test_respect_set_limit(table, snapshot, queries): repr(table.select("id", "bool_col").limit(10)) snapshot.assert_match(queries[0], "out.sql") +@pytest.mark.notimpl(["pandas"]) def test_disable_query_limit(table, snapshot, queries): assert ibis.options.sql.default_limit is None diff --git a/ibis/backends/tests/test_param.py b/ibis/backends/tests/test_param.py index 8266186481b23..b7aa81c43dd1f 100644 --- a/ibis/backends/tests/test_param.py +++ b/ibis/backends/tests/test_param.py @@ -65,9 +65,7 @@ def test_timestamp_accepts_date_literals(alltypes): assert expr.compile(params=params) is not None -@pytest.mark.notimpl( - ["dask", "impala", "pandas", "pyspark", "druid", "oracle", "exasol"] -) +@pytest.mark.notimpl(["dask", "impala", "pyspark", "druid", "oracle", "exasol"]) @pytest.mark.never( ["mysql", "sqlite", "mssql"], reason="backend will never implement array types" ) diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index d441b39896f2b..cde2dc86d1bc9 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -1098,7 +1098,7 @@ def 
test_no_conditional_percent_escape(con, expr): @pytest.mark.notimpl( - ["dask", "pandas", "mssql", "oracle", "exasol"], raises=com.OperationNotDefinedError + ["dask", "mssql", "oracle", "exasol"], raises=com.OperationNotDefinedError ) def test_non_match_regex_search_is_false(con): expr = ibis.literal("foo").re_search("bar") diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index baff2a018e181..4878ad46a2874 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -1028,7 +1028,6 @@ def convert_to_offset(x): "dask", "impala", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -1644,13 +1643,6 @@ def test_interval_add_cast_column(backend, alltypes, df): ), "%Y%m%d", marks=[ - pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError), - pytest.mark.notimpl( - [ - "pandas", - ], - raises=com.OperationNotDefinedError, - ), pytest.mark.notimpl( [ "pyspark", @@ -2254,7 +2246,7 @@ def test_time_literal(con, backend): @pytest.mark.broken( ["sqlite"], raises=AssertionError, reason="SQLite returns Timedelta from execution" ) -@pytest.mark.notimpl(["dask", "pandas"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["dask"], raises=com.OperationNotDefinedError) @pytest.mark.notyet(["oracle"], raises=sa.exc.DatabaseError) @pytest.mark.parametrize( "microsecond", diff --git a/ibis/backends/tests/test_timecontext.py b/ibis/backends/tests/test_timecontext.py index 72e78065640ec..50b181728d7e0 100644 --- a/ibis/backends/tests/test_timecontext.py +++ b/ibis/backends/tests/test_timecontext.py @@ -54,7 +54,7 @@ def filter_by_time_context(df, context): ) -@pytest.mark.notimpl(["dask", "duckdb"]) +@pytest.mark.notimpl(["dask", "duckdb", "pandas"]) @pytest.mark.notimpl( ["flink"], raises=com.OperationNotDefinedError, @@ -91,7 +91,7 @@ def test_context_adjustment_window_udf(backend, alltypes, context, window, monke backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl(["dask", "duckdb"]) +@pytest.mark.notimpl(["dask", "duckdb", "pandas"]) @pytest.mark.broken( # TODO (mehmet): Check with the team. 
["flink"], diff --git a/ibis/backends/tests/test_vectorized_udf.py b/ibis/backends/tests/test_vectorized_udf.py index f130b5b601545..c1c85326f52e5 100644 --- a/ibis/backends/tests/test_vectorized_udf.py +++ b/ibis/backends/tests/test_vectorized_udf.py @@ -570,7 +570,8 @@ def test_elementwise_udf_named_destruct(udf_alltypes): add_one_struct_udf = create_add_one_struct_udf( result_formatter=lambda v1, v2: (v1, v2) ) - with pytest.raises(com.IbisTypeError, match=r"Unable to infer"): + msg = "Duplicate column name 'new_struct' in result set" + with pytest.raises(com.IntegrityError, match=msg): udf_alltypes.mutate( new_struct=add_one_struct_udf(udf_alltypes["double_col"]).destructure() ) diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py index 28ae24cfd19c0..e7968831330f3 100644 --- a/ibis/backends/tests/test_window.py +++ b/ibis/backends/tests/test_window.py @@ -247,7 +247,6 @@ def calc_zscore(s): id="row_number", marks=[ pytest.mark.notimpl(["dask"], raises=NotImplementedError), - pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError), ], ), param( @@ -469,7 +468,6 @@ def test_ungrouped_bounded_expanding_window( ) @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["dask"], raises=NotImplementedError) -@pytest.mark.notimpl(["pandas"], raises=AssertionError) @pytest.mark.notimpl( ["flink"], raises=com.UnsupportedOperationError, @@ -652,7 +650,7 @@ def test_grouped_unbounded_window( ], ) @pytest.mark.broken(["snowflake"], raises=AssertionError) -@pytest.mark.broken(["dask", "pandas", "mssql"], raises=AssertionError) +@pytest.mark.broken(["dask", "mssql"], raises=AssertionError) @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( ["flink"], @@ -683,7 +681,7 @@ def test_simple_ungrouped_unbound_following_window( reason="OVER RANGE FOLLOWING windows are not supported in Flink yet", ) @pytest.mark.notimpl( - ["pandas", "dask"], + ["dask"], raises=NotImplementedError, reason="support scalar sorting keys are not yet implemented", ) @@ -719,7 +717,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): True, id="ordered-mean", marks=[ - pytest.mark.broken(["pandas"], raises=AssertionError), pytest.mark.notimpl( ["dask"], raises=NotImplementedError, @@ -796,7 +793,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): ], raises=com.OperationNotDefinedError, ), - pytest.mark.broken(["pandas"], raises=AssertionError), pytest.mark.broken( ["dask"], raises=ValueError, @@ -963,11 +959,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): ], raises=com.OperationNotDefinedError, ), - pytest.mark.notimpl( - ["pandas"], - raises=RuntimeWarning, - reason="invalid value encountered in divide", - ), pytest.mark.broken( ["dask"], raises=ValueError, @@ -1042,11 +1033,6 @@ def test_ungrouped_unbounded_window( ["impala"], raises=ImpalaHiveServer2Error, reason="limited RANGE support" ) @pytest.mark.notimpl(["dask"], raises=NotImplementedError) -@pytest.mark.notimpl( - ["pandas"], - raises=NotImplementedError, - reason="The pandas backend only implements range windows with temporal ordering keys", -) @pytest.mark.notimpl( ["flink"], raises=com.UnsupportedOperationError, @@ -1295,9 +1281,6 @@ def test_range_expression_bounds(backend): reason="clickhouse doesn't implement percent_rank", raises=com.OperationNotDefinedError, ) -@pytest.mark.broken( - ["pandas"], reason="missing column during execution", raises=KeyError -) @pytest.mark.broken( ["mssql"], 
reason="lack of support for booleans", raises=sa.exc.ProgrammingError ) @@ -1328,7 +1311,7 @@ def test_rank_followed_by_over_call_merge_frames(backend, alltypes, df): @pytest.mark.notyet( - ["pandas", "dask"], + ["dask"], reason="multiple ordering keys in a window function not supported for ranking", raises=ValueError, ) @@ -1342,6 +1325,11 @@ def test_rank_followed_by_over_call_merge_frames(backend, alltypes, df): @pytest.mark.broken( ["pyspark"], reason="pyspark requires CURRENT ROW", raises=PySparkAnalysisException ) +@pytest.mark.broken( + ["pandas"], + raises=TypeError, + reason="'<' not supported between instances of 'bool' and 'NoneType'", +) @pytest.mark.notimpl( ["risingwave"], raises=sa.exc.InternalError, diff --git a/ibis/expr/operations/reductions.py b/ibis/expr/operations/reductions.py index 2a85dbfcbab50..597f42107f358 100644 --- a/ibis/expr/operations/reductions.py +++ b/ibis/expr/operations/reductions.py @@ -17,6 +17,7 @@ class Reduction(Value): shape = ds.scalar + # TODO(kszucs): remove this @property def __window_op__(self): return self diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py index 4f83af05d3207..b18fc2bf106e3 100644 --- a/ibis/formats/pandas.py +++ b/ibis/formats/pandas.py @@ -2,6 +2,7 @@ import contextlib import datetime +import decimal import warnings from importlib.util import find_spec as _find_spec @@ -117,8 +118,10 @@ def convert_table(cls, df, schema): "schema column count does not match input data column count" ) - for (name, series), dtype in zip(df.items(), schema.types): - df[name] = cls.convert_column(series, dtype) + columns = [] + for (_, series), dtype in zip(df.items(), schema.types): + columns.append(cls.convert_column(series, dtype)) + df = pd.concat(columns, axis=1) # return data with the schema's columns which may be different than the # input columns @@ -250,6 +253,23 @@ def convert_Interval(cls, s, dtype, pandas_type): def convert_String(cls, s, dtype, pandas_type): return s.astype(pandas_type, errors="ignore") + @classmethod + def convert_Decimal(cls, s, dtype, pandas_type): + context = decimal.Context(prec=dtype.precision) + + if dtype.scale is None: + normalize = context.create_decimal + else: + exponent = decimal.Decimal(10) ** -dtype.scale + + def normalize(x, exponent=exponent): + try: + return context.create_decimal(x).quantize(exponent) + except decimal.InvalidOperation: + return x + + return s.map(normalize, na_action="ignore").astype(pandas_type) + @classmethod def convert_UUID(cls, s, dtype, pandas_type): return s.map(cls.get_element_converter(dtype), na_action="ignore") diff --git a/ibis/formats/tests/test_dask.py b/ibis/formats/tests/test_dask.py index 89ce6c59198a3..2dbe9b61ad7d2 100644 --- a/ibis/formats/tests/test_dask.py +++ b/ibis/formats/tests/test_dask.py @@ -199,12 +199,3 @@ def test_schema_infer_exhaustive_dataframe(): ] assert DaskData.infer_table(df) == ibis.schema(expected) - - -def test_convert_dataframe_with_timezone(): - data = {"time": pd.date_range("2018-01-01", "2018-01-02", freq="H")} - df = dd.from_pandas(pd.DataFrame(data), npartitions=2) - expected = df.assign(time=df.time.dt.tz_localize("EST")) - desired_schema = ibis.schema([("time", 'timestamp("EST")')]) - result = DaskData.convert_table(df.copy(), desired_schema) - tm.assert_frame_equal(result.compute(), expected.compute())