diff --git a/.github/workflows/ibis-backends.yml b/.github/workflows/ibis-backends.yml index ff7caf83b69d..baffb490b3d0 100644 --- a/.github/workflows/ibis-backends.yml +++ b/.github/workflows/ibis-backends.yml @@ -77,10 +77,10 @@ jobs: # title: Dask # extras: # - dask - # - name: pandas - # title: Pandas - # extras: - # - pandas + - name: pandas + title: Pandas + extras: + - pandas # - name: sqlite # title: SQLite # extras: diff --git a/ibis/backends/base/df/__init__.py b/ibis/backends/base/df/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/ibis/backends/base/df/scope.py b/ibis/backends/base/df/scope.py deleted file mode 100644 index 1d41da93464d..000000000000 --- a/ibis/backends/base/df/scope.py +++ /dev/null @@ -1,211 +0,0 @@ -"""Module for scope. - -The motivation of Scope is to cache data for calculated ops. - -`scope` in Scope class is the main cache. It is a dictionary mapping -ibis node instances to concrete data, and the time context associate -with it (if any). - -When there are no time contexts associate with the cached result, getting -and setting values in Scope would be as simple as get and set in a normal -dictionary. With time contexts, we need the following logic for getting -and setting items in scope: - -Before setting the value op in scope we need to perform the following -check first: - -Test if `op` is in `scope` yet -- No, then put `op` in `scope`, set 'timecontext' to be the current -`timecontext` (None if `timecontext` is not present), set 'value' to be -the actual data. -- Yes, then get the time context stored in `scope` for `op` as -`old_timecontext`, and compare it with current `timecontext`: -If current `timecontext` is a subset of `_timecontext`, that means we -already cached a larger range of data. Do nothing and we will trim data in -later execution process. -If current `timecontext` is a superset of `old_timecontext`, that means we -need to update cache. Set 'value' to be the current data and set -'timecontext' to be the current `timecontext` for `op`. -If current `timecontext` is neither a subset nor a superset of -`old_timcontext`, but they overlap, or not overlap at all (For example -when there is a window that looks forward, over a window that looks -back), in this case, we should not trust the data stored either because -the data stored in scope doesn't cover the current time context. -For simplicity, we update cache in this case, instead of merge data of -different time contexts. -""" -from __future__ import annotations - -from collections import namedtuple -from typing import TYPE_CHECKING, Any - -import pandas as pd - -from ibis.backends.base.df.timecontext import TimeContextRelation, compare_timecontext - -if TYPE_CHECKING: - from collections.abc import Iterable - - from ibis.expr.operations import Node - -TimeContext = tuple[pd.Timestamp, pd.Timestamp] - -ScopeItem = namedtuple("ScopeItem", ["timecontext", "value"]) - - -class Scope: - def __init__( - self, - param: dict[Node, Any] | None = None, - timecontext: TimeContext | None = None, - ): - """Create a new scope. - - Associate None as timecontext by default. This is mostly used to - init a scope with a set of given params. - """ - self._items = ( - {op: ScopeItem(timecontext, value) for op, value in param.items()} - if param - else {} - ) - - def __contains__(self, op): - """Given an `op`, return if `op` is present in Scope. - - Note that this `__contain__` method doesn't take `timecontext` - as a parameter. 
This could be used to iterate all keys in - current scope, or any case that doesn't care about value, just - simply test if `op` is in scope or not. - When trying to get value in scope, use `get_value(op, timecontext)` - instead. Because the cached data could be trusted only if: - 1. `op` is in `scope`, and, - 2. The `timecontext` associated with `op` is a time context equal - to, or larger than the current time context. - """ - return op in self._items - - def __iter__(self): - return iter(self._items.keys()) - - def set_value(self, op: Node, timecontext: TimeContext | None, value: Any) -> None: - """Set values in scope. - - Given an `op`, `timecontext` and `value`, set `op` and - `(value, timecontext)` in scope. - - This method doesn't simply override and set, but takes time context - into consideration. - - If there is a value associated with the key, but time context is - smaller than the current time context we are going to set, `get_value` - will return None and we will proceed to set the new value in scope. - - Parameters - ---------- - op - Key in scope - timecontext - Time context - value - the cached result to save in scope, an object whose type may - differ in different backends. - """ - if self.get_value(op, timecontext) is None: - self._items[op] = ScopeItem(timecontext, value) - - def get_value(self, op: Node, timecontext: TimeContext | None = None) -> Any: - """Given a op and timecontext, get the result from scope. - - Parameters - ---------- - op - Key in scope - timecontext - Time context - - Returns - ------- - Any - The cached result, an object whose type may differ in different - backends. - """ - if op not in self: - return None - - # for ops without timecontext - if timecontext is None: - return self._items[op].value - else: - # For op with timecontext, there are some ops cannot use cached - # result with a different (larger) timecontext to get the - # correct result. - # For example, a groupby followed by count, if we use a larger or - # smaller dataset from cache, we will get an error in result. - # Such ops with global aggregation, ops whose result is - # depending on other rows in result Dataframe, cannot use cached - # result with different time context to optimize calculation. - # These are time context sensitive operations. Since these cases - # are rare in actual use case, we just enable optimization for - # all nodes for now. - cached_timecontext = self._items[op].timecontext - if cached_timecontext: - relation = compare_timecontext(timecontext, cached_timecontext) - if relation == TimeContextRelation.SUBSET: - return self._items[op].value - else: - return self._items[op].value - return None - - def merge_scope(self, other_scope: Scope, overwrite=False) -> Scope: - """Merge items in `other_scope` into this scope. - - Parameters - ---------- - other_scope - Scope to be merged with - overwrite - if `True`, force overwrite `value` if node already exists. - - Returns - ------- - Scope - a new Scope instance with items in two scopes merged. - """ - result = Scope() - - for op in self: - result._items[op] = self._items[op] - - for op in other_scope: - # if get_scope returns a not None value, then data is already - # cached in scope and it is at least a greater range than - # the current timecontext, so we drop the item. Otherwise - # add it into scope. 
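The subset/superset rule described in the module docstring and in the ``merge_scope`` comment above boils down to a simple cache policy. A minimal, self-contained sketch of that policy (illustrative only; the removed ``Scope`` keyed on ibis ``Node`` instances and ``ScopeItem`` namedtuples rather than strings)::

    from __future__ import annotations

    import pandas as pd

    TimeContext = tuple[pd.Timestamp, pd.Timestamp]

    def covers(cached: TimeContext, requested: TimeContext) -> bool:
        # the cached range can serve the request only when the requested
        # range is a subset of the cached range
        return cached[0] <= requested[0] and cached[1] >= requested[1]

    cache: dict[str, tuple[TimeContext, object]] = {}

    def set_value(key: str, ctx: TimeContext, value) -> None:
        # overwrite only when the existing entry cannot serve the new context
        if key in cache and covers(cache[key][0], ctx):
            return
        cache[key] = (ctx, value)

    def get_value(key: str, ctx: TimeContext):
        if key not in cache:
            return None
        cached_ctx, value = cache[key]
        return value if covers(cached_ctx, ctx) else None

    jan = (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01"))
    q1 = (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-04-01"))
    set_value("t", q1, "q1 data")
    print(get_value("t", jan))       # 'q1 data': the cached range covers January
    set_value("t", jan, "jan data")  # dropped, the Q1 entry already covers January
    print(get_value("t", q1))        # 'q1 data'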
- v = other_scope._items[op] - if overwrite or result.get_value(op, v.timecontext) is None: - result._items[op] = v - return result - - def merge_scopes(self, other_scopes: Iterable[Scope], overwrite=False) -> Scope: - """Merge items in `other_scopes` into this scope. - - Parameters - ---------- - other_scopes - scopes to be merged with - overwrite - if `True`, force overwrite value if node already exists. - - Returns - ------- - Scope - a new Scope instance with items in input scopes merged. - """ - result = Scope() - for op in self: - result._items[op] = self._items[op] - - for s in other_scopes: - result = result.merge_scope(s, overwrite) - return result diff --git a/ibis/backends/base/df/timecontext.py b/ibis/backends/base/df/timecontext.py deleted file mode 100644 index f84dd473bc4c..000000000000 --- a/ibis/backends/base/df/timecontext.py +++ /dev/null @@ -1,304 +0,0 @@ -"""Time context module. - -This is an implementation of time context extension without affecting the -existing SQL-like execution model for backends. - -Most of the execution is built on the foundation that "Data is uniquely -defined by the op tree". This is true in SQL analysis where there is no -ambiguity what the result of executing a Table is. - -In time series analysis, however, this is not necessarily True. We have defined -an extension to ibis execution for time series analysis where the result of -executing a Table is defined by the Table plus the time context are -associated with the execution. - -Time context specifies the temporal range of a query, it carries the start and -end datetimes. For example, a Table can represent the query select count(a) -from table, but the result of that is different with time context -("20190101", "20200101") vs ("20200101", "20210101"), because what data is in -"table" depends also on the time context. - -While data in scope is public and global for all nodes, `timecontext` is -intended to store 'local' time context data for each node in execution. i.e., -each subtree of an expr tree can have different time context. Which makes it -so that when executing each node, we also need to know the "local time context" -for that node. - -And we propose to store these data as 'timecontext', calculate in execution -pass it along to children nodes, in the ibis tree. See each backends for -implementation details. - -Time context adjustment algorithm - In an Ibis tree, time context is local for each node, and they should be - adjusted accordingly for some specific nodes. Those operations may - require extra data outside of the global time context that user defines. - For example, in asof_join, we need to look back extra `tolerance` daays - for the right table to get the data for joining. Similarly for window - operation with preceding and following. - Algorithm to calculate context adjustment are defined in this module - and could be used by multiple backends. -""" - -from __future__ import annotations - -import enum -import functools -from typing import TYPE_CHECKING, Any - -import pandas as pd - -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis import config - -TimeContext = tuple[pd.Timestamp, pd.Timestamp] - - -if TYPE_CHECKING: - from ibis.backends.base.df.scope import Scope - - -# In order to use time context feature, there must be a column of Timestamp -# type, and named as 'time' in Table. This TIME_COL constant will be -# used in filtering data from a table or columns of a table. 
It can be changed -# by running: -# -# ibis.config.options.context_adjustment.time_col = "other_time_col" - - -def get_time_col(): - return config.options.context_adjustment.time_col - - -class TimeContextRelation(enum.Enum): - """Enum to classify the relationship between two time contexts. - - Assume that we have two timecontext `c1 (begin1, end1)`, `c2(begin2, end2)`: - - - `SUBSET` means `c1` is a subset of `c2`, `begin1` is greater than or - equal to `begin2`, and `end1` is less than or equal to `end2`. - - `SUPERSET` means that `begin1` is earlier than `begin2`, and `end1` - is later than `end2`. - - If neither of the two contexts is a superset of each other, and they - share some time range in common, we called them `OVERLAP`. - - `NONOVERLAP` means the two contexts doesn't overlap at all, which - means `end1` is earlier than `begin2` or `end2` is earlier than - `begin1`. - """ - - SUBSET = 0 - SUPERSET = 1 - OVERLAP = 2 - NONOVERLAP = 3 - - -def compare_timecontext( - left_context: TimeContext, right_context: TimeContext -) -> TimeContextRelation: - """Compare two time contexts and return the relationship between them.""" - left_begin, left_end = left_context - right_begin, right_end = right_context - if right_begin <= left_begin and right_end >= left_end: - return TimeContextRelation.SUBSET - elif right_begin >= left_begin and right_end <= left_end: - return TimeContextRelation.SUPERSET - elif right_end < left_begin or left_end < right_begin: - return TimeContextRelation.NONOVERLAP - else: - return TimeContextRelation.OVERLAP - - -def canonicalize_context( - timecontext: TimeContext | None, -) -> TimeContext | None: - """Canonicalize a timecontext with type pandas.Timestamp for its begin and end time.""" - - SUPPORTS_TIMESTAMP_TYPE = pd.Timestamp - if not isinstance(timecontext, tuple) or len(timecontext) != 2: - raise com.IbisError(f"Timecontext {timecontext} should specify (begin, end)") - - begin, end = timecontext - - if not isinstance(begin, SUPPORTS_TIMESTAMP_TYPE): - raise com.IbisError( - f"begin time value {begin} of type {type(begin)} is not" - " of type pd.Timestamp" - ) - if not isinstance(end, SUPPORTS_TIMESTAMP_TYPE): - raise com.IbisError( - f"end time value {end} of type {type(begin)} is not of type pd.Timestamp" - ) - if begin > end: - raise com.IbisError( - f"begin time {begin} must be before or equal to end time {end}" - ) - return begin, end - - -def localize_context(timecontext: TimeContext, timezone: str) -> TimeContext: - """Localize tz-naive context.""" - begin, end = timecontext - if begin.tz is None: - begin = begin.tz_localize(timezone) - - if end.tz is None: - end = end.tz_localize(timezone) - - return begin, end - - -def construct_time_context_aware_series( - series: pd.Series, frame: pd.DataFrame -) -> pd.Series: - """Construct a Series by adding 'time' in its MultiIndex. - - In window execution, the result Series of udf may need - to be trimmed by timecontext. In order to do so, 'time' - must be added as an index to the Series. We extract - time column from the parent Dataframe `frame`. - See `trim_window_result` in execution/window.py for - trimming implementation. - - Examples - -------- - >>> import pandas as pd - >>> from ibis.backends.base.df.timecontext import ( - ... construct_time_context_aware_series, - ... ) - >>> df = pd.DataFrame( - ... { - ... "time": pd.Series(pd.date_range(start="2017-01-02", periods=3).values), - ... "id": [1, 2, 3], - ... "value": [1.1, 2.2, 3.3], - ... } - ... 
) - >>> df - time id value - 0 2017-01-02 1 1.1 - 1 2017-01-03 2 2.2 - 2 2017-01-04 3 3.3 - >>> series = df["value"] - >>> series - 0 1.1 - 1 2.2 - 2 3.3 - Name: value, dtype: float64 - >>> construct_time_context_aware_series(series, df) # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - - The index will be a MultiIndex of the original RangeIndex - and a DateTimeIndex. - - >>> timed_series = construct_time_context_aware_series(series, df) - >>> timed_series # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - - >>> construct_time_context_aware_series( - ... timed_series, df - ... ) # quartodoc: +SKIP # doctest: +SKIP - time - 0 2017-01-02 1.1 - 1 2017-01-03 2.2 - 2 2017-01-04 3.3 - Name: value, dtype: float64 - The result is unchanged for a series already has 'time' as its index. - """ - time_col = get_time_col() - if time_col == frame.index.name: - time_index = frame.index - elif time_col in frame: - time_index = pd.Index(frame[time_col]) - else: - raise com.IbisError(f'"time" column not present in DataFrame {frame}') - if time_col not in series.index.names: - series.index = pd.MultiIndex.from_arrays( - list(map(series.index.get_level_values, range(series.index.nlevels))) - + [time_index], - names=series.index.names + [time_col], - ) - return series - - -@functools.singledispatch -def adjust_context(op: Any, scope: Scope, timecontext: TimeContext) -> TimeContext: - """Adjust the `timecontext` for `op`. - - Parameters - ---------- - op - Ibis operation. - scope - Incoming scope. - timecontext - Time context associated with the node. - - Returns - ------- - TimeContext - For `op` that is not of type Node, raise an error to avoid failing - silently since the default behavior is to return `timecontext`. 
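The four-way classification computed by ``compare_timecontext`` above can be sanity-checked in isolation; the sketch below restates the same comparison outside ibis (illustrative only)::

    import enum

    import pandas as pd

    class Relation(enum.Enum):
        SUBSET = 0
        SUPERSET = 1
        OVERLAP = 2
        NONOVERLAP = 3

    def compare(left, right):
        (left_begin, left_end), (right_begin, right_end) = left, right
        if right_begin <= left_begin and right_end >= left_end:
            return Relation.SUBSET      # left fits inside right
        elif right_begin >= left_begin and right_end <= left_end:
            return Relation.SUPERSET    # left contains right
        elif right_end < left_begin or left_end < right_begin:
            return Relation.NONOVERLAP
        else:
            return Relation.OVERLAP

    ts = pd.Timestamp
    jan = (ts("2020-01-01"), ts("2020-02-01"))
    q1 = (ts("2020-01-01"), ts("2020-04-01"))
    h2 = (ts("2020-07-01"), ts("2021-01-01"))
    print(compare(jan, q1))  # Relation.SUBSET
    print(compare(q1, jan))  # Relation.SUPERSET
    print(compare(q1, h2))   # Relation.NONOVERLAP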
- """ - raise com.IbisError(f"Unsupported input type for adjust context for {op}") - - -@adjust_context.register(ops.Node) -def adjust_context_node( - op: ops.Node, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # For any node, by default, do not adjust time context - return timecontext - - -@adjust_context.register(ops.Alias) -def adjust_context_alias( - op: ops.Node, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # For any node, by default, do not adjust time context - return adjust_context(op.arg, scope, timecontext) - - -@adjust_context.register(ops.AsOfJoin) -def adjust_context_asof_join( - op: ops.AsOfJoin, scope: Scope, timecontext: TimeContext -) -> TimeContext: - begin, end = timecontext - - if op.tolerance is not None: - from ibis.backends.pandas.execution import execute - - timedelta = execute(op.tolerance) - return (begin - timedelta, end) - - return timecontext - - -@adjust_context.register(ops.WindowFunction) -def adjust_context_window( - op: ops.WindowFunction, scope: Scope, timecontext: TimeContext -) -> TimeContext: - # TODO(kszucs): this file should be really moved to the pandas - # backend instead of the current central placement - from ibis.backends.pandas.execution import execute - - # adjust time context by preceding and following - begin, end = timecontext - - if op.frame.start is not None: - value = execute(op.frame.start.value) - if value: - begin = begin - value - - if op.frame.end is not None: - value = execute(op.frame.end.value) - if value: - end = end + value - - return (begin, end) diff --git a/ibis/backends/dask/tests/execution/test_join.py b/ibis/backends/dask/tests/execution/test_join.py index e9805c74c142..e76097b65cdd 100644 --- a/ibis/backends/dask/tests/execution/test_join.py +++ b/ibis/backends/dask/tests/execution/test_join.py @@ -96,6 +96,7 @@ def test_join_with_multiple_predicates(how, left, right, df1, df2): left, right.key3, right.other_value ] result = expr.execute().sort_values(by=["key"]).reset_index(drop=True) + expected = ( dd.merge(df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"]) .compute(scheduler="single-threaded") diff --git a/ibis/backends/pandas/__init__.py b/ibis/backends/pandas/__init__.py index 4349400c50ab..881a460b7f5e 100644 --- a/ibis/backends/pandas/__init__.py +++ b/ibis/backends/pandas/__init__.py @@ -1,6 +1,5 @@ from __future__ import annotations -import importlib from functools import lru_cache from typing import TYPE_CHECKING, Any @@ -22,8 +21,6 @@ import pathlib from collections.abc import Mapping, MutableMapping -raise RuntimeError("Temporarily make the pandas backend dysfunctional") - class BasePandasBackend(BaseBackend): """Base class for backends based on pandas.""" @@ -51,9 +48,6 @@ def do_connect( >>> ibis.pandas.connect({"t": pd.DataFrame({"a": [1, 2, 3]})}) """ - # register dispatchers - from ibis.backends.pandas import execution, udf # noqa: F401 - self.dictionary = dictionary or {} self.schemas: MutableMapping[str, sch.Schema] = {} @@ -256,34 +250,13 @@ def _convert_object(cls, obj: Any) -> Any: @classmethod @lru_cache def _get_operations(cls): - backend = f"ibis.backends.{cls.name}" - - execution = importlib.import_module(f"{backend}.execution") - execute_node = execution.execute_node + from ibis.backends.pandas.kernels import supported_operations - # import UDF to pick up AnalyticVectorizedUDF and others - importlib.import_module(f"{backend}.udf") - - dispatch = importlib.import_module(f"{backend}.dispatch") - pre_execute = dispatch.pre_execute - - return 
frozenset( - op - for op, *_ in execute_node.funcs.keys() | pre_execute.funcs.keys() - if issubclass(op, ops.Value) - ) + return supported_operations @classmethod def has_operation(cls, operation: type[ops.Value]) -> bool: - # Pandas doesn't support geospatial ops, but the dispatcher implements - # a common base class that makes it appear that it does. Explicitly - # exclude these operations. - if issubclass(operation, (ops.GeoSpatialUnOp, ops.GeoSpatialBinOp)): - return False - op_classes = cls._get_operations() - return operation in op_classes or any( - issubclass(operation, op_impl) for op_impl in op_classes - ) + return operation in cls._get_operations() def _clean_up_cached_table(self, op): del self.dictionary[op.name] @@ -331,7 +304,7 @@ class Backend(BasePandasBackend): name = "pandas" def execute(self, query, params=None, limit="default", **kwargs): - from ibis.backends.pandas.core import execute_and_reset + from ibis.backends.pandas.executor import Executor if limit != "default" and limit is not None: raise ValueError( @@ -346,16 +319,10 @@ def execute(self, query, params=None, limit="default", **kwargs): ) ) - node = query.op() - - if params is None: - params = {} - else: - params = { - k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items() - } + params = params or {} + params = {k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items()} - return execute_and_reset(node, params=params, **kwargs) + return Executor.execute(query.op(), backend=self, params=params) def _load_into_cache(self, name, expr): self.create_table(name, expr.execute()) diff --git a/ibis/backends/pandas/aggcontext.py b/ibis/backends/pandas/aggcontext.py deleted file mode 100644 index 64a4f73bc686..000000000000 --- a/ibis/backends/pandas/aggcontext.py +++ /dev/null @@ -1,710 +0,0 @@ -"""Implements an object to describe the context of a window aggregation. - -For any particular aggregation such as ``sum``, ``mean``, etc we need to decide -based on the presence or absence of other expressions like ``group_by`` and -``order_by`` whether we should call a different method of aggregation. - -Here are the different aggregation contexts and the conditions under which they -are used. - -Note that in the pandas backend, only trailing and cumulative windows are -supported right now. - -No ``group_by`` or ``order_by``: ``context.Summarize()`` --------------------------------------------------------- -This is an aggregation on a column, repeated for every row in the table. - -SQL - -:: - - SELECT SUM(value) OVER () AS sum_value FROM t - -Pandas - -:: - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> s = pd.Series(df.value.sum(), index=df.index, name="sum_value") - >>> s # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> t[ - ... t, t.value.sum().name("sum_value") - ... ].sum_value # quartodoc: +SKIP # doctest: +SKIP - - -``group_by``, no ``order_by``: ``context.Transform()`` ------------------------------------------------------- - -This performs an aggregation per group and repeats it across every row in the -group. - -SQL - -:: - - SELECT SUM(value) OVER (PARTITION BY key) AS sum_value - FROM t - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... 
{ - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.groupby("key").value.transform("sum") # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> t.value.sum().over( - ... ibis.window(group_by=t.key) - ... ) # quartodoc: +SKIP # doctest: +SKIP - -``order_by``, no ``group_by``: ``context.Cumulative()``/``context.Rolling()`` ------------------------------------------------------------------------------ - -Cumulative and trailing window operations. - -Cumulative -~~~~~~~~~~ - -Also called expanding. - -SQL - -:: - - SELECT SUM(value) OVER ( - ORDER BY time ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.sort_values("time").value.cumsum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.cumulative_window(order_by=t.time) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP - -Moving -~~~~~~ - -Also called referred to as "rolling" in other libraries such as pandas. - -SQL - -:: - - SELECT SUM(value) OVER ( - ORDER BY time ROWS BETWEEN 3 PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> df.sort_values("time").value.rolling( - ... 3 - ... ).sum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.trailing_window(3, order_by=t.time) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP - - -``group_by`` and ``order_by``: ``context.Cumulative()``/``context.Rolling()`` ------------------------------------------------------------------------------ - -This performs a cumulative or rolling operation within a group. - -SQL - -:: - - SELECT SUM(value) OVER ( - PARTITION BY key ORDER BY time ROWS BETWEEN 4 PRECEDING AND CURRENT ROW - ) AS sum_value - FROM t - - -Pandas - -:: - - >>> import pandas as pd - >>> import numpy as np - >>> df = pd.DataFrame( - ... { - ... "key": list("aabc"), - ... "value": np.random.randn(4), - ... "time": pd.date_range(periods=4, start="now"), - ... } - ... ) - >>> sorter = lambda df: df.sort_values("time") - >>> gb = ( - ... df.groupby("key", group_keys=False) - ... .apply(sorter) - ... .reset_index(drop=True) - ... .groupby("key") - ... 
) - >>> rolling = gb.value.rolling(2) - >>> rolling.sum() # quartodoc: +SKIP # doctest: +SKIP - -Ibis - -:: - - >>> import ibis - >>> schema = dict(time="timestamp", key="string", value="double") - >>> t = ibis.table(schema, name="t") - >>> window = ibis.trailing_window(2, order_by=t.time, group_by=t.key) - >>> t.value.sum().over(window) # quartodoc: +SKIP # doctest: +SKIP -""" - -from __future__ import annotations - -import abc -import functools -import itertools -import operator -from typing import TYPE_CHECKING, Any, Callable - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base.df.timecontext import ( - construct_time_context_aware_series, - get_time_col, -) - -if TYPE_CHECKING: - from collections.abc import Iterator - - import numpy as np - - -class AggregationContext(abc.ABC): - __slots__ = ( - "parent", - "group_by", - "order_by", - "dtype", - "max_lookback", - "output_type", - ) - - def __init__( - self, - parent=None, - group_by=None, - order_by=None, - max_lookback=None, - output_type=None, - ): - self.parent = parent - self.group_by = group_by - self.order_by = order_by - self.dtype = None if output_type is None else output_type.to_pandas() - self.output_type = output_type - self.max_lookback = max_lookback - - @abc.abstractmethod - def agg(self, grouped_data, function, *args, **kwargs): - pass - - -def wrap_for_apply( - function: Callable, - args: tuple[Any, ...] | None = None, - kwargs: dict[str, Any] | None = None, -) -> Callable: - """Wrap a function for use with Pandas `apply`. - - Parameters - ---------- - function : Callable - A function to be used with Pandas `apply`. - args : Optional[Tuple[Any, ...]] - args to be passed to function when it is called by Pandas `apply` - kwargs : Optional[Dict[str, Any]] - kwargs to be passed to function when it is called by Pandas `apply` - """ - assert callable(function), f"function {function} is not callable" - - new_args: tuple[Any, ...] = () - if args is not None: - new_args = args - - new_kwargs: dict[str, Any] = {} - if kwargs is not None: - new_kwargs = kwargs - - @functools.wraps(function) - def wrapped_func( - data: Any, - function: Callable = function, - args: tuple[Any, ...] = new_args, - kwargs: dict[str, Any] = new_kwargs, - ) -> Callable: - return function(data, *args, **kwargs) - - return wrapped_func - - -def wrap_for_agg( - function: Callable, - args: tuple[Any, ...], - kwargs: dict[str, Any], -) -> Callable: - """Wrap a function for use with Pandas `agg`. - - This includes special logic that will force Pandas `agg` to always treat - the function as an aggregation function. Details: - - When passed a function, Pandas `agg` will either: - 1) Behave like Pandas `apply` and treat the function as a N->N mapping - function (i.e. calls the function once for every value in the Series - that `agg` is being called on), OR - 2) Treat the function as a N->1 aggregation function (i.e. calls the - function once on the entire Series) - Pandas `agg` will use behavior #1 unless an error is raised when doing so. - - We want to force Pandas `agg` to use behavior #2. To do this, we will wrap - the function with logic that checks that a Series is being passed in, and - raises a TypeError otherwise. When Pandas `agg` is attempting to use - behavior #1 but sees the TypeError, it will fall back to behavior #2. 
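The fallback behaviour described above can be demonstrated with a small wrapper (illustrative; ``as_reduction`` is a made-up name and the exact fallback rules depend on the pandas version in use)::

    import pandas as pd

    def as_reduction(function):
        # Reject scalar inputs so pandas ``agg`` cannot use the elementwise
        # code path and falls back to calling ``function`` on the whole Series.
        def wrapped(data, *args, **kwargs):
            if not isinstance(data, pd.Series):
                raise TypeError(f"expected a Series, got {type(data).__name__}")
            return function(data, *args, **kwargs)
        return wrapped

    s = pd.Series([1.0, 2.0, 4.0])
    spread = lambda x: x.max() - x.min()
    print(s.agg(as_reduction(spread)))  # 3.0, computed once over the whole Series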
- - Parameters - ---------- - function : Callable - An aggregation function to be used with Pandas `agg`. - args : Tuple[Any, ...] - args to be passed to function when it is called by Pandas `agg` - kwargs : Dict[str, Any] - kwargs to be passed to function when it is called by Pandas `agg` - """ - assert callable(function), f"function {function} is not callable" - - @functools.wraps(function) - def wrapped_func( - data: Any, - function: Callable = function, - args: tuple[Any, ...] = args, - kwargs: dict[str, Any] = kwargs, - ) -> Callable: - # `data` will be a scalar here if Pandas `agg` is trying to behave like - # like Pandas `apply`. - if not isinstance(data, pd.Series): - # Force `agg` to NOT behave like `apply`. We want Pandas to use - # `function` as an aggregation function, not as a mapping function. - raise TypeError( - f"This function expects a Series, but saw an object of type " - f"{type(data)} instead." - ) - return function(data, *args, **kwargs) - - return wrapped_func - - -class Summarize(AggregationContext): - __slots__ = () - - def agg(self, grouped_data, function, *args, **kwargs): - if isinstance(function, str): - return getattr(grouped_data, function)(*args, **kwargs) - - if not callable(function): - raise TypeError(f"Object {function} is not callable or a string") - - if isinstance(grouped_data, pd.core.groupby.generic.SeriesGroupBy) and len( - grouped_data - ): - # `SeriesGroupBy.agg` does not allow np.arrays to be returned - # from UDFs. To avoid `SeriesGroupBy.agg`, we will call the - # aggregation function manually on each group. (#2768) - aggs = {} - for k, v in grouped_data: - func_args = [d.get_group(k) for d in args] - aggs[k] = function(v, *func_args, **kwargs) - grouped_col_name = v.name - return ( - pd.Series(aggs) - .rename(grouped_col_name) - .rename_axis(grouped_data.grouper.names) - ) - else: - return grouped_data.agg(wrap_for_agg(function, args, kwargs)) - - -class Transform(AggregationContext): - __slots__ = () - - def agg(self, grouped_data, function, *args, **kwargs): - # If this is a multi column UDF, then we cannot use - # "transform" here (Data must be 1-dimensional) - # Instead, we need to use "apply", which can return a non - # numeric type, e.g, tuple of two double. - if self.output_type.is_struct(): - res = grouped_data.apply(function, *args, **kwargs) - else: - res = grouped_data.transform(function, *args, **kwargs) - - # The result series uses the name of the input. We should - # unset it to avoid confusion, when result is not guaranteed - # to be the same series / have the same type after transform - res.name = None - return res - - -@functools.singledispatch -def compute_window_spec(dtype, obj): - raise com.IbisTypeError( - f"Unknown dtype type {dtype} and object {obj} for compute_window_spec" - ) - - -@compute_window_spec.register(dt.Integer) -def compute_window_spec_none(_, obj): - """Helper method only used for row-based windows. - - Window spec in ibis is an inclusive window bound. A bound of 0 - indicates the current row. Window spec in Pandas indicates window - size. Therefore, we must add 1 to the ibis window bound to get the - expected behavior. 
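For example, an ibis row window of ``2 preceding`` includes the current row plus the two prior rows, so the corresponding pandas window size must be ``3``::

    import pandas as pd

    s = pd.Series([1, 2, 3, 4, 5])
    ibis_preceding = 2                      # inclusive bound: 2 prior rows + current row
    pandas_window_size = ibis_preceding + 1
    print(s.rolling(pandas_window_size, min_periods=1).sum().tolist())
    # [1.0, 3.0, 6.0, 9.0, 12.0]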
- """ - from ibis.backends.pandas.core import execute - - value = execute(obj) - return value + 1 - - -@compute_window_spec.register(dt.Interval) -def compute_window_spec_interval(_, obj): - from ibis.backends.pandas.core import execute - - value = execute(obj) - return pd.tseries.frequencies.to_offset(value) - - -def window_agg_built_in( - frame: pd.DataFrame, - windowed: pd.core.window.Window, - function: str, - max_lookback: ops.Literal, - *args: tuple[Any, ...], - **kwargs: dict[str, Any], -) -> pd.Series: - """Apply window aggregation with built-in aggregators.""" - assert isinstance(function, str) - method = operator.methodcaller(function, *args, **kwargs) - - if max_lookback is not None: - agg_method = method - - def sliced_agg(s): - return agg_method(s.iloc[-max_lookback.value :]) - - method = operator.methodcaller("apply", sliced_agg, raw=False) - - result = method(windowed) - index = result.index - result.index = pd.MultiIndex.from_arrays( - [frame.index] + list(map(index.get_level_values, range(index.nlevels))), - names=[frame.index.name] + index.names, - ) - return result - - -def create_window_input_iter( - grouped_data: SeriesGroupBy | pd.Series, - masked_window_lower_indices: pd.Series, - masked_window_upper_indices: pd.Series, -) -> Iterator[np.ndarray]: - # create a generator for each input series - # the generator will yield a slice of the - # input series for each valid window - data = getattr(grouped_data, "obj", grouped_data).values - lower_indices_array = masked_window_lower_indices.values - upper_indices_array = masked_window_upper_indices.values - for i in range(len(lower_indices_array)): - lower_index = lower_indices_array[i] - upper_index = upper_indices_array[i] - yield data[lower_index:upper_index] - - -def window_agg_udf( - grouped_data: SeriesGroupBy, - function: Callable, - window_lower_indices: pd.Series, - window_upper_indices: pd.Series, - mask: pd.Series, - result_index: pd.Index, - dtype: np.dtype, - max_lookback: int, - *args: tuple[Any, ...], - **kwargs: dict[str, Any], -) -> pd.Series: - """Apply window aggregation with UDFs. - - Notes - ----- - Use custom logic to computing rolling window UDF instead of - using pandas's rolling function. - This is because pandas's rolling function doesn't support - multi param UDFs. - """ - assert len(window_lower_indices) == len(window_upper_indices) - assert len(window_lower_indices) == len(mask) - - # Reset index here so we don't need to deal with mismatching - # indices - window_lower_indices = window_lower_indices.reset_index(drop=True) - window_upper_indices = window_upper_indices.reset_index(drop=True) - mask = mask.reset_index(drop=True) - - # Compute window indices and manually roll - # over the window. - - # If an window has only nan values, we output nan for - # the window result. This follows pandas rolling apply - # behavior. - - # The first input column is in grouped_data, but there may - # be additional input columns in args. 
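The manual rolling strategy described in the comments above amounts to slicing each input by precomputed window bounds and calling the UDF once per window. A self-contained sketch with a two-argument UDF (all data and bounds here are made up)::

    import numpy as np
    import pandas as pd

    x = pd.Series([1.0, 2.0, 3.0, 4.0]).to_numpy()
    y = pd.Series([10.0, 20.0, 30.0, 40.0]).to_numpy()
    lower = np.array([0, 0, 1, 2])  # inclusive lower index of each window
    upper = np.array([1, 2, 3, 4])  # exclusive upper index of each window

    def weighted_sum(a, b):
        # a two-argument reduction that pandas.rolling(...).apply cannot express
        return float(np.sum(a * b))

    result = pd.Series(
        [weighted_sum(x[lo:hi], y[lo:hi]) for lo, hi in zip(lower, upper)]
    )
    print(result.tolist())  # [10.0, 50.0, 130.0, 250.0]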
- inputs = (grouped_data,) + args - - masked_window_lower_indices = window_lower_indices[mask].astype("i8") - masked_window_upper_indices = window_upper_indices[mask].astype("i8") - - input_iters = [ - create_window_input_iter( - arg, masked_window_lower_indices, masked_window_upper_indices - ) - if isinstance(arg, (pd.Series, SeriesGroupBy)) - else itertools.repeat(arg) - for arg in inputs - ] - - valid_result = pd.Series( - function(*(next(gen) for gen in input_iters)) - for i in range(len(masked_window_lower_indices)) - ) - - valid_result = pd.Series(valid_result) - valid_result.index = masked_window_lower_indices.index - result = pd.Series(index=mask.index, dtype=dtype) - result[mask] = valid_result - result.index = result_index - - return result - - -class Window(AggregationContext): - __slots__ = ("construct_window",) - - def __init__(self, kind, *args, **kwargs): - super().__init__( - parent=kwargs.pop("parent", None), - group_by=kwargs.pop("group_by", None), - order_by=kwargs.pop("order_by", None), - output_type=kwargs.pop("output_type"), - max_lookback=kwargs.pop("max_lookback", None), - ) - self.construct_window = operator.methodcaller(kind, *args, **kwargs) - - def agg( - self, - grouped_data: pd.Series | SeriesGroupBy, - function: str | Callable, - *args: Any, - **kwargs: Any, - ) -> pd.Series: - # avoid a pandas warning about numpy arrays being passed through - # directly - group_by = self.group_by - order_by = self.order_by - - assert group_by or order_by - - # Get the DataFrame from which the operand originated - # (passed in when constructing this context object in - # execute_node(ops.Window)) - parent = self.parent - frame = getattr(parent, "obj", parent) - obj = getattr(grouped_data, "obj", grouped_data) - name = obj.name - if frame[name] is not obj or name in group_by or name in order_by: - name = f"{name}_{ibis.util.guid()}" - frame = frame.assign(**{name: obj}) - - # set the index to our order_by keys and append it to the existing - # index - # TODO: see if we can do this in the caller, when the context - # is constructed rather than pulling out the data - columns = group_by + order_by + [name] - # Create a new frame to avoid mutating the original one - indexed_by_ordering = frame[columns].copy() - # placeholder column to compute window_sizes below - indexed_by_ordering["_placeholder"] = 0 - indexed_by_ordering = indexed_by_ordering.set_index(order_by) - - # regroup if needed - if group_by: - grouped_frame = indexed_by_ordering.groupby(group_by, group_keys=False) - else: - grouped_frame = indexed_by_ordering - grouped = grouped_frame[name] - - if callable(function): - # To compute the window_size, we need to construct a - # RollingGroupby and compute count using construct_window. - # However, if the RollingGroupby is not numeric, e.g., - # we are calling window UDF on a timestamp column, we - # cannot compute rolling count directly because: - # (1) windowed.count() will exclude NaN observations - # , which results in incorrect window sizes. - # (2) windowed.apply(len, raw=True) will include NaN - # observations, but doesn't work on non-numeric types. 
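(The comment continues below with the ``_placeholder`` workaround.) The difference is easy to see on a tiny frame: counting the real column skips NaN rows, while counting a constant placeholder counts every row in the window (illustrative data)::

    import pandas as pd

    df = pd.DataFrame(
        {
            "time": pd.date_range("2020-01-01", periods=4),
            "value": [1.0, None, 3.0, 4.0],
        }
    ).set_index("time")
    df["_placeholder"] = 0

    # NaN rows are skipped, so these "window sizes" come out too small
    print(df["value"].rolling(2, min_periods=1).count().tolist())
    # [1.0, 1.0, 1.0, 2.0]

    # the constant placeholder is never NaN, so every row is counted
    print(df["_placeholder"].rolling(2, min_periods=1).count().tolist())
    # [1.0, 2.0, 2.0, 2.0]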
- # https://github.com/pandas-dev/pandas/issues/23002 - # To deal with this, we create a _placeholder column - - windowed_frame = self.construct_window(grouped_frame) - window_sizes = windowed_frame["_placeholder"].count().reset_index(drop=True) - mask = ~(window_sizes.isna()) - window_upper_indices = pd.Series(range(len(window_sizes))) + 1 - window_lower_indices = window_upper_indices - window_sizes - # The result Series of udf may need to be trimmed by - # timecontext. In order to do so, 'time' must be added - # as an index to the Series, if present. Here We extract - # time column from the parent Dataframe `frame`. - if get_time_col() in frame: - result_index = construct_time_context_aware_series(obj, frame).index - else: - result_index = obj.index - result = window_agg_udf( - grouped_data, - function, - window_lower_indices, - window_upper_indices, - mask, - result_index, - self.dtype, - self.max_lookback, - *args, - **kwargs, - ) - else: - # perform the per-group rolling operation - windowed = self.construct_window(grouped) - result = window_agg_built_in( - frame, - windowed, - function, - self.max_lookback, - *args, - **kwargs, - ) - try: - return result.astype(self.dtype, copy=False) - except (TypeError, ValueError): - return result - - -class Cumulative(Window): - __slots__ = () - - def __init__(self, *args, **kwargs): - super().__init__("expanding", *args, **kwargs) - - -class Moving(Window): - __slots__ = () - - def __init__(self, start, max_lookback, *args, **kwargs): - from ibis.backends.pandas.core import timedelta_types - - start = compute_window_spec(start.dtype, start.value) - if isinstance(start, timedelta_types + (pd.offsets.DateOffset,)): - closed = "both" - else: - closed = None - - super().__init__( - "rolling", - start, - *args, - max_lookback=max_lookback, - closed=closed, - min_periods=1, - **kwargs, - ) - - def short_circuit_method(self, grouped_data, function): - raise AttributeError("No short circuit method for rolling operations") diff --git a/ibis/backends/pandas/convert.py b/ibis/backends/pandas/convert.py new file mode 100644 index 000000000000..76528d3e9258 --- /dev/null +++ b/ibis/backends/pandas/convert.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import pandas as pd +import pandas.api.types as pdt + +import ibis.expr.datatypes as dt +from ibis.formats.pandas import DataMapper, PandasType + + +class PandasConverter(DataMapper): + @classmethod + def convert_scalar(cls, obj, dtype): + series = pd.Series([obj]) + casted = cls.convert_column(series, dtype) + return casted[0] + + @classmethod + def convert_column(cls, obj, dtype): + pandas_type = PandasType.from_ibis(dtype) + + method_name = f"convert_{dtype.__class__.__name__}" + convert_method = getattr(cls, method_name, cls.convert_default) + + return convert_method(obj, dtype, pandas_type) + + @classmethod + def convert_default(cls, s, dtype, pandas_type): + if pandas_type == object: + func = lambda x: x if x is pd.NA else dt.normalize(dtype, x) + return s.map(func, na_action="ignore").astype(pandas_type) + else: + return s.astype(pandas_type) + + @classmethod + def convert_Integer(cls, s, dtype, pandas_type): + if pdt.is_datetime64_any_dtype(s.dtype): + return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) + else: + return s.astype(pandas_type, errors="ignore") + + convert_SignedInteger = convert_UnsignedInteger = convert_Integer + convert_Int64 = convert_Int32 = convert_Int16 = convert_Int8 = convert_SignedInteger + convert_UInt64 = ( + convert_UInt32 + ) = convert_UInt16 = 
convert_UInt8 = convert_UnsignedInteger + + @classmethod + def convert_Floating(cls, s, dtype, pandas_type): + if pdt.is_datetime64_any_dtype(s.dtype): + return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) + else: + return s.astype(pandas_type, errors="ignore") + + convert_Float64 = convert_Float32 = convert_Float16 = convert_Floating + + @classmethod + def convert_Timestamp(cls, s, dtype, pandas_type): + if isinstance(dtype, pd.DatetimeTZDtype): + return s.dt.tz_convert(dtype.timezone) + elif pdt.is_datetime64_dtype(s.dtype): + return s.dt.tz_localize(dtype.timezone) + elif pdt.is_numeric_dtype(s.dtype): + return pd.to_datetime(s, unit="s").dt.tz_localize(dtype.timezone) + else: + try: + return s.astype(pandas_type) + except TypeError: + try: + return pd.to_datetime(s).dt.tz_convert(dtype.timezone) + except TypeError: + return pd.to_datetime(s).dt.tz_localize(dtype.timezone) + + @classmethod + def convert_Date(cls, s, dtype, pandas_type): + if isinstance(s.dtype, pd.DatetimeTZDtype): + s = s.dt.tz_convert("UTC").dt.tz_localize(None) + elif pdt.is_numeric_dtype(s.dtype): + s = pd.to_datetime(s, unit="D") + else: + s = pd.to_datetime(s).astype(pandas_type, errors="ignore") + + return s.dt.normalize() + + @classmethod + def convert_String(cls, s, dtype, pandas_type): + # TODO(kszucs): should switch to the new pandas string type and convert + # object columns using s.convert_dtypes() method + return s.map(str, na_action="ignore").astype(object) diff --git a/ibis/backends/pandas/core.py b/ibis/backends/pandas/core.py deleted file mode 100644 index ef29b2bb29cc..000000000000 --- a/ibis/backends/pandas/core.py +++ /dev/null @@ -1,605 +0,0 @@ -"""The pandas backend. - -The pandas backend is a departure from the typical ibis backend in that it -doesn't compile to anything, and the execution of the ibis expression is under -the purview of ibis itself rather than executing SQL on a server. - -Design ------- -The pandas backend uses a technique called `multiple dispatch -`_, implemented in a -third-party open source library called `multipledispatch -`_. - -Multiple dispatch is a generalization of standard single-dispatch runtime -polymorphism to multiple arguments. - -Compilation ------------ -This is a no-op because we execute ibis expressions directly. - -Execution ---------- -Execution is divided into different dispatched functions, each arising from -a different use case. - -A top level function `execute` exists to provide the API for executing an ibis -expression against in-memory data. - -The general flow of execution is: - -:: - If the current operation is in scope: - return it - Else: - execute the arguments of the current node - - execute the current node with its executed arguments - -Specifically, execute is comprised of a series of steps that happen at -different times during the loop. - -1. ``compute_time_context`` ---------------------------- -First, at the beginning of the main execution loop, ``compute_time_context`` is -called. This function computes time contexts, and pass them to all children of -the current node. These time contexts could be used in later steps to get data. -This is essential for time series Table, and related operations that adjust -time context, such as window, asof_join, etc. - -By default, this function simply pass the unchanged time context to all -children nodes. - - -2. ``pre_execute`` ------------------- -Second, ``pre_execute`` is called. 
-This function serves a similar purpose to ``data_preload``, the key difference -being that ``pre_execute`` is called *every time* there's a call to execute. - -By default this function does nothing. - -3. ``execute_node`` -------------------- - -Then, when an expression is ready to be evaluated we call -:func:`~ibis.backends.pandas.core.execute` on the expressions arguments and -then :func:`~ibis.backends.pandas.dispatch.execute_node` on the expression -with its now-materialized arguments. - -4. ``post_execute`` -------------------- -The final step--``post_execute``--is called immediately after the previous call -to ``execute_node`` and takes the instance of the -:class:`~ibis.expr.operations.Node` just computed and the result of the -computation. - -The purpose of this function is to allow additional computation to happen in -the context of the current level of the execution loop. You might be wondering -That may sound vague, so let's look at an example. - -Let's say you want to take a three day rolling average, and you want to include -3 days of data prior to the first date of the input. You don't want to see that -data in the result for a few reasons, one of which is that it would break the -contract of window functions: given N rows of input there are N rows of output. - -Defining a ``post_execute`` rule for :class:`~ibis.expr.operations.Window` -allows you to encode such logic. One might want to implement this using -:class:`~ibis.expr.operations.ScalarParameter`, in which case the ``scope`` -passed to ``post_execute`` would be the bound values passed in at the time the -``execute`` method was called. - - -Scope ------ -Scope is used across the execution phases, it iss a map that maps Ibis -operators to actual data. It is used to cache data for calculated ops. It is -an optimization to reused executed results. - -With time context included, the key is op associated with each expression; -And scope value is another key-value map: -- value: pd.DataFrame or pd.Series that is the result of executing key op -- timecontext: of type TimeContext, the time context associated with the data -stored in value - -See ibis.common.scope for details about the implementation. 
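The dispatch style described above can be illustrated with a toy example built directly on the standalone ``multipledispatch`` library (the node classes here are made up, not the backend's real operation types)::

    from multipledispatch import Dispatcher

    class Node:
        pass

    class Literal(Node):
        def __init__(self, value):
            self.value = value

    class Add(Node):
        pass

    execute_node = Dispatcher("execute_node")

    @execute_node.register(Literal)
    def execute_literal(node):
        return node.value

    @execute_node.register(Add, int, int)
    def execute_add(node, left, right):
        return left + right

    # execute the arguments first, then the node with its materialized arguments
    left = execute_node(Literal(1))
    right = execute_node(Literal(2))
    print(execute_node(Add(), left, right))  # 3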
-""" - -from __future__ import annotations - -import datetime -import functools -import numbers -from typing import TYPE_CHECKING, Any, Callable - -import numpy as np -import pandas as pd -from multipledispatch import Dispatcher - -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base import BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import TimeContext, canonicalize_context -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.dispatch import ( - execute_literal, - execute_node, - post_execute, - pre_execute, -) -from ibis.backends.pandas.trace import trace - -if TYPE_CHECKING: - from collections.abc import Iterable, Mapping - -integer_types = np.integer, int -floating_types = (numbers.Real,) -numeric_types = integer_types + floating_types -boolean_types = bool, np.bool_ -fixed_width_types = numeric_types + boolean_types -date_types = (datetime.date,) -time_types = (datetime.time,) -timestamp_types = pd.Timestamp, datetime.datetime, np.datetime64 -timedelta_types = pd.Timedelta, datetime.timedelta, np.timedelta64 -temporal_types = date_types + time_types + timestamp_types + timedelta_types -scalar_types = fixed_width_types + temporal_types -simple_types = scalar_types + (str, type(None)) - - -@functools.singledispatch -def is_computable_input(arg): - """All inputs are not computable without a specific override.""" - return False - - -@is_computable_input.register(BaseBackend) -@is_computable_input.register(ops.Node) -@is_computable_input.register(dt.DataType) -@is_computable_input.register(type(None)) -@is_computable_input.register(tuple) -def is_computable_input_arg(arg): - """Return whether `arg` is a valid computable argument.""" - return True - - -# Register is_computable_input for each scalar type (int, float, date, etc). -# We use consume here to avoid leaking the iteration variable into the module. -ibis.util.consume( - is_computable_input.register(t)(is_computable_input_arg) for t in scalar_types -) - - -def execute_with_scope( - node: ops.Node, - scope: Scope, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - clients=None, - **kwargs: Any, -): - """Execute an expression `expr`, with data provided in `scope`. - - Parameters - ---------- - node - The operation node to execute. - scope - A Scope class, with dictionary mapping `ibis.expr.operations.Node` - subclass instances to concrete data such as a pandas DataFrame. - timecontext - A tuple of (begin, end) that is passed from parent Node to children - see [timecontext.py](ibis/backends/pandas/execution/timecontext.py) for - detailed usage for this time context. - aggcontext - Aggregation context - clients - Iterable of clients - kwargs - Keyword arguments - """ - # Call pre_execute, to allow clients to intercept the expression before - # computing anything *and* before associating leaf nodes with data. This - # allows clients to provide their own data for each leaf. 
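A toy sketch of that hook (all names hypothetical): before the main loop runs, leaf table nodes are associated with concrete data so the loop finds them already in scope::

    import pandas as pd

    class TableNode:
        def __init__(self, name):
            self.name = name

    def pre_execute(node, backend_data, scope):
        # seed the scope with data for leaf tables the client already holds
        if isinstance(node, TableNode) and node.name in backend_data:
            scope = {**scope, node: backend_data[node.name]}
        return scope

    events = TableNode("events")
    scope = pre_execute(events, {"events": pd.DataFrame({"a": [1, 2, 3]})}, {})
    print(events in scope)  # True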
- if clients is None: - clients, _ = node.to_expr()._find_backends() - - if aggcontext is None: - aggcontext = agg_ctx.Summarize() - - pre_executed_scope = pre_execute( - node, - *clients, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - **kwargs, - ) - new_scope = scope.merge_scope(pre_executed_scope) - result = execute_until_in_scope( - node, - new_scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - # XXX: we *explicitly* pass in scope and not new_scope here so that - # post_execute sees the scope of execute_with_scope, not the scope of - # execute_until_in_scope - post_execute_=functools.partial( - post_execute, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ), - **kwargs, - ).get_value(node, timecontext) - return result - - -@trace -def execute_until_in_scope( - node, - scope: Scope, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - clients: Iterable | None = None, - post_execute_: Callable | None = None, - **kwargs: Any, -) -> Scope: - """Execute until our op is in `scope`.""" - # these should never be None - assert aggcontext is not None, "aggcontext is None" - assert clients is not None, "clients is None" - assert post_execute_ is not None, "post_execute_ is None" - - # base case: our op has been computed (or is a leaf data node), so - # return the corresponding value - if scope.get_value(node, timecontext) is not None: - return scope - if isinstance(node, ops.Literal): - # special case literals to avoid the overhead of dispatching - # execute_node - return Scope( - { - node: execute_literal( - node, - node.value, - node.dtype, - aggcontext=aggcontext, - **kwargs, - ) - }, - timecontext, - ) - - # figure out what arguments we're able to compute on based on the - # expressions inputs. things like expressions, None, and scalar types are - # computable whereas ``list``s are not - computable_args = [ - arg for arg in get_node_arguments(node) if is_computable_input(arg) - ] - - # pre_executed_states is a list of states with same the length of - # computable_args, these states are passed to each arg - if timecontext: - arg_timecontexts = compute_time_context( - node, - num_args=len(computable_args), - timecontext=timecontext, - clients=clients, - scope=scope, - ) - else: - arg_timecontexts = [None] * len(computable_args) - - pre_executed_scope = pre_execute( - node, - *clients, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - **kwargs, - ) - - new_scope = scope.merge_scope(pre_executed_scope) - - # Short circuit: if pre_execute puts op in scope, then we don't need to - # execute its computable_args - if new_scope.get_value(node, timecontext) is not None: - return new_scope - - # recursively compute each node's arguments until we've changed type. - # compute_time_context should return with a list with the same length - # as computable_args, the two lists will be zipping together for - # further execution - if len(arg_timecontexts) != len(computable_args): - raise com.IbisError( - "arg_timecontexts differ with computable_arg in length " - f"for type:\n{type(node).__name__}." 
- ) - - scopes = [ - execute_until_in_scope( - arg, - new_scope, - timecontext=timecontext, - aggcontext=aggcontext, - post_execute_=post_execute_, - clients=clients, - **kwargs, - ) - if isinstance(arg, ops.Node) - else Scope({arg: arg}, timecontext) - for (arg, timecontext) in zip(computable_args, arg_timecontexts) - ] - - # if we're unable to find data then raise an exception - if not scopes and computable_args: - raise com.UnboundExpressionError(f"Unable to find data for node:\n{node!r}") - - # there should be exactly one dictionary per computable argument - assert len(computable_args) == len(scopes) - - new_scope = new_scope.merge_scopes(scopes) - # pass our computed arguments to this node's execute_node implementation - data = [ - new_scope.get_value(arg, timecontext) if isinstance(arg, ops.Node) else arg - for (arg, timecontext) in zip(computable_args, arg_timecontexts) - ] - result = execute_node( - node, - *data, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ) - computed = post_execute_( - node, result, timecontext=timecontext, aggcontext=aggcontext, **kwargs - ) - return Scope({node: computed}, timecontext) - - -execute = Dispatcher("execute") - - -@execute.register(ops.Node) -@trace -def main_execute( - node: ops.Node, - params: Mapping[ops.Node, Any] | None = None, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - aggcontext: agg_ctx.AggregationContext | None = None, - cache: Mapping[ops.Node, Any] | None = None, - **kwargs: Any, -): - """Execute an expression against data that are bound to it. - - If no data are bound, raise an Exception. - - Parameters - ---------- - node : ibis.expr.operations.Node - The operation node to execute - params : Mapping[ibis.expr.operations.Node, object] - The data that an unbound parameter in `node` maps to - scope : Mapping[ibis.expr.operations.Node, object] - Additional scope, mapping ibis operations to data - timecontext : Optional[TimeContext] - timecontext needed for execution - aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext] - An object indicating how to compute aggregations. For example, - a rolling mean needs to be computed differently than the mean of a - column. - cache - Mapping for storing computation results. - kwargs - Additional arguments that can potentially be used by individual node - execution - - Returns - ------- - result : Union[ - pandas.Series, pandas.DataFrame, ibis.backends.pandas.core.simple_types - ] - - Raises - ------ - ValueError - * If no data are bound to the input expression - """ - if scope is None: - scope = Scope() - - if timecontext is not None: - # convert timecontext to datetime type, if time strings are provided - timecontext = canonicalize_context(timecontext) - - if params is None: - params = {} - - if cache is None: - cache = {} - - scope = scope.merge_scope(Scope(params, timecontext)) - return execute_with_scope( - node, - scope, - timecontext=timecontext, - aggcontext=aggcontext, - cache=cache, - **kwargs, - ) - - -def execute_and_reset( - node, - params=None, - scope=None, - timecontext: TimeContext | None = None, - aggcontext=None, - **kwargs, -): - """Execute an expression against data that are bound to it. - - If no data are bound, raise an Exception. - - The difference between this function and - `ibis.backends.pandas.core.execute` is that this function resets the index - of the result, if the result has an index. 
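For instance, a grouped aggregation naturally comes back indexed by the grouping keys; resetting the index folds them back into ordinary columns::

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 3]})
    by_key = df.groupby("key").sum()  # result is indexed by ``key``
    print(by_key.reset_index())       # ``key`` becomes a regular column again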
- - Parameters - ---------- - node : ibis.expr.operations.Node - The operation node to execute - params : Mapping[ibis.expr.operation.Node, object] - The data that an unbound parameter in `node` maps to - scope : Mapping[ibis.expr.operations.Node, object] - Additional scope, mapping ibis operations to data - timecontext : Optional[TimeContext] - timecontext needed for execution - aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext] - An object indicating how to compute aggregations. For example, - a rolling mean needs to be computed differently than the mean of a - column. - kwargs : Dict[str, object] - Additional arguments that can potentially be used by individual node - execution - - Returns - ------- - pandas.Series | pandas.DataFrame | ibis.backends.pandas.core.simple_types - Result of execution - - Raises - ------ - ValueError - * If no data are bound to the input expression - """ - result = execute( - node, - params=params, - scope=scope, - timecontext=timecontext, - aggcontext=aggcontext, - **kwargs, - ) - return _apply_schema(node, result) - - -def _apply_schema(op: ops.Node, result: pd.DataFrame | pd.Series): - from ibis.formats.pandas import PandasData - - assert isinstance(op, ops.Node), type(op) - if isinstance(result, pd.DataFrame): - df = result.reset_index().loc[:, list(op.schema.names)] - return PandasData.convert_table(df, op.schema) - elif isinstance(result, pd.Series): - schema = op.to_expr().as_table().schema() - df = PandasData.convert_table(result.to_frame(), schema) - return df.iloc[:, 0].reset_index(drop=True) - else: - return result - - -compute_time_context = Dispatcher( - "compute_time_context", - doc="""Compute the time context for a node in execution. - -Notes ------ -For a given node, return with a list of timecontext that are going to be -passed to its children nodes. - -Time context is useful when data is not uniquely defined by op tree. For example, -a table `t` can represent the query `SELECT count(a) FROM table`, but the -result of that is different with time context `(pd.Timestamp("20190101"), -pd.Timestamp("20200101"))` vs `(pd.Timestamp("20200101"), -pd.Timestamp("20210101“))` because what data is in `table` also depends on -the time context. Such context may be different for different nodes, that is, -each node may have a different time context. - -This function computes attributes that are going to be used in execution and -passes these attributes to child nodes. - -Parameters ----------- -clients : List[ibis.backends.base.BaseBackend] - backends for execution -timecontext : Optional[TimeContext] - begin and end time context needed for execution - -Returns -------- -List[Optional[TimeContext]] - A list of timecontexts for children nodes of the current node. Note that - timecontext are calculated for children nodes of computable args only. - The length of the return list is same of the length of computable inputs. 
- See `computable_args` in `execute_until_in_scope` -""", -) - - -@compute_time_context.register(ops.Node) -def compute_time_context_default( - node: ops.Node, - scope: Scope, - timecontext: TimeContext | None = None, - **kwargs, -): - return [timecontext for arg in get_node_arguments(node) if is_computable_input(arg)] - - -get_node_arguments = Dispatcher("get_node_arguments") - - -@get_node_arguments.register(ops.Node) -def get_node_arguments_default(node): - return node.args - - -@get_node_arguments.register(ops.ScalarParameter) -def get_node_arguments_parameter(node): - return () - - -@get_node_arguments.register(ops.DatabaseTable) -def get_node_arguments_table(node): - return (node.source,) - - -@get_node_arguments.register(ops.DropNa) -def get_node_arguments_dropna(node): - return (node.table,) - - -@get_node_arguments.register(ops.Selection) -def get_node_arguments_selection(node): - return (node.table,) - - -@get_node_arguments.register(ops.Aggregation) -def get_node_arguments_aggregation(node): - return (node.table,) - - -@get_node_arguments.register(ops.WindowFunction) -def get_node_arguments_window(node): - return get_node_arguments(node.func)[:1] - - -@get_node_arguments.register( - ( - ops.ElementWiseVectorizedUDF, - ops.ReductionVectorizedUDF, - ops.AnalyticVectorizedUDF, - ) -) -def get_node_arguments_udf(node): - return node.func_args diff --git a/ibis/backends/pandas/dispatch.py b/ibis/backends/pandas/dispatch.py deleted file mode 100644 index b5e080ade3bb..000000000000 --- a/ibis/backends/pandas/dispatch.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -from functools import partial - -from multipledispatch import Dispatcher - -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base import BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.trace import TraceTwoLevelDispatcher - -# Individual operation execution -execute_node = TraceTwoLevelDispatcher( - "execute_node", - doc=( - "Execute an individual operation given the operation and its computed " - "arguments" - ), -) - - -@execute_node.register(ops.Node, [object]) -def raise_unknown_op(node, *args, **kwargs): - signature = ", ".join(type(arg).__name__ for arg in args) - raise com.OperationNotDefinedError( - "Operation is not implemented for this backend with " - f"signature: execute_node({type(node).__name__}, {signature})" - ) - - -@execute_node.register(ops.TableNode) -def raise_unknown_table_node(node, **kwargs): - raise com.UnboundExpressionError( - f"Node of type {type(node).__name__!r} has no data bound to it. " - "You probably tried to execute an expression without a data " - "source." - ) - - -pre_execute = Dispatcher( - "pre_execute", - doc="""\ -Given a node, compute a (possibly partial) scope prior to standard execution. - -Notes ------ -This function is useful if parts of the tree structure need to be executed at -the same time or if there are other reasons to need to interrupt the regular -depth-first traversal of the tree. 
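The `pre_execute`/`post_execute` pair described here forms a hook protocol around the depth-first traversal: `pre_execute` may contribute a (possibly partial) scope before a node's arguments are computed, and `post_execute` can transform the computed result. A rough standalone illustration of that protocol, not the code in this patch (`execute_with_hooks` and the plain-dict scope are stand-ins for the real `Scope` machinery):

from multipledispatch import Dispatcher

pre_execute = Dispatcher("pre_execute")
post_execute = Dispatcher("post_execute")


@pre_execute.register(object)
def pre_execute_default(node, **kwargs):
    return {}            # default: contribute nothing to the scope


@post_execute.register(object, object)
def post_execute_default(node, result, **kwargs):
    return result        # default: pass the computed result through unchanged


def execute_with_hooks(node, compute, scope=None):
    """Run `compute(node)` unless pre_execute already placed `node` in scope."""
    scope = dict(scope or {})
    scope.update(pre_execute(node))      # hook: seed a partial scope
    if node in scope:                    # short circuit, as in execute_until_in_scope
        return scope[node]
    return post_execute(node, compute(node))


assert execute_with_hooks("answer", compute=lambda n: 42) == 42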
-""", -) - - -# Default returns an empty scope -@pre_execute.register(ops.Node) -@pre_execute.register(ops.Node, BaseBackend) -def pre_execute_default(node, *clients, **kwargs): - return Scope() - - -# Merge the results of all client pre-execution with scope -@pre_execute.register(ops.Node, [BaseBackend]) -def pre_execute_multiple_clients(node, *clients, scope=None, **kwargs): - scope = scope.merge_scopes( - list(map(partial(pre_execute, node, scope=scope, **kwargs), clients)) - ) - return scope - - -execute_literal = Dispatcher( - "execute_literal", - doc="""\ -Special case literal execution to avoid the dispatching overhead of -``execute_node``. - -Parameters ----------- -op : ibis.expr.operations.Node -value : object - The literal value of the object, e.g., int, float. -datatype : ibis.expr.datatypes.DataType - Used to specialize on expressions whose underlying value is of a different - type than its would-be type. For example, interval values are represented - by an integer. -""", -) - - -post_execute = Dispatcher( - "post_execute", - doc="""\ -Execute code on the result of a computation. - -Parameters ----------- -op : ibis.expr.operations.Node - The operation that was just executed -data : object - The result of the computation -""", -) - - -@post_execute.register(ops.Node, object) -def post_execute_default(op, data, **kwargs): - return data - - -execute = Dispatcher("execute") diff --git a/ibis/backends/pandas/dispatcher.py b/ibis/backends/pandas/dispatcher.py deleted file mode 100644 index 6240c0106c3f..000000000000 --- a/ibis/backends/pandas/dispatcher.py +++ /dev/null @@ -1,113 +0,0 @@ -from __future__ import annotations - -from multipledispatch import Dispatcher - - -class TwoLevelDispatcher(Dispatcher): - """A `multipledispatch.Dispatcher` with two levels of dispatching. - - The major change is that this class no longer trigger reorder in - dispatch_iter. Because the majority of the slowness is happening - in reorder, this implementation makes dispatch_iter faster. - Instead, this implementation will trigger reorder in the meta dispatcher - and second level dispatcher. Because the number of registered signatures - for each dispatcher is much smaller in this implementation (In pandas - backend, the number of signatures in one level implementation is - O(1000), and the max number of signatures for the meta dispatcher and - second level dispatcher is O(100)), the overall dispatch_iter is faster. - - This implementation consist of three Dispatcher instance: - - (1) This dispatcher, or the instance of this class itself. This class - inherits Dispatcher to avoid duplicating __call__, cache, ambiguities - detection, as well as properties like ordering and funcs. - - (2) First level dispatcher, aka, meta dispatcher. This is the dispatcher - is used to dispatch to the second level dispatcher using the type of the - first arg. - - (3) Second level dispatcher. This is the actual dispatcher used for linear - searching of matched function given type of args. - - Implementation notes: - - (1) register: - This method will now (a) create the second level dispatcher - if missing and register it with the meta dispatcher. (b) return a function - decorator that will register with all the second level dispatcher. Note - that multiple second level dispatcher could be registered with because this - is supported: - - @foo.register((C1, C2), ...) - - The decorator will also register with this dispatcher so that func and - ordering works properly. 
- - (2) dispatcher_iter - Instead of searching through self.ordering, this method now searches - through: - (a) dispatch_iter of the meta dispatcher (to find matching second level - dispatcher). - (b) for each second level dispatcher, searches through its dispatch_iter. - Because dispatch_iter of meta dispatcher and second level dispatcher - searches through registered functions in proper order (from subclasses to - base classes). - - (3) ambiguity detection, ordering, and funcs - Because this dispatcher has the same func and ordering property as - multipledispatch.Dispatcher. We can completely reuse the ambiguity - detection logic of Dispatcher. Note: - (a) we never actually linear search through ordering of this dispatcher - for dispatching. It's only used for ambiguity detection. - (b) deleting an entry from func of this dispatcher (i.e. del - dispatcher.func[A, B]) does not unregister it. Entries from the second - level dispatcher also needs to be deleted. This is OK because it is not - public API. - - Difference in behavior: - (1) ambiguity detection - Because this implementation doesn't not trigger total reorder of signatures - in dispatch_iter, ambiguity warning will trigger when user calls - "ordering", instead of "dispatch". - """ - - def __init__(self, name, doc=None): - super().__init__(name, doc) - self._meta_dispatcher = Dispatcher(f"{name}_meta") - - def register(self, *types, **kwargs): - type0 = types[0] - - if isinstance(type0, type): - type0 = [type0] - - dispatchers = [] - - for t in type0: - if (t,) in self._meta_dispatcher.funcs: - dispatcher = self._meta_dispatcher.funcs[(t,)] - else: - dispatcher = Dispatcher(f"{self.name}_{t.__name__}") - self._meta_dispatcher.register(t)(dispatcher) - - dispatchers.append((t, dispatcher)) - - def _(func): - self.add(types, func, **kwargs) - for t, dispatcher in dispatchers: - dispatcher.add((t, *types[1:]), func, **kwargs) - return func - - return _ - - def __delitem__(self, types): - del self.funcs[types] - del self._meta_dispatcher.funcs[types[:1]].funcs[types] - if not self._meta_dispatcher.funcs[types[:1]].funcs: - del self._meta_dispatcher.funcs[types[1:]] - - def dispatch_iter(self, *types): - for dispatcher in self._meta_dispatcher.dispatch_iter(types[0]): - func = dispatcher.dispatch(*types) - if func is not None: - yield func diff --git a/ibis/backends/pandas/execution/__init__.py b/ibis/backends/pandas/execution/__init__.py deleted file mode 100644 index 5a79d5166b93..000000000000 --- a/ibis/backends/pandas/execution/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -from ibis.backends.pandas.execution.arrays import * # noqa: F403 -from ibis.backends.pandas.execution.decimal import * # noqa: F403 -from ibis.backends.pandas.execution.generic import * # noqa: F403 -from ibis.backends.pandas.execution.join import * # noqa: F403 -from ibis.backends.pandas.execution.maps import * # noqa: F403 -from ibis.backends.pandas.execution.selection import * # noqa: F403 -from ibis.backends.pandas.execution.strings import * # noqa: F403 -from ibis.backends.pandas.execution.structs import * # noqa: F403 -from ibis.backends.pandas.execution.temporal import * # noqa: F403 -from ibis.backends.pandas.execution.timecontext import * # noqa: F403 -from ibis.backends.pandas.execution.window import * # noqa: F403 diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py deleted file mode 100644 index 20461f022241..000000000000 --- a/ibis/backends/pandas/execution/arrays.py +++ 
/dev/null @@ -1,172 +0,0 @@ -from __future__ import annotations - -import itertools -import operator -from functools import partial -from typing import TYPE_CHECKING, Any - -import numpy as np -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.dispatch import execute_node - -if TYPE_CHECKING: - from collections.abc import Collection - - -@execute_node.register(ops.Array, tuple) -def execute_array(op, cols, **kwargs): - vals = [execute(arg, **kwargs) for arg in cols] - length = next((len(v) for v in vals if isinstance(v, pd.Series)), None) - - if length is None: - return vals - - def ensure_series(v): - if isinstance(v, pd.Series): - return v - else: - return pd.Series(v, index=range(length)) - - # pd.concat() can only handle array-likes. - # If we're given a scalar, we need to broadcast it as a Series. - df = pd.concat([ensure_series(v) for v in vals], axis=1) - return df.apply(lambda row: np.array(row, dtype=object), axis=1) - - -@execute_node.register(ops.ArrayLength, pd.Series) -def execute_array_length(op, data, **kwargs): - return data.apply(len) - - -@execute_node.register(ops.ArrayLength, (list, np.ndarray)) -def execute_array_length_scalar(op, data, **kwargs): - return len(data) - - -@execute_node.register(ops.ArraySlice, pd.Series, int, (int, type(None))) -def execute_array_slice(op, data, start, stop, **kwargs): - return data.apply(operator.itemgetter(slice(start, stop))) - - -@execute_node.register(ops.ArraySlice, (list, np.ndarray), int, (int, type(None))) -def execute_array_slice_scalar(op, data, start, stop, **kwargs): - return data[start:stop] - - -@execute_node.register(ops.ArrayIndex, pd.Series, int) -def execute_array_index(op, data, index, **kwargs): - return data.apply( - lambda array, index=index: ( - array[index] if -len(array) <= index < len(array) else None - ) - ) - - -@execute_node.register(ops.ArrayIndex, (list, np.ndarray), int) -def execute_array_index_scalar(op, data, index, **kwargs): - try: - return data[index] - except IndexError: - return None - - -@execute_node.register(ops.ArrayContains, (list, np.ndarray), object) -def execute_node_contains_value_array(op, haystack, needle, **kwargs): - return needle in haystack - - -def _concat_iterables_to_series(*iters: Collection[Any]) -> pd.Series: - """Concatenate two collections to create a Series. - - The two collections are assumed to have the same length. - - Used for ArrayConcat implementation. - """ - first, *rest = iters - assert all(len(series) == len(first) for series in rest) - # Doing the iteration using `map` is much faster than doing the iteration - # using `Series.apply` due to Pandas-related overhead. - return pd.Series(map(lambda *args: np.concatenate(args), first, *rest)) - - -@execute_node.register(ops.ArrayConcat, tuple) -def execute_array_concat(op, args, **kwargs): - return execute_node(op, *map(partial(execute, **kwargs), args), **kwargs) - - -@execute_node.register(ops.ArrayConcat, pd.Series, pd.Series, [pd.Series]) -def execute_array_concat_series(op, first, second, *args, **kwargs): - return _concat_iterables_to_series(first, second, *args) - - -@execute_node.register( - ops.ArrayConcat, (list, np.ndarray), pd.Series, [(pd.Series, list, np.ndarray)] -) -def execute_array_concat_mixed_left(op, left, right, *args, **kwargs): - # ArrayConcat given a column (pd.Series) and a scalar (np.ndarray). - # We will broadcast the scalar to the length of the column. 
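# Illustrative aside, not from the original arrays.py: np.tile does the
# broadcasting here, turning the scalar array into one identical row per
# element of the Series, e.g.
#
#     np.tile([1, 2], (3, 1))  ->  array([[1, 2],
#                                         [1, 2],
#                                         [1, 2]])
#
# so `_concat_iterables_to_series` can then concatenate row-for-row with the
# column.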
- # Broadcast `left` to the length of `right` - left = np.tile(left, (len(right), 1)) - return _concat_iterables_to_series(left, right) - - -@execute_node.register( - ops.ArrayConcat, pd.Series, (list, np.ndarray), [(pd.Series, list, np.ndarray)] -) -def execute_array_concat_mixed_right(op, left, right, *args, **kwargs): - # Broadcast `right` to the length of `left` - right = np.tile(right, (len(left), 1)) - return _concat_iterables_to_series(left, right) - - -@execute_node.register( - ops.ArrayConcat, (list, np.ndarray), (list, np.ndarray), [(list, np.ndarray)] -) -def execute_array_concat_scalar(op, left, right, *args, **kwargs): - return np.concatenate([left, right, *args]) - - -@execute_node.register(ops.ArrayRepeat, pd.Series, int) -def execute_array_repeat(op, data, n, **kwargs): - # Negative n will be treated as 0 (repeat will produce empty array) - n = max(n, 0) - return pd.Series(np.tile(arr, n) for arr in data) - - -@execute_node.register(ops.ArrayRepeat, (list, np.ndarray), int) -def execute_array_repeat_scalar(op, data, n, **kwargs): - # Negative n will be treated as 0 (repeat will produce empty array) - return np.tile(data, max(n, 0)) - - -@execute_node.register(ops.ArrayCollect, pd.Series, (type(None), pd.Series)) -def execute_array_collect(op, data, where, aggcontext=None, **kwargs): - return aggcontext.agg(data.loc[where] if where is not None else data, np.array) - - -@execute_node.register(ops.ArrayCollect, SeriesGroupBy, (type(None), pd.Series)) -def execute_array_collect_groupby(op, data, where, aggcontext=None, **kwargs): - return aggcontext.agg( - ( - data.obj.loc[where].groupby(data.grouping.grouper) - if where is not None - else data - ), - np.array, - ) - - -@execute_node.register(ops.Unnest, pd.Series) -def execute_unnest(op, data, **kwargs): - return data[data.map(lambda v: bool(len(v)), na_action="ignore")].explode() - - -@execute_node.register(ops.ArrayFlatten, pd.Series) -def execute_array_flatten(op, data, **kwargs): - return data.map( - lambda v: list(itertools.chain.from_iterable(v)), na_action="ignore" - ) diff --git a/ibis/backends/pandas/execution/constants.py b/ibis/backends/pandas/execution/constants.py deleted file mode 100644 index 0e543561a869..000000000000 --- a/ibis/backends/pandas/execution/constants.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Constants for the pandas backend.""" - -from __future__ import annotations - -import operator - -import numpy as np -import pandas as pd - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.util - -JOIN_TYPES = { - ops.LeftJoin: "left", - ops.RightJoin: "right", - ops.InnerJoin: "inner", - ops.OuterJoin: "outer", -} - - -LEFT_JOIN_SUFFIX = f"_ibis_left_{ibis.util.guid()}" -RIGHT_JOIN_SUFFIX = f"_ibis_right_{ibis.util.guid()}" -JOIN_SUFFIXES = LEFT_JOIN_SUFFIX, RIGHT_JOIN_SUFFIX -ALTERNATE_SUFFIXES = { - LEFT_JOIN_SUFFIX: RIGHT_JOIN_SUFFIX, - RIGHT_JOIN_SUFFIX: LEFT_JOIN_SUFFIX, -} - - -IBIS_TYPE_TO_PANDAS_TYPE: dict[dt.DataType, type | str] = { - dt.float16: np.float16, - dt.float32: np.float32, - dt.float64: np.float64, - dt.float32: np.float32, - dt.float64: np.float64, - dt.int8: np.int8, - dt.int16: np.int16, - dt.int32: np.int32, - dt.int64: np.int64, - dt.string: str, - dt.timestamp: "datetime64[ns]", - dt.boolean: np.bool_, - dt.json: str, - dt.float16.copy(nullable=False): np.float16, - dt.float32.copy(nullable=False): np.float32, - dt.float64.copy(nullable=False): np.float64, - dt.float32.copy(nullable=False): np.float32, - dt.float64.copy(nullable=False): np.float64, - 
dt.int8.copy(nullable=False): np.int8, - dt.int16.copy(nullable=False): np.int16, - dt.int32.copy(nullable=False): np.int32, - dt.int64.copy(nullable=False): np.int64, - dt.string.copy(nullable=False): str, - dt.timestamp.copy(nullable=False): "datetime64[ns]", - dt.boolean.copy(nullable=False): np.bool_, - dt.json.copy(nullable=False): str, -} - - -IBIS_TO_PYTHON_LITERAL_TYPES = { - dt.boolean: bool, - dt.float64: float, - dt.float32: float, - dt.int64: int, - dt.int32: int, - dt.int16: int, - dt.int8: int, - dt.string: str, - dt.date: lambda x: pd.Timestamp(x).to_pydatetime().date(), - dt.boolean.copy(nullable=False): bool, - dt.float64.copy(nullable=False): float, - dt.float32.copy(nullable=False): float, - dt.int64.copy(nullable=False): int, - dt.int32.copy(nullable=False): int, - dt.int16.copy(nullable=False): int, - dt.int8.copy(nullable=False): int, - dt.string.copy(nullable=False): str, - dt.date.copy(nullable=False): lambda x: pd.Timestamp(x).to_pydatetime().date(), -} - - -BINARY_OPERATIONS = { - ops.Greater: operator.gt, - ops.Less: operator.lt, - ops.LessEqual: operator.le, - ops.GreaterEqual: operator.ge, - ops.Equals: operator.eq, - ops.NotEquals: operator.ne, - ops.And: operator.and_, - ops.Or: operator.or_, - ops.Xor: operator.xor, - ops.Add: operator.add, - ops.Subtract: operator.sub, - ops.Multiply: operator.mul, - ops.Divide: operator.truediv, - ops.FloorDivide: operator.floordiv, - ops.Modulus: operator.mod, - ops.Power: operator.pow, - ops.IdenticalTo: lambda x, y: (x == y) | (pd.isnull(x) & pd.isnull(y)), - ops.BitwiseXor: lambda x, y: np.bitwise_xor(x, y), - ops.BitwiseOr: lambda x, y: np.bitwise_or(x, y), - ops.BitwiseAnd: lambda x, y: np.bitwise_and(x, y), - ops.BitwiseLeftShift: lambda x, y: np.left_shift(x, y), - ops.BitwiseRightShift: lambda x, y: np.right_shift(x, y), -} diff --git a/ibis/backends/pandas/execution/decimal.py b/ibis/backends/pandas/execution/decimal.py deleted file mode 100644 index ac34bea4e8a2..000000000000 --- a/ibis/backends/pandas/execution/decimal.py +++ /dev/null @@ -1,135 +0,0 @@ -from __future__ import annotations - -import decimal -import math -import numbers - -import numpy as np -import pandas as pd - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node -from ibis.common.exceptions import OperationNotDefinedError - - -@execute_node.register(ops.Ln, decimal.Decimal) -def execute_decimal_natural_log(op, data, **kwargs): - try: - return data.ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Log, decimal.Decimal, decimal.Decimal) -def execute_decimal_log_with_decimal_base(op, data, base, **kwargs): - try: - return data.ln() / base.ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Log, decimal.Decimal, type(None)) -def execute_decimal_log_with_no_base(op, data, _, **kwargs): - return execute_decimal_natural_log(op, data, **kwargs) - - -@execute_node.register(ops.Log, decimal.Decimal, numbers.Real) -def execute_decimal_log_with_real_base(op, data, base, **kwargs): - return execute_node(op, data, decimal.Decimal(base), **kwargs) - - -@execute_node.register(ops.Log, decimal.Decimal, np.integer) -def execute_decimal_log_with_np_integer_base(op, data, base, **kwargs): - return execute_node(op, data, int(base), **kwargs) - - -@execute_node.register(ops.Log2, decimal.Decimal) -def execute_decimal_log2(op, data, **kwargs): - try: - return data.ln() / 
decimal.Decimal(2).ln() - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -# While ops.Negate is a subclass of ops.Unary, multipledispatch will be -# faster if we provide types that can potentially match the types of inputs -# exactly -@execute_node.register((ops.Unary, ops.Negate), decimal.Decimal) -def execute_decimal_unary(op, data, **kwargs): - op_type = type(op) - operation_name = op_type.__name__.lower() - function = getattr( - decimal.Decimal, - operation_name, - None, - ) - if function is None: - math_function = getattr(math, operation_name, None) - if math_function is None: - raise OperationNotDefinedError(f"{op_type.__name__} not supported") - function = lambda x: decimal.Decimal(math_function(x)) - try: - return function(data) - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - -@execute_node.register(ops.Sign, decimal.Decimal) -def execute_decimal_sign(op, data, **kwargs): - return data if not data else decimal.Decimal(1).copy_sign(data) - - -@execute_node.register(ops.Abs, decimal.Decimal) -def execute_decimal_abs(op, data, **kwargs): - return abs(data) - - -@execute_node.register(ops.Round, decimal.Decimal, (np.integer, int)) -def execute_round_decimal(op, data, places, **kwargs): - # If we only allowed Python 3, we wouldn't have to implement any of this; - # we could just call round(data, places) :( - tuple_value = data.as_tuple() - precision = len(tuple_value.digits) - integer_part_length = precision + min(tuple_value.exponent, 0) - - if places < 0: - decimal_format_string = "0.{}E+{:d}".format( - "0" * (integer_part_length - 1 + places), - max(integer_part_length + places, abs(places)), - ) - else: - decimal_format_string = "{}.{}".format("0" * integer_part_length, "0" * places) - - places = decimal.Decimal(decimal_format_string) - return data.quantize(places) - - -@execute_node.register(ops.Round, decimal.Decimal, type(None)) -def execute_round_decimal_no_places(op, data, _, **kwargs): - return np.int64(round(data)) - - -@execute_node.register(ops.Cast, pd.Series, dt.Decimal) -def execute_cast_series_to_decimal(op, data, type, **kwargs): - precision = type.precision - scale = type.scale - context = decimal.Context(prec=precision) - places = context.create_decimal( - "{}.{}".format("0" * (precision - scale), "0" * scale) - ) - return data.apply( - lambda x, context=context, places=places: ( - context.create_decimal(x).quantize(places) - ) - ) - - -@execute_node.register(ops.E) -def execute_e(op, **kwargs): - return np.e - - -@execute_node.register(ops.Pi) -def execute_pi(op, **kwargs): - return np.pi diff --git a/ibis/backends/pandas/execution/generic.py b/ibis/backends/pandas/execution/generic.py deleted file mode 100644 index 7c8b53cc2f79..000000000000 --- a/ibis/backends/pandas/execution/generic.py +++ /dev/null @@ -1,1479 +0,0 @@ -"""Execution rules for generic ibis operations.""" - -from __future__ import annotations - -import collections -import contextlib -import datetime -import decimal -import functools -import math -import numbers -import operator -from collections.abc import Mapping, Sized - -import numpy as np -import pandas as pd -import pytz -import toolz -from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy - -import ibis.common.exceptions as com -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -import ibis.expr.types as ir -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import TimeContext, get_time_col -from ibis.backends.pandas import Backend as 
PandasBackend -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.core import ( - boolean_types, - date_types, - execute, - fixed_width_types, - floating_types, - integer_types, - numeric_types, - scalar_types, - simple_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_literal, execute_node -from ibis.backends.pandas.execution import constants -from ibis.backends.pandas.execution.util import coerce_to_output, get_grouping - - -# By default return the literal value -@execute_literal.register(ops.Literal, object, dt.DataType) -def execute_node_literal_value_datatype(op, value, datatype, **kwargs): - return value - - -# Because True and 1 hash to the same value, if we have True or False in scope -# keys while executing anything that should evaluate to 1 or 0 evaluates to -# True or False respectively. This is a hack to work around that by casting the -# bool to an integer. -@execute_literal.register(ops.Literal, object, dt.Integer) -def execute_node_literal_any_integer_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return int(value) - - -@execute_literal.register(ops.Literal, object, dt.Boolean) -def execute_node_literal_any_boolean_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return bool(value) - - -@execute_literal.register(ops.Literal, object, dt.Floating) -def execute_node_literal_any_floating_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return float(value) - - -@execute_literal.register(ops.Literal, object, dt.Array) -def execute_node_literal_any_array_datatype(op, value, datatype, **kwargs): - if value is None: - return value - return np.array(value) - - -@execute_literal.register(ops.Literal, dt.DataType) -def execute_node_literal_datatype(op, datatype, **kwargs): - return op.value - - -@execute_literal.register( - ops.Literal, (*timedelta_types, str, *integer_types, type(None)), dt.Interval -) -def execute_interval_literal(op, value, dtype, **kwargs): - if value is None: - return pd.NaT - return pd.Timedelta(value, dtype.unit.short) - - -@execute_node.register(ops.Limit, pd.DataFrame, integer_types, integer_types) -def execute_limit_frame(op, data, nrows: int, offset: int, **kwargs): - return data.iloc[offset : offset + nrows] - - -@execute_node.register(ops.Limit, pd.DataFrame, type(None), integer_types) -def execute_limit_frame_no_limit(op, data, nrows: None, offset: int, **kwargs): - return data.iloc[offset:] - - -@execute_node.register(ops.Cast, SeriesGroupBy, dt.DataType) -def execute_cast_series_group_by(op, data, type, **kwargs): - result = execute_cast_series_generic(op, data.obj, type, **kwargs) - return result.groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Cast, pd.Series, dt.DataType) -def execute_cast_series_generic(op, data, type, **kwargs): - out = data.astype(constants.IBIS_TYPE_TO_PANDAS_TYPE[type]) - if type.is_integer(): - if op.arg.dtype.is_timestamp(): - return out.floordiv(int(1e9)) - elif op.arg.dtype.is_date(): - return out.floordiv(int(24 * 60 * 60 * 1e9)) - return out - - -@execute_node.register(ops.Cast, pd.Series, dt.Array) -def execute_cast_series_array(op, data, type, **kwargs): - value_type = type.value_type - numpy_type = constants.IBIS_TYPE_TO_PANDAS_TYPE.get(value_type, None) - if numpy_type is None: - raise ValueError( - "Array value type must be a primitive type " - "(e.g., number, string, or timestamp)" - ) - - def 
cast_to_array(array, numpy_type=numpy_type): - elems = [ - el if el is None else np.array(el, dtype=numpy_type).item() for el in array - ] - try: - return np.array(elems, dtype=numpy_type) - except TypeError: - return np.array(elems) - - return data.map(cast_to_array) - - -@execute_node.register(ops.Cast, list, dt.Array) -def execute_cast_list_array(op, data, type, **kwargs): - value_type = type.value_type - numpy_type = constants.IBIS_TYPE_TO_PANDAS_TYPE.get(value_type, None) - if numpy_type is None: - raise ValueError( - "Array value type must be a primitive type " - "(e.g., number, string, or timestamp)" - ) - - def cast_to_array(array, numpy_type=numpy_type): - elems = [ - el if el is None else np.array(el, dtype=numpy_type).item() for el in array - ] - try: - return np.array(elems, dtype=numpy_type) - except TypeError: - return np.array(elems) - - return cast_to_array(data) - - -@execute_node.register(ops.Cast, pd.Series, dt.Timestamp) -def execute_cast_series_timestamp(op, data, type, **kwargs): - arg = op.arg - from_type = arg.dtype - - if from_type.equals(type): # noop cast - return data - - tz = type.timezone - - if from_type.is_timestamp(): - from_tz = from_type.timezone - if tz is None and from_tz is None: - return data - elif tz is None or from_tz is None: - return data.dt.tz_localize(tz) - elif tz is not None and from_tz is not None: - return data.dt.tz_convert(tz) - elif from_type.is_date(): - return data if tz is None else data.dt.tz_localize(tz) - - if from_type.is_string() or from_type.is_integer(): - if from_type.is_integer(): - timestamps = pd.to_datetime(data.values, unit="s") - else: - timestamps = pd.to_datetime(data.values) - if getattr(timestamps.dtype, "tz", None) is not None: - method_name = "tz_convert" - else: - method_name = "tz_localize" - method = getattr(timestamps, method_name) - timestamps = method(tz) - return pd.Series(timestamps, index=data.index, name=data.name) - - raise TypeError(f"Don't know how to cast {from_type} to {type}") - - -def _normalize(values, original_index, name, timezone=None): - index = pd.DatetimeIndex(values, tz=timezone) - return pd.Series(index.normalize(), index=original_index, name=name) - - -@execute_node.register(ops.Cast, pd.Series, dt.Date) -def execute_cast_series_date(op, data, type, **kwargs): - arg = op.args[0] - from_type = arg.dtype - - if from_type.equals(type): - return data - - if from_type.is_timestamp(): - return _normalize( - data.values, data.index, data.name, timezone=from_type.timezone - ) - - if from_type.is_string(): - values = data.values - datetimes = pd.to_datetime(values) - with contextlib.suppress(TypeError): - datetimes = datetimes.tz_convert(None) - dates = _normalize(datetimes, data.index, data.name) - return pd.Series(dates, index=data.index, name=data.name) - - if from_type.is_integer(): - return pd.Series( - pd.to_datetime(data.values, unit="D").values, - index=data.index, - name=data.name, - ) - - raise TypeError(f"Don't know how to cast {from_type} to {type}") - - -@execute_node.register(ops.SortKey, pd.Series, bool) -def execute_sort_key_series(op, data, _, **kwargs): - return data - - -def call_numpy_ufunc(func, op, data, **kwargs): - if getattr(data, "dtype", None) == np.dtype(np.object_): - return data.apply(functools.partial(execute_node, op, **kwargs)) - if func is None: - raise com.OperationNotDefinedError(f"{type(op).__name__} not supported") - return func(data) - - -@execute_node.register(ops.Negate, fixed_width_types + timedelta_types) -def execute_obj_negate(op, data, **kwargs): - 
return -data - - -@execute_node.register(ops.Negate, pd.Series) -def execute_series_negate(op, data, **kwargs): - return call_numpy_ufunc(np.negative, op, data, **kwargs) - - -@execute_node.register(ops.Negate, SeriesGroupBy) -def execute_series_group_by_negate(op, data, **kwargs): - return execute_series_negate(op, data.obj, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.Unary, pd.Series) -def execute_series_unary_op(op, data, **kwargs): - op_type = type(op) - if op_type == ops.BitwiseNot: - function = np.bitwise_not - else: - function = getattr(np, op_type.__name__.lower()) - return call_numpy_ufunc(function, op, data, **kwargs) - - -@execute_node.register(ops.Acos, (pd.Series, *numeric_types)) -def execute_series_acos(_, data, **kwargs): - return np.arccos(data) - - -@execute_node.register(ops.Asin, (pd.Series, *numeric_types)) -def execute_series_asin(_, data, **kwargs): - return np.arcsin(data) - - -@execute_node.register(ops.Atan, (pd.Series, *numeric_types)) -def execute_series_atan(_, data, **kwargs): - return np.arctan(data) - - -@execute_node.register(ops.Cot, (pd.Series, *numeric_types)) -def execute_series_cot(_, data, **kwargs): - return 1.0 / np.tan(data) - - -@execute_node.register( - ops.Atan2, (pd.Series, *numeric_types), (pd.Series, *numeric_types) -) -def execute_series_atan2(_, y, x, **kwargs): - return np.arctan2(y, x) - - -@execute_node.register((ops.Cos, ops.Sin, ops.Tan), (pd.Series, *numeric_types)) -def execute_series_trig(op, data, **kwargs): - function = getattr(np, type(op).__name__.lower()) - return call_numpy_ufunc(function, op, data, **kwargs) - - -@execute_node.register(ops.Radians, (pd.Series, *numeric_types)) -def execute_series_radians(_, data, **kwargs): - return np.radians(data) - - -@execute_node.register(ops.Degrees, (pd.Series, *numeric_types)) -def execute_series_degrees(_, data, **kwargs): - return np.degrees(data) - - -@execute_node.register((ops.Ceil, ops.Floor), pd.Series) -def execute_series_ceil(op, data, **kwargs): - return_type = np.object_ if data.dtype == np.object_ else np.int64 - func = getattr(np, type(op).__name__.lower()) - return call_numpy_ufunc(func, op, data, **kwargs).astype(return_type) - - -@execute_node.register(ops.BitwiseNot, integer_types) -def execute_int_bitwise_not(op, data, **kwargs): - return np.invert(data) - - -def vectorize_object(op, arg, *args, **kwargs): - func = np.vectorize(functools.partial(execute_node, op, **kwargs)) - return pd.Series(func(arg, *args), index=arg.index, name=arg.name) - - -@execute_node.register( - ops.Log, pd.Series, (pd.Series, numbers.Real, decimal.Decimal, type(None)) -) -def execute_series_log_with_base(op, data, base, **kwargs): - if data.dtype == np.dtype(np.object_): - return vectorize_object(op, data, base, **kwargs) - - if base is None: - return np.log(data) - return np.log(data) / np.log(base) - - -@execute_node.register(ops.Ln, pd.Series) -def execute_series_natural_log(op, data, **kwargs): - if data.dtype == np.dtype(np.object_): - return data.apply(functools.partial(execute_node, op, **kwargs)) - return np.log(data) - - -@execute_node.register( - ops.Clip, - pd.Series, - (pd.Series, type(None)) + numeric_types, - (pd.Series, type(None)) + numeric_types, -) -def execute_series_clip(op, data, lower, upper, **kwargs): - return data.clip(lower=lower, upper=upper) - - -@execute_node.register( - ops.Quantile, - pd.Series, - (np.ndarray, *numeric_types), - (pd.Series, type(None)), -) -def execute_series_quantile(op, 
data, quantile, mask, aggcontext=None, **_): - return aggcontext.agg( - data if mask is None else data.loc[mask], - "quantile", - q=quantile, - ) - - -@execute_node.register(ops.Quantile, pd.Series, (np.ndarray, *numeric_types)) -def execute_series_quantile_default(op, data, quantile, aggcontext=None, **_): - return aggcontext.agg(data, "quantile", q=quantile) - - -@execute_node.register( - ops.Quantile, - SeriesGroupBy, - (np.ndarray, *numeric_types), - (SeriesGroupBy, type(None)), -) -def execute_series_group_by_quantile(op, data, quantile, mask, aggcontext=None, **_): - return aggcontext.agg( - data, - ( - "quantile" - if mask is None - else functools.partial(_filtered_reduction, mask.obj, pd.Series.quantile) - ), - q=quantile, - ) - - -@execute_node.register( - ops.MultiQuantile, - pd.Series, - (np.ndarray, *numeric_types), - (pd.Series, type(None)), -) -def execute_series_quantile_multi(op, data, quantile, mask, aggcontext=None, **_): - return np.array( - aggcontext.agg(data if mask is None else data.loc[mask], "quantile", q=quantile) - ) - - -@execute_node.register( - ops.MultiQuantile, - SeriesGroupBy, - np.ndarray, - (SeriesGroupBy, type(None)), -) -def execute_series_quantile_multi_groupby( - op, data, quantile, mask, aggcontext=None, **kwargs -): - def q(x, quantile): - result = x.quantile(quantile).tolist() - return [result for _ in range(len(x))] - - return aggcontext.agg( - data, - q if mask is None else functools.partial(_filtered_reduction, mask.obj, q), - quantile, - ) - - -@execute_node.register(ops.MultiQuantile, SeriesGroupBy, np.ndarray) -def execute_series_quantile_multi_groupby_default( - op, data, quantile, aggcontext=None, **_ -): - def q(x, quantile): - result = x.quantile(quantile).tolist() - return [result for _ in range(len(x))] - - return aggcontext.agg(data, q, quantile) - - -@execute_node.register(ops.Cast, type(None), dt.DataType) -def execute_cast_null_to_anything(op, data, type, **kwargs): - return None - - -@execute_node.register(ops.Cast, datetime.datetime, dt.String) -def execute_cast_datetime_or_timestamp_to_string(op, data, type, **kwargs): - """Cast timestamps to strings.""" - return str(data) - - -@execute_node.register(ops.Cast, datetime.datetime, dt.Int64) -def execute_cast_timestamp_to_integer(op, data, type, **kwargs): - """Cast timestamps to integers.""" - t = pd.Timestamp(data) - return pd.NA if pd.isna(t) else int(t.timestamp()) - - -@execute_node.register(ops.Cast, (np.bool_, bool), dt.Timestamp) -def execute_cast_bool_to_timestamp(op, data, type, **kwargs): - raise TypeError( - "Casting boolean values to timestamps does not make sense. If you " - "really want to cast boolean values to timestamps please cast to " - "int64 first then to timestamp: " - "value.cast('int64').cast('timestamp')" - ) - - -@execute_node.register(ops.Cast, (np.bool_, bool), dt.Interval) -def execute_cast_bool_to_interval(op, data, type, **kwargs): - raise TypeError( - "Casting boolean values to intervals does not make sense. 
If you " - "really want to cast boolean values to intervals please cast to " - "int64 first then to interval: " - "value.cast('int64').cast(ibis.expr.datatypes.Interval(...))" - ) - - -@execute_node.register(ops.Cast, integer_types, dt.Timestamp) -def execute_cast_integer_to_timestamp(op, data, type, **kwargs): - """Cast integer to timestamp.""" - return pd.Timestamp(data, unit="s", tz=type.timezone) - - -@execute_node.register(ops.Cast, str, dt.Timestamp) -def execute_cast_string_to_timestamp(op, data, type, **kwargs): - """Cast string to timestamp.""" - return pd.Timestamp(data, tz=type.timezone) - - -@execute_node.register(ops.Cast, datetime.datetime, dt.Timestamp) -def execute_cast_timestamp_to_timestamp(op, data, type, **kwargs): - """Cast timestamps to other timestamps including timezone if necessary.""" - input_timezone = data.tzinfo - target_timezone = type.timezone - - if input_timezone == target_timezone: - return data - - if input_timezone is None or target_timezone is None: - return data.astimezone( - tz=None if target_timezone is None else pytz.timezone(target_timezone) - ) - - return data.astimezone(tz=pytz.timezone(target_timezone)) - - -@execute_node.register(ops.Cast, fixed_width_types + (str,), dt.DataType) -def execute_cast_string_literal(op, data, type, **kwargs): - try: - cast_function = constants.IBIS_TO_PYTHON_LITERAL_TYPES[type] - except KeyError: - raise TypeError(f"Don't know how to cast {data!r} to type {type}") - else: - return cast_function(data) - - -@execute_node.register(ops.Cast, Mapping, dt.DataType) -def execute_cast_mapping_literal(op, data, type, **kwargs): - data = ( - (ops.Literal(k, type.key_type), ops.Literal(v, type.value_type)) - for k, v in data.items() - ) - return {execute(k, **kwargs): execute(v, **kwargs) for k, v in data} - - -@execute_node.register(ops.Round, scalar_types, (int, type(None))) -def execute_round_scalars(op, data, places, **kwargs): - return round(data, places) if places else round(data) - - -@execute_node.register(ops.Round, pd.Series, (pd.Series, np.integer, type(None), int)) -def execute_round_series(op, data, places, **kwargs): - if data.dtype == np.dtype(np.object_): - return vectorize_object(op, data, places, **kwargs) - result = data.round(places or 0) - return result if places else result.astype("int64") - - -@execute_node.register(ops.TableColumn, (pd.DataFrame, DataFrameGroupBy)) -def execute_table_column_df_or_df_groupby(op, data, **kwargs): - return data[op.name] - - -@execute_node.register(ops.Aggregation, pd.DataFrame) -def execute_aggregation_dataframe( - op, - data, - scope=None, - timecontext: TimeContext | None = None, - **kwargs, -): - assert op.metrics, "no metrics found during aggregation execution" - - if op.sort_keys: - raise NotImplementedError("sorting on aggregations not yet implemented") - - if op.predicates: - predicate = functools.reduce( - operator.and_, - ( - execute(p, scope=scope, timecontext=timecontext, **kwargs) - for p in op.predicates - ), - ) - data = data.loc[predicate] - - columns: dict[str, str] = {} - - if op.by: - grouping_keys = [ - key.name - if isinstance(key, ops.TableColumn) - else execute(key, scope=scope, timecontext=timecontext, **kwargs).rename( - key.name - ) - for key in op.by - ] - source = data.groupby( - grouping_keys[0] if len(grouping_keys) == 1 else grouping_keys, - group_keys=False, - ) - else: - source = data - - scope = scope.merge_scope(Scope({op.table: source}, timecontext)) - - pieces = [ - coerce_to_output( - execute(metric, scope=scope, 
timecontext=timecontext, **kwargs), - metric, - ) - for metric in op.metrics - ] - - result = pd.concat(pieces, axis=1) - - # If grouping, need a reset to get the grouping key back as a column - if op.by: - result = result.reset_index() - - result.columns = [columns.get(c, c) for c in result.columns] - - if op.having: - # .having(...) is only accessible on groupby, so this should never - # raise - if not op.by: - raise ValueError( - "Filtering out aggregation values is not allowed without at " - "least one grouping key" - ) - - # TODO(phillipc): Don't recompute identical subexpressions - predicate = functools.reduce( - operator.and_, - ( - execute(h, scope=scope, timecontext=timecontext, **kwargs) - for h in op.having - ), - ) - assert len(predicate) == len( - result - ), "length of predicate does not match length of DataFrame" - result = result.loc[predicate.values] - return result - - -@execute_node.register(ops.Reduction, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, type(op).__name__.lower()) - - -@execute_node.register(ops.First, SeriesGroupBy, type(None)) -def execute_first_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: getattr(x, "iat", x)[0]) - - -@execute_node.register(ops.Last, SeriesGroupBy, type(None)) -def execute_last_series_groupby(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: getattr(x, "iat", x)[-1]) - - -variance_ddof = {"pop": 0, "sample": 1} - - -@execute_node.register(ops.Variance, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby_var(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "var", ddof=variance_ddof[op.how]) - - -@execute_node.register(ops.StandardDev, SeriesGroupBy, type(None)) -def execute_reduction_series_groupby_std(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "std", ddof=variance_ddof[op.how]) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - SeriesGroupBy, - type(None), -) -def execute_count_distinct_series_groupby(op, data, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, "nunique") - - -@execute_node.register(ops.Arbitrary, SeriesGroupBy, type(None)) -def execute_arbitrary_series_groupby(op, data, _, aggcontext=None, **kwargs): - how = op.how - if how is None: - how = "first" - - if how not in {"first", "last"}: - raise com.OperationNotDefinedError(f"Arbitrary {how!r} is not supported") - return aggcontext.agg(data, how) - - -@execute_node.register( - (ops.ArgMin, ops.ArgMax), - SeriesGroupBy, - SeriesGroupBy, - type(None), -) -def execute_reduction_series_groupby_argidx( - op, data, key, _, aggcontext=None, **kwargs -): - method = operator.methodcaller(op.__class__.__name__.lower()) - - def reduce(data, key=key.obj, method=method): - return data.iloc[method(key.loc[data.index])] - - return aggcontext.agg(data, reduce) - - -def _filtered_reduction(mask, method, data): - return method(data[mask[data.index]]) - - -@execute_node.register(ops.Reduction, SeriesGroupBy, SeriesGroupBy) -def execute_reduction_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - method = operator.methodcaller(type(op).__name__.lower()) - return aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, method) - ) - - -@execute_node.register(ops.First, SeriesGroupBy, SeriesGroupBy) -def execute_first_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - return 
aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, lambda x: x.iloc[0]) - ) - - -@execute_node.register(ops.Last, SeriesGroupBy, SeriesGroupBy) -def execute_last_series_gb_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, functools.partial(_filtered_reduction, mask.obj, lambda x: x.iloc[-1]) - ) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - SeriesGroupBy, - SeriesGroupBy, -) -def execute_count_distinct_series_groupby_mask( - op, data, mask, aggcontext=None, **kwargs -): - return aggcontext.agg( - data, - functools.partial(_filtered_reduction, mask.obj, pd.Series.nunique), - ) - - -@execute_node.register(ops.Variance, SeriesGroupBy, SeriesGroupBy) -def execute_var_series_groupby_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, - lambda x, mask=mask.obj, ddof=variance_ddof[op.how]: ( - x[mask[x.index]].var(ddof=ddof) - ), - ) - - -@execute_node.register(ops.StandardDev, SeriesGroupBy, SeriesGroupBy) -def execute_std_series_groupby_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data, - lambda x, mask=mask.obj, ddof=variance_ddof[op.how]: ( - x[mask[x.index]].std(ddof=ddof) - ), - ) - - -@execute_node.register(ops.CountStar, DataFrameGroupBy, type(None)) -def execute_count_star_frame_groupby(op, data, _, **kwargs): - return data.size() - - -@execute_node.register(ops.CountDistinctStar, DataFrameGroupBy, type(None)) -def execute_count_distinct_star_frame_groupby(op, data, _, **kwargs): - return data.nunique() - - -@execute_node.register(ops.Reduction, pd.Series, (pd.Series, type(None))) -def execute_reduction_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - return aggcontext.agg(operand, type(op).__name__.lower()) - - -@execute_node.register(ops.First, pd.Series, (pd.Series, type(None))) -def execute_first_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - - def _first(x): - return getattr(x, "iloc", x)[0] - - return aggcontext.agg(operand, _first) - - -@execute_node.register(ops.Last, pd.Series, (pd.Series, type(None))) -def execute_last_series_mask(op, data, mask, aggcontext=None, **kwargs): - operand = data[mask] if mask is not None else data - - def _last(x): - return getattr(x, "iloc", x)[-1] - - return aggcontext.agg(operand, _last) - - -@execute_node.register( - (ops.CountDistinct, ops.ApproxCountDistinct), - pd.Series, - (pd.Series, type(None)), -) -def execute_count_distinct_series_mask(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg(data[mask] if mask is not None else data, "nunique") - - -@execute_node.register(ops.Arbitrary, pd.Series, (pd.Series, type(None))) -def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs): - if op.how == "first": - index = 0 - elif op.how == "last": - index = -1 - else: - raise com.OperationNotDefinedError(f"Arbitrary {op.how!r} is not supported") - - data = data[mask] if mask is not None else data - return data.iloc[index] - - -@execute_node.register(ops.StandardDev, pd.Series, (pd.Series, type(None))) -def execute_standard_dev_series(op, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - "std", - ddof=variance_ddof[op.how], - ) - - -@execute_node.register(ops.Variance, pd.Series, (pd.Series, type(None))) -def execute_variance_series(op, data, mask, aggcontext=None, **kwargs): - return 
aggcontext.agg( - data[mask] if mask is not None else data, - "var", - ddof=variance_ddof[op.how], - ) - - -@execute_node.register((ops.Any, ops.All), pd.Series, (pd.Series, type(None))) -def execute_any_all_series(op, data, mask, aggcontext=None, **kwargs): - if mask is not None: - data = data.loc[mask] - if isinstance(aggcontext, (agg_ctx.Summarize, agg_ctx.Transform)): - result = aggcontext.agg(data, type(op).__name__.lower()) - else: - result = aggcontext.agg( - data, lambda data: getattr(data, type(op).__name__.lower())() - ) - try: - return result.astype(bool) - except TypeError: - return result - - -@execute_node.register((ops.Any, ops.All), SeriesGroupBy, type(None)) -def execute_any_all_series_group_by(op, data, mask, aggcontext=None, **kwargs): - if mask is not None: - data = data.obj.loc[mask].groupby(get_grouping(data.grouper.groupings)) - if isinstance(aggcontext, (agg_ctx.Summarize, agg_ctx.Transform)): - result = aggcontext.agg(data, type(op).__name__.lower()) - else: - result = aggcontext.agg( - data, lambda data: getattr(data, type(op).__name__.lower())() - ) - try: - return result.astype(bool) - except TypeError: - return result - - -@execute_node.register(ops.CountStar, pd.DataFrame, type(None)) -def execute_count_star_frame(op, data, _, **kwargs): - return len(data) - - -@execute_node.register(ops.CountStar, pd.DataFrame, pd.Series) -def execute_count_star_frame_filter(op, data, where, **kwargs): - return len(data) - len(where) + where.sum() - - -@execute_node.register(ops.CountDistinctStar, pd.DataFrame, type(None)) -def execute_count_distinct_star_frame(op, data, _, **kwargs): - return len(data.drop_duplicates()) - - -@execute_node.register(ops.CountDistinctStar, pd.DataFrame, pd.Series) -def execute_count_distinct_star_frame_filter(op, data, filt, **kwargs): - return len(data.loc[filt].drop_duplicates()) - - -@execute_node.register(ops.BitAnd, pd.Series, (pd.Series, type(None))) -def execute_bit_and_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_and.reduce, - ) - - -@execute_node.register(ops.BitOr, pd.Series, (pd.Series, type(None))) -def execute_bit_or_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_or.reduce, - ) - - -@execute_node.register(ops.BitXor, pd.Series, (pd.Series, type(None))) -def execute_bit_xor_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - np.bitwise_xor.reduce, - ) - - -@execute_node.register( - (ops.ArgMin, ops.ArgMax), - pd.Series, - pd.Series, - (pd.Series, type(None)), -) -def execute_argmin_series_mask(op, data, key, mask, aggcontext=None, **kwargs): - method_name = op.__class__.__name__.lower() - masked_key = key[mask] if mask is not None else key - idx = aggcontext.agg(masked_key, method_name) - masked = data[mask] if mask is not None else data - return masked.iloc[idx] - - -@execute_node.register(ops.Mode, pd.Series, (pd.Series, type(None))) -def execute_mode_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, lambda x: x.mode().iloc[0] - ) - - -@execute_node.register(ops.Mode, SeriesGroupBy, (SeriesGroupBy, type(None))) -def execute_mode_series_groupby(_, data, mask, aggcontext=None, **kwargs): - def mode(x): - return x.mode().iloc[0] - - if mask is not None: - mode = functools.partial(_filtered_reduction, mask.obj, mode) - - return 
aggcontext.agg(data, mode) - - -@execute_node.register(ops.ApproxMedian, pd.Series, (pd.Series, type(None))) -def execute_approx_median_series(_, data, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, lambda x: x.median() - ) - - -@execute_node.register(ops.ApproxMedian, SeriesGroupBy, (SeriesGroupBy, type(None))) -def execute_approx_median_series_groupby(_, data, mask, aggcontext=None, **kwargs): - median = pd.Series.median - - if mask is not None: - median = functools.partial(_filtered_reduction, mask.obj, median) - - return aggcontext.agg(data, median) - - -@execute_node.register((ops.Not, ops.Negate), (bool, np.bool_)) -def execute_not_bool(_, data, **kwargs): - return not data - - -def _execute_binary_op_impl(op, left, right, **_): - op_type = type(op) - try: - operation = constants.BINARY_OPERATIONS[op_type] - except KeyError: - raise com.OperationNotDefinedError( - f"Binary operation {op_type.__name__} not implemented" - ) - else: - return operation(left, right) - - -@execute_node.register(ops.Binary, pd.Series, pd.Series) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - numeric_types, - pd.Series, -) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - pd.Series, - numeric_types, -) -@execute_node.register( - (ops.NumericBinary, ops.LogicalBinary, ops.Comparison), - numeric_types, - numeric_types, -) -@execute_node.register((ops.Comparison, ops.Add, ops.Multiply), pd.Series, str) -@execute_node.register((ops.Comparison, ops.Add, ops.Multiply), str, pd.Series) -@execute_node.register((ops.Comparison, ops.Add), str, str) -@execute_node.register(ops.Multiply, integer_types, str) -@execute_node.register(ops.Multiply, str, integer_types) -@execute_node.register(ops.Comparison, pd.Series, timestamp_types) -@execute_node.register(ops.Comparison, timedelta_types, pd.Series) -@execute_node.register(ops.BitwiseBinary, integer_types, integer_types) -@execute_node.register(ops.BitwiseBinary, pd.Series, integer_types) -@execute_node.register(ops.BitwiseBinary, integer_types, pd.Series) -def execute_binary_op(op, left, right, **kwargs): - return _execute_binary_op_impl(op, left, right, **kwargs) - - -@execute_node.register(ops.Comparison, pd.Series, date_types) -def execute_binary_op_date(op, left, right, **kwargs): - return _execute_binary_op_impl( - op, pd.to_datetime(left), pd.to_datetime(right), **kwargs - ) - - -@execute_node.register(ops.Binary, SeriesGroupBy, SeriesGroupBy) -def execute_binary_op_series_group_by(op, left, right, **kwargs): - left_groupings = get_grouping(left.grouper.groupings) - right_groupings = get_grouping(right.grouper.groupings) - if left_groupings != right_groupings: - raise ValueError( - f"Cannot perform {type(op).__name__} operation on two series with " - "different groupings" - ) - result = execute_binary_op(op, left.obj, right.obj, **kwargs) - return result.groupby(left_groupings, group_keys=False) - - -@execute_node.register(ops.Binary, SeriesGroupBy, simple_types) -def execute_binary_op_series_gb_simple(op, left, right, **kwargs): - result = execute_binary_op(op, left.obj, right, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Binary, simple_types, SeriesGroupBy) -def execute_binary_op_simple_series_gb(op, left, right, **kwargs): - result = execute_binary_op(op, left, right.obj, **kwargs) - return result.groupby(get_grouping(right.grouper.groupings), 
group_keys=False) - - -@execute_node.register(ops.Unary, SeriesGroupBy) -def execute_unary_op_series_gb(op, operand, **kwargs): - result = execute_node(op, operand.obj, **kwargs) - return result.groupby(get_grouping(operand.grouper.groupings), group_keys=False) - - -@execute_node.register( - (ops.Log, ops.Round), - SeriesGroupBy, - (numbers.Real, decimal.Decimal, type(None)), -) -def execute_log_series_gb_others(op, left, right, **kwargs): - result = execute_node(op, left.obj, right, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register((ops.Log, ops.Round), SeriesGroupBy, SeriesGroupBy) -def execute_log_series_gb_series_gb(op, left, right, **kwargs): - result = execute_node(op, left.obj, right.obj, **kwargs) - return result.groupby(get_grouping(left.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Not, pd.Series) -def execute_not_series(op, data, **kwargs): - return ~data - - -@execute_node.register(ops.StringSplit, pd.Series, (pd.Series, str)) -def execute_string_split(op, data, delimiter, **kwargs): - # Doing the iteration using `map` is much faster than doing the iteration - # using `Series.apply` due to Pandas-related overhead. - return pd.Series(np.array(s.split(delimiter)) for s in data) - - -@execute_node.register( - ops.Between, - pd.Series, - (pd.Series, numbers.Real, str, datetime.datetime), - (pd.Series, numbers.Real, str, datetime.datetime), -) -def execute_between(op, data, lower, upper, **kwargs): - return data.between(lower, upper) - - -@execute_node.register(ops.Union, pd.DataFrame, pd.DataFrame, bool) -def execute_union_dataframe_dataframe( - op, left: pd.DataFrame, right: pd.DataFrame, distinct, **kwargs -): - result = pd.concat([left, right], axis=0) - return result.drop_duplicates() if distinct else result - - -@execute_node.register(ops.Intersection, pd.DataFrame, pd.DataFrame, bool) -def execute_intersection_dataframe_dataframe( - op, - left: pd.DataFrame, - right: pd.DataFrame, - distinct: bool, - **kwargs, -): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - result = left.merge(right, on=list(left.columns), how="inner") - return result - - -@execute_node.register(ops.Difference, pd.DataFrame, pd.DataFrame, bool) -def execute_difference_dataframe_dataframe( - op, - left: pd.DataFrame, - right: pd.DataFrame, - distinct: bool, - **kwargs, -): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - merged = left.merge(right, on=list(left.columns), how="outer", indicator=True) - result = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) - return result - - -@execute_node.register(ops.IsNull, pd.Series) -def execute_series_isnull(op, data, **kwargs): - return data.isnull() - - -@execute_node.register(ops.NotNull, pd.Series) -def execute_series_notnnull(op, data, **kwargs): - return data.notnull() - - -@execute_node.register(ops.IsNan, (pd.Series, floating_types)) -def execute_isnan(op, data, **kwargs): - try: - return np.isnan(data) - except (TypeError, ValueError): - # if `data` contains `None` np.isnan will complain - # so we take advantage of NaN not equaling itself - # to do the correct thing - return data != data - - -@execute_node.register(ops.IsInf, (pd.Series, floating_types)) -def execute_isinf(op, data, **kwargs): - return np.isinf(data) - - -@execute_node.register(ops.SelfReference, pd.DataFrame) -def execute_node_self_reference_dataframe(op, 
data, **kwargs): - return data - - -@execute_node.register(ops.Alias, object) -def execute_alias(op, data, **kwargs): - # just return the underlying argument because the naming is handled - # by the translator for the top level expression - return data - - -@execute_node.register(ops.StringConcat, tuple) -def execute_node_string_concat(op, values, **kwargs): - values = [execute(arg, **kwargs) for arg in values] - return functools.reduce(operator.add, values) - - -@execute_node.register(ops.StringJoin, collections.abc.Sequence) -def execute_node_string_join(op, args, **kwargs): - return op.sep.join(args) - - -@execute_node.register(ops.InValues, object, tuple) -def execute_node_scalar_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data in elements - - -@execute_node.register(ops.InColumn, object, np.ndarray) -def execute_node_scalar_in_column(op, data, elements, **kwargs): - return data in elements - - -@execute_node.register(ops.InValues, pd.Series, tuple) -def execute_node_column_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data.isin(elements) - - -@execute_node.register(ops.InColumn, pd.Series, pd.Series) -def execute_node_column_in_column(op, data, elements, **kwargs): - return data.isin(elements) - - -@execute_node.register(ops.InValues, SeriesGroupBy, tuple) -def execute_node_group_in_values(op, data, elements, **kwargs): - elements = [execute(arg, **kwargs) for arg in elements] - return data.obj.isin(elements).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.InColumn, SeriesGroupBy, pd.Series) -def execute_node_group_in_column(op, data, elements, **kwargs): - return data.obj.isin(elements).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -def pd_where(cond, true, false): - """Execute `where` following ibis's intended semantics.""" - if isinstance(cond, pd.Series): - if not isinstance(true, pd.Series): - true = pd.Series( - np.repeat(true, len(cond)), name=cond.name, index=cond.index - ) - return true.where(cond, other=false) - if cond: - if isinstance(false, pd.Series) and not isinstance(true, pd.Series): - return pd.Series(np.repeat(true, len(false))) - return true - else: - if isinstance(true, pd.Series) and not isinstance(false, pd.Series): - return pd.Series(np.repeat(false, len(true)), index=true.index) - return false - - -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), pd.Series, pd.Series) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), pd.Series, simple_types) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), simple_types, pd.Series) -@execute_node.register(ops.IfElse, (pd.Series, *boolean_types), type(None), type(None)) -def execute_node_where(op, cond, true, false, **kwargs): - return pd_where(cond, true, false) - - -# For true/false as scalars, we only support identical type pairs + None to -# limit the size of the dispatch table and not have to worry about type -# promotion. 
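A minimal standalone sketch of the broadcast-then-`where` behaviour that `pd_where` above implements for a Series condition with scalar branches; it assumes only pandas and numpy, and the scalar-branch registrations that follow merely widen the dispatch table for this same helper.

import numpy as np
import pandas as pd

cond = pd.Series([True, False, True])
true, false = 10, -1
# pd_where broadcasts a scalar "true" branch over the condition's index,
# then Series.where picks the "false" branch wherever cond is False.
true_series = pd.Series(np.repeat(true, len(cond)), name=cond.name, index=cond.index)
print(true_series.where(cond, other=false).tolist())  # [10, -1, 10]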
-for typ in (str, *scalar_types): - for cond_typ in (pd.Series, *boolean_types): - execute_node.register(ops.IfElse, cond_typ, typ, typ)(execute_node_where) - execute_node.register(ops.IfElse, cond_typ, type(None), typ)(execute_node_where) - execute_node.register(ops.IfElse, cond_typ, typ, type(None))(execute_node_where) - - -@execute_node.register(ops.DatabaseTable, PandasBackend) -def execute_database_table_client( - op, client, timecontext: TimeContext | None, **kwargs -): - df = client.dictionary[op.name] - if timecontext: - begin, end = timecontext - time_col = get_time_col() - if time_col not in df: - raise com.IbisError( - f"Table {op.name} must have a time column named {time_col}" - " to execute with time context." - ) - # filter with time context - mask = df[time_col].between(begin, end) - return df.loc[mask].reset_index(drop=True) - return df - - -MATH_FUNCTIONS = { - ops.Floor: math.floor, - ops.Ln: math.log, - ops.Log2: lambda x: math.log(x, 2), - ops.Log10: math.log10, - ops.Exp: math.exp, - ops.Sqrt: math.sqrt, - ops.Abs: abs, - ops.Ceil: math.ceil, - ops.Sign: lambda x: 0 if not x else -1 if x < 0 else 1, -} - -MATH_FUNCTION_TYPES = tuple(MATH_FUNCTIONS.keys()) - - -@execute_node.register(MATH_FUNCTION_TYPES, numeric_types) -def execute_node_math_function_number(op, value, **kwargs): - return MATH_FUNCTIONS[type(op)](value) - - -@execute_node.register(ops.Log, numeric_types, numeric_types) -def execute_node_log_number_number(op, value, base, **kwargs): - return math.log(value, base) - - -@execute_node.register(ops.DropNa, pd.DataFrame) -def execute_node_dropna_dataframe(op, df, **kwargs): - if op.subset is not None: - subset = [col.name for col in op.subset] - else: - subset = None - return df.dropna(how=op.how, subset=subset) - - -@execute_node.register(ops.FillNa, pd.DataFrame, simple_types) -def execute_node_fillna_dataframe_scalar(op, df, replacements, **kwargs): - return df.fillna(replacements) - - -@execute_node.register(ops.FillNa, pd.DataFrame) -def execute_node_fillna_dataframe_dict(op, df, **kwargs): - return df.fillna(dict(op.replacements)) - - -@execute_node.register(ops.NullIf, simple_types, simple_types) -def execute_node_nullif_scalars(op, value1, value2, **kwargs): - return np.nan if value1 == value2 else value1 - - -@execute_node.register(ops.NullIf, pd.Series, (pd.Series, *simple_types)) -def execute_node_nullif_series(op, left, right, **kwargs): - return left.where(left != right) - - -@execute_node.register(ops.NullIf, simple_types, pd.Series) -def execute_node_nullif_scalar_series(op, value, series, **kwargs): - return series.where(series != value) - - -def coalesce(values): - return functools.reduce( - lambda a1, a2: np.where(pd.isnull(a1), a2, a1), - values, - ) - - -@toolz.curry -def promote_to_sequence(length, obj): - try: - return obj.values - except AttributeError: - return np.repeat(obj, length) - - -def compute_row_reduction(func, values, **kwargs): - final_sizes = {len(x) for x in values if isinstance(x, Sized)} - if not final_sizes: - return func(values) - (final_size,) = final_sizes - raw = func(list(map(promote_to_sequence(final_size), values)), **kwargs) - return pd.Series(raw).squeeze() - - -@execute_node.register(ops.Greatest, tuple) -def execute_node_greatest_list(op, values, **kwargs): - values = [execute(arg, **kwargs) for arg in values] - return compute_row_reduction(np.maximum.reduce, values, axis=0) - - -@execute_node.register(ops.Least, tuple) -def execute_node_least_list(op, values, **kwargs): - values = [execute(arg, **kwargs) for 
arg in values] - return compute_row_reduction(np.minimum.reduce, values, axis=0) - - -@execute_node.register(ops.Coalesce, tuple) -def execute_node_coalesce(op, values, **kwargs): - # TODO: this is slow - values = [execute(arg, **kwargs) for arg in values] - return compute_row_reduction(coalesce, values) - - -def wrap_case_result(raw, expr): - """Wrap a CASE statement result in a Series and handle returning scalars. - - Parameters - ---------- - raw : ndarray[T] - The raw results of executing the ``CASE`` expression - expr : Value - The expression from the which `raw` was computed - - Returns - ------- - Union[scalar, Series] - """ - raw_1d = np.atleast_1d(raw) - if np.any(pd.isnull(raw_1d)): - result = pd.Series(raw_1d) - else: - result = pd.Series( - raw_1d, dtype=constants.IBIS_TYPE_TO_PANDAS_TYPE[expr.type()] - ) - if result.size == 1 and isinstance(expr, ir.Scalar): - value = result.iloc[0] - try: - return value.item() - except AttributeError: - return value - return result - - -def _build_select(op, whens, thens, otherwise, func=None, **kwargs): - if func is None: - func = lambda x: x - - whens_ = [] - grouped = 0 - for when in whens: - res = execute(when, **kwargs) - obj = getattr(res, "obj", res) - grouped += obj is not res - whens_.append(obj) - - thens_ = [] - for then in thens: - res = execute(then, **kwargs) - obj = getattr(res, "obj", res) - grouped += obj is not res - thens_.append(obj) - - if otherwise is None: - otherwise = np.nan - - raw = np.select(func(whens_), thens_, otherwise) - - if grouped: - return pd.Series(raw).groupby(get_grouping(res.grouper.groupings)) - return wrap_case_result(raw, op.to_expr()) - - -@execute_node.register(ops.SearchedCase, tuple, tuple, object) -def execute_searched_case(op, whens, thens, otherwise, **kwargs): - return _build_select(op, whens, thens, otherwise, **kwargs) - - -@execute_node.register(ops.SimpleCase, object, tuple, tuple, object) -def execute_simple_case_scalar(op, value, whens, thens, otherwise, **kwargs): - value = getattr(value, "obj", value) - return _build_select( - op, - whens, - thens, - otherwise, - func=lambda whens: np.asarray(whens) == value, - **kwargs, - ) - - -@execute_node.register(ops.SimpleCase, (pd.Series, SeriesGroupBy), tuple, tuple, object) -def execute_simple_case_series(op, value, whens, thens, otherwise, **kwargs): - value = getattr(value, "obj", value) - return _build_select( - op, - whens, - thens, - otherwise, - func=lambda whens: [value == when for when in whens], - **kwargs, - ) - - -@execute_node.register(ops.Distinct, pd.DataFrame) -def execute_distinct_dataframe(op, df, **kwargs): - return df.drop_duplicates() - - -@execute_node.register(ops.TableArrayView, pd.DataFrame) -def execute_table_array_view(op, _, **kwargs): - return execute(op.table).squeeze() - - -@execute_node.register(ops.InMemoryTable) -def execute_in_memory_table(op, **kwargs): - return op.data.to_frame() - - -@execute_node.register(ops.Sample, pd.DataFrame, object, object) -def execute_sample(op, data, fraction, seed, **kwargs): - return data.sample(frac=fraction, random_state=seed) diff --git a/ibis/backends/pandas/execution/join.py b/ibis/backends/pandas/execution/join.py deleted file mode 100644 index adf39079f659..000000000000 --- a/ibis/backends/pandas/execution/join.py +++ /dev/null @@ -1,183 +0,0 @@ -from __future__ import annotations - -import itertools - -import pandas as pd - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -from ibis.backends.pandas.core import execute -from 
ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution import constants -from ibis.common.exceptions import UnsupportedOperationError - - -def _compute_join_column(column, **kwargs): - if isinstance(column, ops.TableColumn): - new_column = column.name - else: - new_column = execute(column, **kwargs) - root_table, *_ = an.find_immediate_parent_tables(column) - return new_column, root_table - - -@execute_node.register(ops.CrossJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_cross_join(op, left, right, predicates, **kwargs): - """Execute a cross join in pandas. - - Notes - ----- - We create a dummy column of all :data:`True` instances and use that as the - join key. This results in the desired Cartesian product behavior guaranteed - by cross join. - """ - assert not predicates, "cross join predicates must be empty" - return pd.merge( - left, - right, - how="cross", - copy=False, - suffixes=constants.JOIN_SUFFIXES, - ) - - -def _get_semi_anti_join_filter(op, left, right, predicates, **kwargs): - left_on, right_on = _construct_join_predicate_columns( - op, - predicates, - **kwargs, - ) - inner = left.merge( - right[right_on].drop_duplicates(), - on=left_on, - how="left", - indicator=True, - ) - return (inner["_merge"] == "both").values - - -@execute_node.register(ops.LeftSemiJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_left_semi_join(op, left, right, predicates, **kwargs): - """Execute a left semi join in pandas.""" - inner_filt = _get_semi_anti_join_filter( - op, - left, - right, - predicates, - **kwargs, - ) - return left.loc[inner_filt, :] - - -@execute_node.register(ops.LeftAntiJoin, pd.DataFrame, pd.DataFrame, tuple) -def execute_left_anti_join(op, left, right, predicates, **kwargs): - """Execute a left anti join in pandas.""" - inner_filt = _get_semi_anti_join_filter( - op, - left, - right, - predicates, - **kwargs, - ) - return left.loc[~inner_filt, :] - - -def _construct_join_predicate_columns(op, predicates, **kwargs): - on = {op.left: [], op.right: []} - - for predicate in predicates: - if not isinstance(predicate, ops.Equals): - raise TypeError("Only equality join predicates supported with pandas") - new_left_column, left_pred_root = _compute_join_column(predicate.left, **kwargs) - on[left_pred_root].append(new_left_column) - - new_right_column, right_pred_root = _compute_join_column( - predicate.right, **kwargs - ) - on[right_pred_root].append(new_right_column) - return on[op.left], on[op.right] - - -@execute_node.register(ops.Join, pd.DataFrame, pd.DataFrame, tuple) -def execute_join(op, left, right, predicates, **kwargs): - op_type = type(op) - - try: - how = constants.JOIN_TYPES[op_type] - except KeyError: - raise UnsupportedOperationError(f"{op_type.__name__} not supported") - - left_on, right_on = _construct_join_predicate_columns(op, predicates, **kwargs) - - df = pd.merge( - left, - right, - how=how, - left_on=left_on, - right_on=right_on, - suffixes=constants.JOIN_SUFFIXES, - ) - return df - - -@execute_node.register( - ops.AsOfJoin, - pd.DataFrame, - pd.DataFrame, - tuple, - (pd.Timedelta, type(None)), - tuple, -) -def execute_asof_join(op, left, right, by, tolerance, predicates, **kwargs): - left_on, right_on = _extract_predicate_names(predicates) - left_by, right_by = _extract_predicate_names(by) - - # Add default join suffixes to predicates and groups and rename the - # corresponding columns before the `merge_asof`. 
If we don't do this and the - # predicates have the same column name, we lose the original RHS column - # values in the output. Instead, the RHS values are copies of the LHS values. - # xref https://github.com/ibis-project/ibis/issues/6080 - left_on_suffixed = [x + constants.JOIN_SUFFIXES[0] for x in left_on] - right_on_suffixed = [x + constants.JOIN_SUFFIXES[1] for x in right_on] - - left_by_suffixed = [x + constants.JOIN_SUFFIXES[0] for x in left_by] - right_by_suffixed = [x + constants.JOIN_SUFFIXES[1] for x in right_by] - - left = left.rename( - columns=dict( - itertools.chain( - zip(left_on, left_on_suffixed), zip(left_by, left_by_suffixed) - ) - ) - ) - right = right.rename( - columns=dict( - itertools.chain( - zip(right_on, right_on_suffixed), zip(right_by, right_by_suffixed) - ) - ) - ) - - return pd.merge_asof( - left=left, - right=right, - left_on=left_on_suffixed, - right_on=right_on_suffixed, - left_by=left_by_suffixed or None, - right_by=right_by_suffixed or None, - tolerance=tolerance, - suffixes=constants.JOIN_SUFFIXES, - ) - - -def _extract_predicate_names(predicates): - lefts = [] - rights = [] - for predicate in predicates: - if not isinstance(predicate, ops.Equals): - raise TypeError("Only equality join predicates supported with pandas") - left_name = predicate.left.name - right_name = predicate.right.name - lefts.append(left_name) - rights.append(right_name) - return lefts, rights diff --git a/ibis/backends/pandas/execution/maps.py b/ibis/backends/pandas/execution/maps.py deleted file mode 100644 index 2da84583362c..000000000000 --- a/ibis/backends/pandas/execution/maps.py +++ /dev/null @@ -1,208 +0,0 @@ -from __future__ import annotations - -import collections -import functools - -import numpy as np -import pandas as pd -import toolz - -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node - - -@execute_node.register(ops.Map, np.ndarray, np.ndarray) -def map_ndarray_ndarray(op, keys, values, **kwargs): - return dict(zip(keys, values)) - - -@execute_node.register(ops.Map, pd.Series, pd.Series) -def map_series_series(op, keys, values, **kwargs): - return keys.combine(values, lambda a, b: dict(zip(a, b))) - - -@execute_node.register(ops.MapLength, pd.Series) -def map_length_series(op, data, **kwargs): - # TODO: investigate whether calling a lambda is faster - return data.dropna().map(len).reindex(data.index) - - -@execute_node.register(ops.MapLength, (collections.abc.Mapping, type(None))) -def map_length_dict(op, data, **kwargs): - return None if data is None else len(data) - - -@execute_node.register(ops.MapGet, pd.Series, object, object) -def map_get_series_scalar_scalar(op, data, key, default, **kwargs): - return data.map(functools.partial(safe_get, key=key, default=default)) - - -@execute_node.register(ops.MapGet, pd.Series, object, pd.Series) -def map_get_series_scalar_series(op, data, key, default, **kwargs): - defaultiter = iter(default.values) - return data.map( - lambda mapping, key=key, defaultiter=defaultiter: safe_get( - mapping, key, next(defaultiter) - ) - ) - - -@execute_node.register(ops.MapGet, pd.Series, pd.Series, object) -def map_get_series_series_scalar(op, data, key, default, **kwargs): - keyiter = iter(key.values) - return data.map( - lambda mapping, keyiter=keyiter, default=default: safe_get( - mapping, next(keyiter), default - ) - ) - - -@execute_node.register(ops.MapGet, pd.Series, pd.Series, pd.Series) -def map_get_series_series_series(op, data, key, default): - keyiter = iter(key.values) - defaultiter = 
iter(default.values) - - def get(mapping, keyiter=keyiter, defaultiter=defaultiter): - return safe_get(mapping, next(keyiter), next(defaultiter)) - - return data.map(get) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, object, object) -def map_get_dict_scalar_scalar(op, data, key, default, **kwargs): - return safe_get(data, key, default) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, object, pd.Series) -def map_get_dict_scalar_series(op, data, key, default, **kwargs): - return default.map(lambda d, data=data, key=key: safe_get(data, key, d)) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, pd.Series, object) -def map_get_dict_series_scalar(op, data, key, default, **kwargs): - return key.map(lambda k, data=data, default=default: safe_get(data, k, default)) - - -@execute_node.register(ops.MapGet, collections.abc.Mapping, pd.Series, pd.Series) -def map_get_dict_series_series(op, data, key, default, **kwargs): - defaultiter = iter(default.values) - return key.map( - lambda k, data=data, defaultiter=defaultiter: safe_get( - data, k, next(defaultiter) - ) - ) - - -@execute_node.register(ops.MapContains, collections.abc.Mapping, object) -def map_contains_dict_object(op, data, key, **kwargs): - return safe_contains(data, key) - - -@execute_node.register(ops.MapContains, collections.abc.Mapping, pd.Series) -def map_contains_dict_series(op, data, key, **kwargs): - return key.map(lambda k, data=data: safe_contains(data, k)) - - -@execute_node.register(ops.MapContains, pd.Series, object) -def map_contains_series_object(op, data, key, **kwargs): - return data.map(lambda d: safe_contains(d, key)) - - -@execute_node.register(ops.MapContains, pd.Series, pd.Series) -def map_contains_series_series(op, data, key, **kwargs): - return data.combine(key, lambda d, k: safe_contains(d, k)) - - -def safe_method(mapping, method, *args, **kwargs): - if mapping is None: - return None - try: - method = getattr(mapping, method) - except AttributeError: - return None - else: - return method(*args, **kwargs) - - -def safe_get(mapping, key, default=None): - return safe_method(mapping, "get", key, default) - - -def safe_contains(mapping, key): - return safe_method(mapping, "__contains__", key) - - -def safe_keys(mapping): - result = safe_method(mapping, "keys") - if result is None: - return None - # list(...) to unpack iterable - return np.array(list(result)) - - -def safe_values(mapping): - result = safe_method(mapping, "values") - if result is None: - return None - # list(...) to unpack iterable - return np.array(list(result), dtype="object") - - -@execute_node.register(ops.MapKeys, pd.Series) -def map_keys_series(op, data, **kwargs): - return data.map(safe_keys) - - -@execute_node.register(ops.MapKeys, (collections.abc.Mapping, type(None))) -def map_keys_dict(op, data, **kwargs): - if data is None: - return None - # list(...) to unpack iterable - return np.array(list(data.keys())) - - -@execute_node.register(ops.MapValues, pd.Series) -def map_values_series(op, data, **kwargs): - res = data.map(safe_values) - return res - - -@execute_node.register(ops.MapValues, (collections.abc.Mapping, type(None))) -def map_values_dict(op, data, **kwargs): - if data is None: - return None - # list(...) 
to unpack iterable - return np.array(list(data.values())) - - -def safe_merge(*maps): - return None if any(m is None for m in maps) else toolz.merge(*maps) - - -@execute_node.register( - ops.MapMerge, - (collections.abc.Mapping, type(None)), - (collections.abc.Mapping, type(None)), -) -def map_merge_dict_dict(op, lhs, rhs, **kwargs): - return safe_merge(lhs, rhs) - - -@execute_node.register(ops.MapMerge, (collections.abc.Mapping, type(None)), pd.Series) -def map_merge_dict_series(op, lhs, rhs, **kwargs): - if lhs is None: - return pd.Series([None] * len(rhs)) - return rhs.map(lambda m, lhs=lhs: safe_merge(lhs, m)) - - -@execute_node.register(ops.MapMerge, pd.Series, (collections.abc.Mapping, type(None))) -def map_merge_series_dict(op, lhs, rhs, **kwargs): - if rhs is None: - return pd.Series([None] * len(lhs)) - return lhs.map(lambda m, rhs=rhs: safe_merge(m, rhs)) - - -@execute_node.register(ops.MapMerge, pd.Series, pd.Series) -def map_merge_series_series(op, lhs, rhs, **kwargs): - rhsiter = iter(rhs.values) - return lhs.map(lambda m, rhsiter=rhsiter: safe_merge(m, next(rhsiter))) diff --git a/ibis/backends/pandas/execution/selection.py b/ibis/backends/pandas/execution/selection.py deleted file mode 100644 index b1f8a0ee6659..000000000000 --- a/ibis/backends/pandas/execution/selection.py +++ /dev/null @@ -1,337 +0,0 @@ -"""Dispatching code for Selection operations.""" - -from __future__ import annotations - -import functools -import operator -from collections import defaultdict -from typing import TYPE_CHECKING, Any - -import pandas as pd -from toolz import concatv, first - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -import ibis.expr.types as ir -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution import constants, util -from ibis.backends.pandas.execution.util import coerce_to_output - -if TYPE_CHECKING: - from collections.abc import Iterable - - from ibis.backends.base.df.timecontext import TimeContext - - -def compute_projection( - node: ops.Node, - parent: ops.Selection, - data: pd.DataFrame, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - **kwargs: Any, -): - """Compute a projection. - - `ibis.expr.types.Scalar` instances occur when a specific column projection - is a window operation. 
- """ - if isinstance(node, ops.TableNode): - if node == parent.table: - return data - - assert isinstance(parent.table, ops.Join) - assert node in (parent.table.left, parent.table.right) - - mapping = remap_overlapping_column_names( - parent.table, - root_table=node, - data_columns=frozenset(data.columns), - ) - return map_new_column_names_to_data(mapping, data) - elif isinstance(node, ops.Value): - name = node.name - assert name is not None, "Value selection name is None" - - if node.shape.is_scalar(): - data_columns = frozenset(data.columns) - - if scope is None: - scope = Scope() - - scope = scope.merge_scopes( - Scope( - { - t: map_new_column_names_to_data( - remap_overlapping_column_names( - parent.table, t, data_columns - ), - data, - ) - }, - timecontext, - ) - for t in an.find_immediate_parent_tables(node) - ) - scalar = execute(node, scope=scope, **kwargs) - result = pd.Series([scalar], name=name).repeat(len(data.index)) - result.index = data.index - return result - else: - if isinstance(node, ops.TableColumn): - if name in data: - return data[name].rename(name) - - if not isinstance(parent.table, ops.Join): - raise KeyError(name) - - suffix = util.get_join_suffix_for_op(node, parent.table) - return data.loc[:, name + suffix].rename(name) - - data_columns = frozenset(data.columns) - - scope = scope.merge_scopes( - Scope( - { - t: map_new_column_names_to_data( - remap_overlapping_column_names( - parent.table, t, data_columns - ), - data, - ) - }, - timecontext, - ) - for t in an.find_immediate_parent_tables(node) - ) - - result = execute(node, scope=scope, timecontext=timecontext, **kwargs) - return coerce_to_output(result, node, data.index) - else: - raise TypeError(node) - - -def remap_overlapping_column_names(table, root_table, data_columns): - """Return a mapping of suffixed column names to column names without suffixes. - - Parameters - ---------- - table : TableNode - The ``TableNode`` we're selecting from. - root_table : TableNode - The root table of the expression we're selecting from. - data_columns - The available columns to select from - - Returns - ------- - dict[str, str] - A mapping from possibly-suffixed column names to column names without - suffixes. - """ - if not isinstance(table, ops.Join): - return None - - left_root, right_root = an.find_immediate_parent_tables([table.left, table.right]) - suffixes = { - left_root: constants.LEFT_JOIN_SUFFIX, - right_root: constants.RIGHT_JOIN_SUFFIX, - } - - # if we're selecting from the root table and that's not the left or right - # child, don't add a suffix - # - # this can happen when selecting directly from a join as opposed to - # explicitly referencing the left or right tables - # - # we use setdefault here because the root_table can be the left/right table - # which we may have already put into `suffixes` - suffixes.setdefault(root_table, "") - - suffix = suffixes[root_table] - - column_names = [ - ({name, f"{name}{suffix}"} & data_columns, name) - for name in root_table.schema.names - ] - mapping = { - first(col_name): final_name for col_name, final_name in column_names if col_name - } - return mapping - - -def map_new_column_names_to_data(mapping, df): - if mapping: - return df.loc[:, mapping.keys()].rename(columns=mapping) - return df - - -def _compute_predicates( - table_op: ops.TableNode, - predicates: Iterable[ir.BooleanColumn], - data: pd.DataFrame, - scope: Scope, - timecontext: TimeContext | None, - **kwargs: Any, -) -> pd.Series: - """Compute the predicates for a table operation. 
- - This handles the cases where `predicates` are computed columns, in addition - to the simple case of named columns coming directly from the input table. - """ - for predicate in predicates: - # Map each root table of the predicate to the data so that we compute - # predicates on the result instead of any left or right tables if the - # Selection is on a Join. Project data to only include columns from - # the root table. - root_tables = an.find_immediate_parent_tables(predicate) - - # handle suffixes - data_columns = frozenset(data.columns) - - additional_scope = Scope() - for root_table in root_tables: - mapping = remap_overlapping_column_names(table_op, root_table, data_columns) - new_data = map_new_column_names_to_data(mapping, data) - additional_scope = additional_scope.merge_scope( - Scope({root_table: new_data}, timecontext) - ) - - scope = scope.merge_scope(additional_scope) - yield execute(predicate, scope=scope, **kwargs) - - -def build_df_from_selection( - selections: list[ops.Value], - data: pd.DataFrame, - table: ops.Node, -) -> pd.DataFrame: - """Build up a df by doing direct selections, renaming if necessary. - - Special logic for: - - Joins where suffixes have been added to column names - - Cases where new columns are created and selected. - """ - cols = defaultdict(list) - - for node in selections: - selection = node.name - if selection not in data: - if not isinstance(table, ops.Join): - raise KeyError(selection) - join_suffix = util.get_join_suffix_for_op(node, table) - if selection + join_suffix not in data: - raise KeyError(selection) - selection += join_suffix - cols[selection].append(node.name) - - result = data[list(cols.keys())] - - renamed_cols = {} - for from_col, to_cols in cols.items(): - if len(to_cols) == 1 and from_col != to_cols[0]: - renamed_cols[from_col] = to_cols[0] - else: - for new_col in to_cols: - if from_col != new_col: - result[new_col] = result[from_col] - - if renamed_cols: - result = result.rename(columns=renamed_cols) - - return result - - -def build_df_from_projection( - selection_exprs: list[ir.Expr], - op: ops.Selection, - data: pd.DataFrame, - **kwargs, -) -> pd.DataFrame: - data_pieces = [ - compute_projection(node, op, data, **kwargs) for node in selection_exprs - ] - - new_pieces = [ - piece.reset_index(level=list(range(1, piece.index.nlevels)), drop=True) - if piece.index.nlevels > 1 - else piece - for piece in data_pieces - ] - # Result series might be trimmed by time context, thus index may - # have changed. To concat rows properly, we first `sort_index` on - # each pieces then assign data index manually to series - # - # If cardinality changes (e.g. unnest/explode), trying to do this - # won't work so don't try? 
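A tiny pandas-only illustration of the index realignment the comment above describes: a projected piece computed with a different index is sorted and, when the row counts match, reattached to the parent frame's index before concatenation, mirroring the loop that follows.

import pandas as pd

data = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
piece = pd.Series([7, 8, 9], name="b")   # computed independently, so it has a RangeIndex 0..2
piece = piece.sort_index()
if len(piece.index) == len(data.index):
    piece.index = data.index             # realign so concat lines the rows up positionally
print(pd.concat([data["a"], piece], axis=1))
#     a  b
# 10  1  7
# 20  2  8
# 30  3  9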
- for i, piece in enumerate(new_pieces): - new_pieces[i] = piece.sort_index() - if len(new_pieces[i].index) == len(data.index): - new_pieces[i].index = data.index - - return pd.concat(new_pieces, axis=1) - - -@execute_node.register(ops.Selection, pd.DataFrame) -def execute_selection_dataframe( - op, - data, - scope: Scope, - timecontext: TimeContext | None, - **kwargs, -): - result = data - - # Build up the individual pandas structures from column expressions - if op.selections: - if all(isinstance(s, ops.TableColumn) for s in op.selections): - result = build_df_from_selection(op.selections, data, op.table) - else: - result = build_df_from_projection( - op.selections, - op, - data, - scope=scope, - timecontext=timecontext, - **kwargs, - ) - - if op.predicates: - predicates = _compute_predicates( - op.table, op.predicates, data, scope, timecontext, **kwargs - ) - predicate = functools.reduce(operator.and_, predicates) - assert len(predicate) == len( - result - ), "Selection predicate length does not match underlying table" - result = result.loc[predicate] - - if op.sort_keys: - result, grouping_keys, ordering_keys = util.compute_sorted_frame( - result, - order_by=op.sort_keys, - scope=scope, - timecontext=timecontext, - **kwargs, - ) - else: - grouping_keys = ordering_keys = () - - # return early if we do not have any temporary grouping or ordering columns - assert not grouping_keys, "group by should never show up in Selection" - if not ordering_keys: - return result - - # create a sequence of columns that we need to drop - temporary_columns = pd.Index(concatv(grouping_keys, ordering_keys)).difference( - data.columns - ) - - # no reason to call drop if we don't need to - if temporary_columns.empty: - return result - - # drop every temporary column we created for ordering or grouping - return result.drop(temporary_columns, axis=1) diff --git a/ibis/backends/pandas/execution/strings.py b/ibis/backends/pandas/execution/strings.py deleted file mode 100644 index 66e325b6d367..000000000000 --- a/ibis/backends/pandas/execution/strings.py +++ /dev/null @@ -1,560 +0,0 @@ -from __future__ import annotations - -import itertools -import json -import operator -from functools import partial, reduce -from urllib.parse import parse_qs, urlsplit - -import numpy as np -import pandas as pd -import toolz -from pandas.core.groupby import SeriesGroupBy - -try: - import regex as re -except ImportError: - import re - -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.pandas.core import execute, integer_types, scalar_types -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.StringLength, pd.Series) -def execute_string_length_series(op, data, **kwargs): - return data.str.len().astype("int32") - - -@execute_node.register( - ops.Substring, pd.Series, integer_types, (type(None), *integer_types) -) -def execute_substring_int_int(op, data, start, length, **kwargs): - if length is None: - return data.str[start:] - else: - return data.str[start : start + length] - - -@execute_node.register(ops.Substring, pd.Series, pd.Series, integer_types) -def execute_substring_series_int(op, data, start, length, **kwargs): - return execute_substring_series_series( - op, data, start, pd.Series(np.repeat(length, len(start))), **kwargs - ) - - -@execute_node.register(ops.Substring, pd.Series, integer_types, pd.Series) -def execute_string_substring_int_series(op, data, start, length, **kwargs): - return 
execute_substring_series_series( - op, data, pd.Series(np.repeat(start, len(length))), length, **kwargs - ) - - -@execute_node.register(ops.Substring, pd.Series, pd.Series, pd.Series) -def execute_substring_series_series(op, data, start, length, **kwargs): - end = start + length - - return pd.Series( - [ - None - if (begin is not None and pd.isnull(begin)) - or (stop is not None and pd.isnull(stop)) - else value[begin:stop] - for value, begin, stop in zip(data, start.values, end.values) - ], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Strip, pd.Series) -def execute_string_strip(op, data, **kwargs): - return data.str.strip() - - -@execute_node.register(ops.LStrip, pd.Series) -def execute_string_lstrip(op, data, **kwargs): - return data.str.lstrip() - - -@execute_node.register(ops.RStrip, pd.Series) -def execute_string_rstrip(op, data, **kwargs): - return data.str.rstrip() - - -@execute_node.register( - ops.LPad, pd.Series, (pd.Series,) + integer_types, (pd.Series, str) -) -def execute_string_lpad(op, data, length, pad, **kwargs): - return data.str.pad(length, side="left", fillchar=pad) - - -@execute_node.register( - ops.RPad, pd.Series, (pd.Series,) + integer_types, (pd.Series, str) -) -def execute_string_rpad(op, data, length, pad, **kwargs): - return data.str.pad(length, side="right", fillchar=pad) - - -@execute_node.register(ops.Reverse, pd.Series) -def execute_string_reverse(op, data, **kwargs): - return data.str[::-1] - - -@execute_node.register(ops.Lowercase, pd.Series) -def execute_string_lower(op, data, **kwargs): - return data.str.lower() - - -@execute_node.register(ops.Uppercase, pd.Series) -def execute_string_upper(op, data, **kwargs): - return data.str.upper() - - -@execute_node.register(ops.Capitalize, (pd.Series, str)) -def execute_string_capitalize(op, data, **kwargs): - return getattr(data, "str", data).capitalize() - - -@execute_node.register(ops.Repeat, pd.Series, (pd.Series,) + integer_types) -def execute_string_repeat(op, data, times, **kwargs): - return data.str.repeat(times) - - -@execute_node.register(ops.StringContains, pd.Series, (pd.Series, str)) -def execute_string_contains(_, data, needle, **kwargs): - return data.str.contains(needle) - - -@execute_node.register( - ops.StringFind, - pd.Series, - (pd.Series, str), - (pd.Series, type(None)) + integer_types, - (pd.Series, type(None)) + integer_types, -) -def execute_string_find(op, data, needle, start, end, **kwargs): - return data.str.find(needle, start, end) - - -def _sql_like_to_regex(pattern, escape): - cur_i = 0 - pattern_length = len(pattern) - - while cur_i < pattern_length: - nxt_i = cur_i + 1 - - cur = pattern[cur_i] - nxt = pattern[nxt_i] if nxt_i < pattern_length else None - - skip = 1 - - if nxt is not None and escape is not None and cur == escape: - yield nxt - skip = 2 - elif cur == "%": - yield ".*" - elif cur == "_": - yield "." - else: - yield cur - - cur_i += skip - - -def sql_like_to_regex(pattern: str, escape: str | None = None) -> str: - """Convert a SQL `LIKE` pattern to an equivalent Python regular expression. - - Parameters - ---------- - pattern - A LIKE pattern with the following semantics: - * `%` matches zero or more characters - * `_` matches exactly one character - * To escape `%` and `_` (or to match the `escape` parameter - itself), prefix the desired character with `escape`. - escape - Escape character - - Returns - ------- - str - A regular expression pattern equivalent to the input SQL `LIKE` pattern. 
- - Examples - -------- - >>> sql_like_to_regex("6%") # default is to not escape anything - '^6.*$' - >>> sql_like_to_regex("6^%", escape="^") - '^6%$' - >>> sql_like_to_regex("6_") - '^6.$' - >>> sql_like_to_regex("6/_", escape="/") - '^6_$' - >>> sql_like_to_regex("%abc") # any string ending with "abc" - '^.*abc$' - >>> sql_like_to_regex("abc%") # any string starting with "abc" - '^abc.*$' - """ - return f"^{''.join(_sql_like_to_regex(pattern, escape))}$" - - -@execute_node.register(ops.StringSQLLike, pd.Series, str, (str, type(None))) -def execute_string_like_series_string(op, data, pattern, escape, **kwargs): - new_pattern = sql_like_to_regex(pattern, escape=escape) - return data.str.contains(new_pattern, regex=True) - - -@execute_node.register(ops.StringSQLLike, SeriesGroupBy, str, str) -def execute_string_like_series_groupby_string(op, data, pattern, escape, **kwargs): - return execute_string_like_series_string( - op, data.obj, pattern, escape, **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.GroupConcat, pd.Series, str, (pd.Series, type(None))) -def execute_group_concat_series_mask(op, data, sep, mask, aggcontext=None, **kwargs): - return aggcontext.agg( - data[mask] if mask is not None else data, - lambda series, sep=sep: sep.join(series.values), - ) - - -@execute_node.register(ops.GroupConcat, SeriesGroupBy, str, type(None)) -def execute_group_concat_series_gb(op, data, sep, _, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda data, sep=sep: sep.join(data.values.astype(str))) - - -@execute_node.register(ops.GroupConcat, SeriesGroupBy, str, SeriesGroupBy) -def execute_group_concat_series_gb_mask(op, data, sep, mask, aggcontext=None, **kwargs): - def method(series, sep=sep): - if series.empty: - return pd.NA - return sep.join(series.values.astype(str)) - - return aggcontext.agg( - data, - lambda data, mask=mask.obj, method=method: method(data[mask[data.index]]), - ) - - -@execute_node.register(ops.StringAscii, pd.Series) -def execute_string_ascii(op, data, **kwargs): - return data.map(ord).astype("int32") - - -@execute_node.register(ops.StringAscii, SeriesGroupBy) -def execute_string_ascii_group_by(op, data, **kwargs): - return execute_string_ascii(op, data, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.RegexSearch, pd.Series, str) -def execute_series_regex_search(op, data, pattern, **kwargs): - pattern = re.compile(pattern) - return data.map(lambda x, pattern=pattern: pattern.search(x) is not None) - - -@execute_node.register(ops.RegexSearch, SeriesGroupBy, str) -def execute_series_regex_search_gb(op, data, pattern, **kwargs): - return execute_series_regex_search( - op, data, getattr(pattern, "obj", pattern), **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.StartsWith, pd.Series, str) -def execute_series_starts_with(op, data, pattern, **kwargs): - return data.str.startswith(pattern) - - -@execute_node.register(ops.EndsWith, pd.Series, str) -def execute_series_ends_with(op, data, pattern, **kwargs): - return data.str.endswith(pattern) - - -@execute_node.register(ops.RegexExtract, pd.Series, str, integer_types) -def execute_series_regex_extract(op, data, pattern, index, **kwargs): - pattern = re.compile(pattern) - return pd.Series( - [ - None if (match is None or index > match.lastindex) else match[index] - for match in map(pattern.search, data) - ], - dtype=data.dtype, - 
name=data.name, - ) - - -@execute_node.register(ops.RegexExtract, SeriesGroupBy, str, integer_types) -def execute_series_regex_extract_gb(op, data, pattern, index, **kwargs): - return execute_series_regex_extract(op, data.obj, pattern, index, **kwargs).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.RegexReplace, pd.Series, str, str) -def execute_series_regex_replace(op, data, pattern, replacement, **kwargs): - pattern = re.compile(pattern) - - def replacer(x, pattern=pattern): - return pattern.sub(replacement, x) - - return data.apply(replacer) - - -@execute_node.register(ops.RegexReplace, str, str, str) -def execute_str_regex_replace(_, arg, pattern, replacement, **kwargs): - return re.sub(pattern, replacement, arg) - - -@execute_node.register(ops.RegexReplace, SeriesGroupBy, str, str) -def execute_series_regex_replace_gb(op, data, pattern, replacement, **kwargs): - return execute_series_regex_replace( - data.obj, pattern, replacement, **kwargs - ).groupby(get_grouping(data.grouper.groupings), group_keys=False) - - -@execute_node.register(ops.Translate, pd.Series, pd.Series, pd.Series) -def execute_series_translate_series_series(op, data, from_string, to_string, **kwargs): - tables = [ - str.maketrans(source, target) for source, target in zip(from_string, to_string) - ] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, pd.Series, str) -def execute_series_translate_series_scalar(op, data, from_string, to_string, **kwargs): - tables = [str.maketrans(source, to_string) for source in from_string] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, str, pd.Series) -def execute_series_translate_scalar_series(op, data, from_string, to_string, **kwargs): - tables = [str.maketrans(from_string, target) for target in to_string] - return pd.Series( - [string.translate(table) for string, table in zip(data, tables)], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.Translate, pd.Series, str, str) -def execute_series_translate_scalar_scalar(op, data, from_string, to_string, **kwargs): - return data.str.translate(str.maketrans(from_string, to_string)) - - -@execute_node.register(ops.StrRight, pd.Series, integer_types) -def execute_series_right(op, data, nchars, **kwargs): - return data.str[-nchars:] - - -@execute_node.register(ops.StrRight, SeriesGroupBy, integer_types) -def execute_series_right_gb(op, data, nchars, **kwargs): - return execute_series_right(op, data.obj, nchars).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.StringReplace, pd.Series, (pd.Series, str), (pd.Series, str)) -def execute_series_string_replace(_, data, needle, replacement, **kwargs): - return data.str.replace(needle, replacement) - - -@execute_node.register(ops.StringJoin, (pd.Series, str), tuple) -def execute_series_join_scalar_sep(op, sep, args, **kwargs): - data = [execute(arg, **kwargs) for arg in args] - return reduce(lambda x, y: x + sep + y, data) - - -def haystack_to_series_of_lists(haystack, index=None): - if index is None: - index = toolz.first( - piece.index for piece in haystack if hasattr(piece, "index") - ) - pieces = reduce( - operator.add, - ( - pd.Series(getattr(piece, "values", piece), index=index).map( - 
ibis.util.promote_list - ) - for piece in haystack - ), - ) - return pieces - - -@execute_node.register(ops.FindInSet, pd.Series, tuple) -def execute_series_find_in_set(op, needle, haystack, **kwargs): - haystack = [execute(arg, **kwargs) for arg in haystack] - pieces = haystack_to_series_of_lists(haystack, index=needle.index) - index = itertools.count() - return pieces.map( - lambda elements, needle=needle, index=index: ( - ibis.util.safe_index(elements, needle.iat[next(index)]) - ) - ) - - -@execute_node.register(ops.FindInSet, SeriesGroupBy, list) -def execute_series_group_by_find_in_set(op, needle, haystack, **kwargs): - pieces = [getattr(piece, "obj", piece) for piece in haystack] - return execute_series_find_in_set(op, needle.obj, pieces, **kwargs).groupby( - get_grouping(needle.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.FindInSet, scalar_types, list) -def execute_string_group_by_find_in_set(op, needle, haystack, **kwargs): - # `list` could contain series, series groupbys, or scalars - # mixing series and series groupbys is not allowed - series_in_haystack = [ - type(piece) - for piece in haystack - if isinstance(piece, (pd.Series, SeriesGroupBy)) - ] - - if not series_in_haystack: - return ibis.util.safe_index(haystack, needle) - - try: - (collection_type,) = frozenset(map(type, series_in_haystack)) - except ValueError: - raise ValueError("Mixing Series and SeriesGroupBy is not allowed") - - pieces = haystack_to_series_of_lists( - [getattr(piece, "obj", piece) for piece in haystack] - ) - - result = pieces.map(toolz.flip(ibis.util.safe_index)(needle)) - if issubclass(collection_type, pd.Series): - return result - - assert issubclass(collection_type, SeriesGroupBy) - - return result.groupby( - get_grouping( - toolz.first( - piece.grouper.groupings - for piece in haystack - if hasattr(piece, "grouper") - ) - ), - group_keys=False, - ) - - -def try_getitem(value, key): - try: - # try to deserialize the value -> return None if it's None - if (js := json.loads(value)) is None: - return None - except (json.JSONDecodeError, TypeError): - # if there's an error related to decoding or a type error return None - return None - - try: - # try to extract the value as an array element or mapping key - return js[key] - except (KeyError, IndexError, TypeError): - # KeyError: missing mapping key - # IndexError: missing sequence key - # TypeError: `js` doesn't implement __getitem__, either at all or for - # the type of `key` - return None - - -@execute_node.register(ops.JSONGetItem, pd.Series, (str, int)) -def execute_json_getitem_series_str_int(_, data, key, **kwargs): - return pd.Series(map(partial(try_getitem, key=key), data), dtype="object") - - -@execute_node.register(ops.JSONGetItem, pd.Series, pd.Series) -def execute_json_getitem_series_series(_, data, key, **kwargs): - return pd.Series(map(try_getitem, data, key), dtype="object") - - -def _extract_url_field(data, field_name): - if isinstance(data, str): - return getattr(urlsplit(data), field_name, "") - - return pd.Series( - [getattr(urlsplit(string), field_name, "") for string in data], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.ExtractProtocol, (pd.Series, str)) -def execute_extract_protocol(op, data, **kwargs): - return _extract_url_field(data, "scheme") - - -@execute_node.register(ops.ExtractAuthority, (pd.Series, str)) -def execute_extract_authority(op, data, **kwargs): - return _extract_url_field(data, "netloc") - - -@execute_node.register(ops.ExtractPath, (pd.Series, str)) 
-def execute_extract_path(op, data, **kwargs): - return _extract_url_field(data, "path") - - -@execute_node.register(ops.ExtractFragment, (pd.Series, str)) -def execute_extract_fragment(op, data, **kwargs): - return _extract_url_field(data, "fragment") - - -@execute_node.register(ops.ExtractHost, (pd.Series, str)) -def execute_extract_host(op, data, **kwargs): - return _extract_url_field(data, "hostname") - - -@execute_node.register(ops.ExtractQuery, (pd.Series, str), (str, type(None))) -def execute_extract_query(op, data, key, **kwargs): - def extract_query_param(url, param_name): - query = urlsplit(url).query - if param_name is not None: - value = parse_qs(query)[param_name] - return value if len(value) > 1 else value[0] - else: - return query - - if isinstance(data, str): - return extract_query_param(data, key) - - return pd.Series( - [extract_query_param(url, key) for url in data], - dtype=data.dtype, - name=data.name, - ) - - -@execute_node.register(ops.ExtractUserInfo, (pd.Series, str)) -def execute_extract_user_info(op, data, **kwargs): - def extract_user_info(url): - url_parts = urlsplit(url) - - username = url_parts.username or "" - password = url_parts.password or "" - - return f"{username}:{password}" - - if isinstance(data, str): - return extract_user_info(data) - - return pd.Series( - [extract_user_info(string) for string in data], - dtype=data.dtype, - name=data.name, - ) diff --git a/ibis/backends/pandas/execution/structs.py b/ibis/backends/pandas/execution/structs.py deleted file mode 100644 index a2bcf7a94e11..000000000000 --- a/ibis/backends/pandas/execution/structs.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Pandas backend execution of struct fields and literals.""" - -from __future__ import annotations - -import collections -import functools - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops -from ibis.backends.pandas.dispatch import execute_node -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.StructField, (collections.abc.Mapping, pd.DataFrame)) -def execute_node_struct_field_dict(op, data, **kwargs): - return data[op.field] - - -@execute_node.register(ops.StructField, (type(None), type(pd.NA), float)) -def execute_node_struct_field_none(op, data, **_): - assert (isinstance(data, float) and pd.isna(data)) or not isinstance(data, float) - return pd.NA - - -def _safe_getter(value, field: str): - if pd.isna(value): - return pd.NA - else: - return value[field] - - -@execute_node.register(ops.StructField, pd.Series) -def execute_node_struct_field_series(op, data, **kwargs): - getter = functools.partial(_safe_getter, field=op.field) - return data.map(getter).rename(op.field) - - -@execute_node.register(ops.StructField, SeriesGroupBy) -def execute_node_struct_field_series_group_by(op, data, **kwargs): - getter = functools.partial(_safe_getter, field=op.field) - groupings = get_grouping(data.grouper.groupings) - return data.obj.map(getter).rename(op.field).groupby(groupings, group_keys=False) diff --git a/ibis/backends/pandas/execution/temporal.py b/ibis/backends/pandas/execution/temporal.py deleted file mode 100644 index a2f2b5d8b5ec..000000000000 --- a/ibis/backends/pandas/execution/temporal.py +++ /dev/null @@ -1,341 +0,0 @@ -from __future__ import annotations - -import datetime - -import numpy as np -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.base import 
BaseBackend -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import ( - date_types, - integer_types, - numeric_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution.util import get_grouping - - -@execute_node.register(ops.Strftime, pd.Timestamp, str) -def execute_strftime_timestamp_str(op, data, format_string, **kwargs): - return data.strftime(format_string) - - -@execute_node.register(ops.Strftime, pd.Series, str) -def execute_strftime_series_str(op, data, format_string, **kwargs): - return data.dt.strftime(format_string) - - -@execute_node.register(ops.ExtractTemporalField, datetime.datetime) -def execute_extract_timestamp_field_timestamp(op, data, **kwargs): - field_name = type(op).__name__.lower().replace("extract", "") - return getattr(data, field_name) - - -@execute_node.register(ops.ExtractTemporalField, pd.Series) -def execute_extract_timestamp_field_series(op, data, **kwargs): - field_name = type(op).__name__.lower().replace("extract", "") - if field_name == "weekofyear": - return data.dt.isocalendar().week.astype(np.int32) - return getattr(data.dt, field_name).astype(np.int32) - - -@execute_node.register(ops.ExtractMillisecond, datetime.datetime) -def execute_extract_millisecond_timestamp(op, data, **kwargs): - return int(data.microsecond // 1_000) - - -@execute_node.register(ops.ExtractMicrosecond, datetime.datetime) -def execute_extract_microsecond_timestamp(op, data, **kwargs): - return int(data.microsecond) - - -@execute_node.register(ops.ExtractMillisecond, pd.Series) -def execute_extract_millisecond_series(op, data, **kwargs): - return (data.dt.microsecond // 1_000).astype(np.int32) - - -@execute_node.register(ops.ExtractMicrosecond, pd.Series) -def execute_extract_microsecond_series(op, data, **kwargs): - return data.dt.microsecond.astype(np.int32) - - -@execute_node.register(ops.ExtractEpochSeconds, pd.Series) -def execute_epoch_seconds_series(op, data, **kwargs): - return ( - data.astype("datetime64[ns]") - .astype("int64") - .floordiv(1_000_000_000) - .astype("int32") - ) - - -@execute_node.register(ops.ExtractEpochSeconds, (pd.Timestamp, datetime.datetime)) -def execute_epoch_seconds_literal(op, data, **kwargs): - return pd.Timestamp(data).floor("s").value // 1_000_000_000 - - -@execute_node.register( - ops.BetweenTime, - pd.Series, - (pd.Series, str, datetime.time), - (pd.Series, str, datetime.time), -) -def execute_between_time(op, data, lower, upper, **kwargs): - idx = pd.DatetimeIndex(data) - if idx.tz is not None: - idx = idx.tz_convert(None) # make naive because times are naive - indexer = idx.indexer_between_time(lower, upper) - result = np.zeros(len(data), dtype=np.bool_) - result[indexer] = True - return pd.Series(result) - - -@execute_node.register(ops.Date, pd.Series) -def execute_timestamp_date(op, data, **kwargs): - return data.dt.floor("d") - - -PANDAS_UNITS = { - "m": "Min", - "ms": "L", -} - - -@execute_node.register((ops.TimestampTruncate, ops.DateTruncate), pd.Series) -def execute_timestamp_truncate(op, data, **kwargs): - dt = data.dt - unit = PANDAS_UNITS.get(op.unit.short, op.unit.short) - try: - return dt.floor(unit) - except ValueError: - return dt.to_period(unit).dt.to_timestamp() - - -OFFSET_CLASS = { - "Y": pd.offsets.DateOffset, - "Q": pd.offsets.DateOffset, - "M": pd.offsets.DateOffset, - "W": pd.offsets.DateOffset, - # all other units are timedelta64s -} - - -@execute_node.register(ops.IntervalFromInteger, 
pd.Series) -def execute_interval_from_integer_series(op, data, **kwargs): - unit = op.unit.short - resolution = op.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - # fast path for timedelta conversion - if cls is None: - return data.astype(f"timedelta64[{unit}]") - return data.apply(lambda n, cls=cls, resolution=resolution: cls(**{resolution: n})) - - -@execute_node.register(ops.IntervalFromInteger, integer_types) -def execute_interval_from_integer_integer_types(op, data, **kwargs): - unit = op.unit.short - resolution = op.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return pd.Timedelta(data, unit=unit) - return cls(**{resolution: data}) - - -@execute_node.register(ops.Cast, pd.Series, dt.Interval) -def execute_cast_integer_to_interval_series(op, data, type, **kwargs): - to = op.to - unit = to.unit.short - resolution = to.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return data.astype(f"timedelta64[{unit}]") - return data.apply(lambda n, cls=cls, resolution=resolution: cls(**{resolution: n})) - - -@execute_node.register(ops.Cast, integer_types, dt.Interval) -def execute_cast_integer_to_interval_integer_types(op, data, type, **kwargs): - to = op.to - unit = to.unit.short - resolution = to.unit.plural - cls = OFFSET_CLASS.get(unit, None) - - if cls is None: - return pd.Timedelta(data, unit=unit) - return cls(**{resolution: data}) - - -@execute_node.register(ops.TimestampAdd, timestamp_types, timedelta_types) -def execute_timestamp_add_datetime_timedelta(op, left, right, **kwargs): - return pd.Timestamp(left) + pd.Timedelta(right) - - -@execute_node.register(ops.TimestampAdd, timestamp_types, pd.Series) -def execute_timestamp_add_datetime_series(op, left, right, **kwargs): - return pd.Timestamp(left) + right - - -@execute_node.register(ops.IntervalAdd, timedelta_types, timedelta_types) -def execute_interval_add_delta_delta(op, left, right, **kwargs): - return op.op(pd.Timedelta(left), pd.Timedelta(right)) - - -@execute_node.register(ops.IntervalAdd, timedelta_types, pd.Series) -@execute_node.register( - ops.IntervalMultiply, timedelta_types, numeric_types + (pd.Series,) -) -def execute_interval_add_multiply_delta_series(op, left, right, **kwargs): - return op.op(pd.Timedelta(left), right) - - -@execute_node.register((ops.TimestampAdd, ops.IntervalAdd), pd.Series, timedelta_types) -def execute_timestamp_interval_add_series_delta(op, left, right, **kwargs): - return left + pd.Timedelta(right) - - -@execute_node.register((ops.TimestampAdd, ops.IntervalAdd), pd.Series, pd.Series) -def execute_timestamp_interval_add_series_series(op, left, right, **kwargs): - return left + right - - -@execute_node.register(ops.TimestampSub, timestamp_types, timedelta_types) -def execute_timestamp_sub_datetime_timedelta(op, left, right, **kwargs): - return pd.Timestamp(left) - pd.Timedelta(right) - - -@execute_node.register( - (ops.TimestampDiff, ops.TimestampSub), timestamp_types, pd.Series -) -def execute_timestamp_diff_sub_datetime_series(op, left, right, **kwargs): - return pd.Timestamp(left) - right - - -@execute_node.register(ops.TimestampSub, pd.Series, timedelta_types) -def execute_timestamp_sub_series_timedelta(op, left, right, **kwargs): - return left - pd.Timedelta(right) - - -@execute_node.register( - (ops.TimestampDiff, ops.TimestampSub, ops.IntervalSubtract), - pd.Series, - pd.Series, -) -def execute_timestamp_diff_sub_series_series(op, left, right, **kwargs): - return left - right - - -@execute_node.register(ops.TimestampDiff, timestamp_types, 
timestamp_types) -def execute_timestamp_diff_datetime_datetime(op, left, right, **kwargs): - return pd.Timestamp(left) - pd.Timestamp(right) - - -@execute_node.register(ops.TimestampDiff, pd.Series, timestamp_types) -def execute_timestamp_diff_series_datetime(op, left, right, **kwargs): - return left - pd.Timestamp(right) - - -@execute_node.register(ops.IntervalMultiply, pd.Series, numeric_types + (pd.Series,)) -@execute_node.register( - ops.IntervalFloorDivide, - (pd.Timedelta, pd.Series), - numeric_types + (pd.Series,), -) -def execute_interval_multiply_fdiv_series_numeric(op, left, right, **kwargs): - return op.op(left, right) - - -@execute_node.register(ops.TimestampFromUNIX, (pd.Series,) + integer_types) -def execute_timestamp_from_unix(op, data, **kwargs): - return pd.to_datetime(data, unit=op.unit.short) - - -@pre_execute.register(ops.TimestampNow) -@pre_execute.register(ops.TimestampNow, BaseBackend) -def pre_execute_timestamp_now(op, *args, **kwargs): - timecontext = kwargs.get("timecontext", None) - now = pd.Timestamp("now", tz="UTC").tz_localize(None) - return Scope({op: now}, timecontext) - - -@execute_node.register(ops.DayOfWeekIndex, (str, datetime.date)) -def execute_day_of_week_index_any(op, value, **kwargs): - return pd.Timestamp(value).dayofweek - - -@execute_node.register(ops.DayOfWeekIndex, pd.Series) -def execute_day_of_week_index_series(op, data, **kwargs): - return data.dt.dayofweek.astype(np.int16) - - -@execute_node.register(ops.DayOfWeekIndex, SeriesGroupBy) -def execute_day_of_week_index_series_group_by(op, data, **kwargs): - groupings = get_grouping(data.grouper.groupings) - return data.obj.dt.dayofweek.astype(np.int16).groupby(groupings, group_keys=False) - - -def day_name(obj: pd.core.indexes.accessors.DatetimeProperties | pd.Timestamp) -> str: - """Backwards compatible name-of-day getting function. 
- - Returns - ------- - str - The name of the day corresponding to `obj` - """ - try: - return obj.day_name() - except AttributeError: - return obj.weekday_name - - -@execute_node.register(ops.DayOfWeekName, (str, datetime.date)) -def execute_day_of_week_name_any(op, value, **kwargs): - return day_name(pd.Timestamp(value)) - - -@execute_node.register(ops.DayOfWeekName, pd.Series) -def execute_day_of_week_name_series(op, data, **kwargs): - return day_name(data.dt) - - -@execute_node.register(ops.DayOfWeekName, SeriesGroupBy) -def execute_day_of_week_name_series_group_by(op, data, **kwargs): - return day_name(data.obj.dt).groupby( - get_grouping(data.grouper.groupings), group_keys=False - ) - - -@execute_node.register(ops.DateSub, date_types, timedelta_types) -@execute_node.register(ops.DateSub, pd.Series, timedelta_types) -@execute_node.register((ops.DateDiff, ops.DateSub), pd.Series, pd.Series) -@execute_node.register(ops.DateDiff, date_types, date_types) -def execute_date_sub_diff(op, left, right, **kwargs): - return left - right - - -@execute_node.register((ops.DateDiff, ops.DateSub), date_types, pd.Series) -def execute_date_sub_diff_date_series(op, left, right, **kwargs): - return pd.Timestamp(left, unit="D") - right - - -@execute_node.register(ops.DateDiff, pd.Series, date_types) -def execute_date_sub_diff_series_date(op, left, right, **kwargs): - return left - pd.Timestamp(right, unit="D") - - -@execute_node.register(ops.DateAdd, pd.Series, timedelta_types) -@execute_node.register(ops.DateAdd, timedelta_types, pd.Series) -@execute_node.register(ops.DateAdd, pd.Series, pd.Series) -@execute_node.register(ops.DateAdd, date_types, timedelta_types) -@execute_node.register(ops.DateAdd, timedelta_types, date_types) -@execute_node.register(ops.DateAdd, date_types, pd.Series) -@execute_node.register(ops.DateAdd, pd.Series, date_types) -def execute_date_add(op, left, right, **kwargs): - return left + right diff --git a/ibis/backends/pandas/execution/timecontext.py b/ibis/backends/pandas/execution/timecontext.py deleted file mode 100644 index c9be8f75757f..000000000000 --- a/ibis/backends/pandas/execution/timecontext.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Implementation of compute_time_context for time context related operations. - -Time context of a node is computed at the beginning of execution phase. - -To use time context to load time series data: - -For operations like window, asof_join that adjust time context in execution, -implement ``compute_time_context`` to pass different time contexts to child -nodes. - -If ``pre_execute`` preloads any data, it should use timecontext to trim data -to be in the time range. - -``execute_node`` of a leaf node can use timecontext to trim data, or to pass -it as a filter in the database query. - -In some cases, data need to be trimmed in ``post_execute``. - -Note: In order to use the feature we implemented here, there must be a -column of Timestamp type, and named as 'time' in Table. And this 'time' -column should be preserved across the expression tree. If 'time' column is -dropped then execution will result in error. -See ``execute_database_table_client`` in ``generic.py``. -And we assume timecontext is passed in as a tuple (begin, end) where begin and -end are timestamp, or datetime string like "20100101". Time range is inclusive -(include both begin and end points). - -This is an optional feature. The result of executing an expression without time -context is conceptually the same as executing an expression with (-inf, inf) -time context. 
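(Editor's aside, not part of the diff: a minimal sketch of the time-context convention described in the docstring above, assuming a table with a 'time' column; the frame and dates below are made-up examples, not taken from the codebase.)

import pandas as pd

# a time context is an inclusive (begin, end) pair of timestamps
timecontext = (pd.Timestamp("20100101"), pd.Timestamp("20100103"))
df = pd.DataFrame(
    {"time": pd.date_range("20100101", periods=5, freq="D"), "value": range(5)}
)
begin, end = timecontext
# Series.between is inclusive on both ends, matching the convention above
trimmed = df[df["time"].between(begin, end)]  # keeps 2010-01-01 through 2010-01-03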
-""" -from __future__ import annotations - -from typing import TYPE_CHECKING - -import ibis.expr.operations as ops -from ibis.backends.base.df.timecontext import TimeContext, adjust_context -from ibis.backends.pandas.core import ( - compute_time_context, - get_node_arguments, - is_computable_input, -) - -if TYPE_CHECKING: - from ibis.backends.base import BaseBackend - from ibis.backends.base.df.scope import Scope - - -@compute_time_context.register(ops.AsOfJoin) -def compute_time_context_asof_join( - op: ops.AsOfJoin, - scope: Scope, - clients: list[BaseBackend], - timecontext: TimeContext | None = None, - **kwargs, -): - new_timecontexts = [ - timecontext for arg in get_node_arguments(op) if is_computable_input(arg) - ] - - if not timecontext: - return new_timecontexts - - # right table is the second node in children - new_timecontexts = [ - new_timecontexts[0], - adjust_context(op, scope, timecontext), - *new_timecontexts[2:], - ] - return new_timecontexts - - -@compute_time_context.register(ops.Window) -def compute_time_context_window( - op: ops.Window, - scope: Scope, - clients: list[BaseBackend], - timecontext: TimeContext | None = None, - **kwargs, -): - new_timecontexts = [ - timecontext for arg in get_node_arguments(op) if is_computable_input(arg) - ] - - if not timecontext: - return new_timecontexts - - result = adjust_context(op, scope, timecontext) - - new_timecontexts = [ - result for arg in get_node_arguments(op) if is_computable_input(arg) - ] - return new_timecontexts diff --git a/ibis/backends/pandas/execution/util.py b/ibis/backends/pandas/execution/util.py deleted file mode 100644 index 15b43c8832bd..000000000000 --- a/ibis/backends/pandas/execution/util.py +++ /dev/null @@ -1,144 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pandas as pd - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -import ibis.util -from ibis.backends.base.df.scope import Scope -from ibis.backends.pandas.core import execute -from ibis.backends.pandas.execution import constants - - -def get_grouping(grouper): - # this is such an annoying hack - assert isinstance(grouper, list) - if len(grouper) == 1: - return grouper[0] - return grouper - - -def get_join_suffix_for_op(op: ops.TableColumn, join_op: ops.Join): - (root_table,) = an.find_immediate_parent_tables(op) - left_root, right_root = an.find_immediate_parent_tables( - [join_op.left, join_op.right] - ) - return { - left_root: constants.LEFT_JOIN_SUFFIX, - right_root: constants.RIGHT_JOIN_SUFFIX, - }[root_table] - - -def compute_sort_key(key, data, timecontext, scope=None, **kwargs): - if key.shape.is_columnar(): - if key.name in data: - return key.name, None - else: - if scope is None: - scope = Scope() - scope = scope.merge_scopes( - Scope({t: data}, timecontext) - for t in an.find_immediate_parent_tables(key) - ) - new_column = execute(key, scope=scope, **kwargs) - name = ibis.util.guid() - new_column.name = name - return name, new_column - else: - raise NotImplementedError( - "Scalar sort keys are not yet supported in the pandas backend" - ) - - -def compute_sorted_frame(df, order_by, group_by=(), timecontext=None, **kwargs): - sort_keys = [] - ascending = [] - - for value in group_by: - sort_keys.append(value) - ascending.append(True) - for key in order_by: - sort_keys.append(key) - ascending.append(key.ascending) - - new_columns = {} - computed_sort_keys = [] - for key in sort_keys: - computed_sort_key, temporary_column = compute_sort_key( - key, df, timecontext, **kwargs - ) - 
computed_sort_keys.append(computed_sort_key) - - if temporary_column is not None: - new_columns[computed_sort_key] = temporary_column - - result = df.assign(**new_columns) - try: - result = result.sort_values( - computed_sort_keys, ascending=ascending, kind="mergesort" - ) - except TypeError: - result = result.sort_values(computed_sort_keys, ascending=ascending) - # TODO: we'll eventually need to return this frame with the temporary - # columns and drop them in the caller (maybe using post_execute?) - ngrouping_keys = len(group_by) - return ( - result, - computed_sort_keys[:ngrouping_keys], - computed_sort_keys[ngrouping_keys:], - ) - - -def coerce_to_output( - result: Any, node: ops.Node, index: pd.Index | None = None -) -> pd.Series | pd.DataFrame: - """Cast the result to either a Series or DataFrame. - - This method casts result of an execution to a Series or DataFrame, - depending on the type of the expression and shape of the result. - - Parameters - ---------- - result: Any - The result to cast - node: ibis.expr.operations.Node - The operation node associated with the result - index: pd.Index - Optional. If passed, scalar results will be broadcasted according - to the index. - - Returns - ------- - result: A Series or DataFrame - - Examples - -------- - For dataframe outputs, see ``ibis.util.coerce_to_dataframe``. - - >>> coerce_to_output(pd.Series(1), node) # quartodoc: +SKIP # doctest: +SKIP - 0 1 - Name: result, dtype: int64 - >>> coerce_to_output(1, node) # quartodoc: +SKIP # doctest: +SKIP - 0 1 - Name: result, dtype: int64 - >>> coerce_to_output(1, node, [1, 2, 3]) # quartodoc: +SKIP # doctest: +SKIP - 1 1 - 2 1 - 3 1 - Name: result, dtype: int64 - >>> coerce_to_output([1, 2, 3], node) # quartodoc: +SKIP # doctest: +SKIP - 0 [1, 2, 3] - Name: result, dtype: object - """ - if isinstance(result, pd.DataFrame): - rows = result.to_dict(orient="records") - return pd.Series(rows, name=node.name) - - # columnar result - if isinstance(result, pd.Series): - return result.rename(node.name) - - # Wrap `result` into a single-element Series. 
- return pd.Series([result], name=node.name) diff --git a/ibis/backends/pandas/execution/window.py b/ibis/backends/pandas/execution/window.py deleted file mode 100644 index 39475ecc2bb6..000000000000 --- a/ibis/backends/pandas/execution/window.py +++ /dev/null @@ -1,526 +0,0 @@ -"""Code for computing window functions with ibis and pandas.""" - -from __future__ import annotations - -import operator -from typing import TYPE_CHECKING, Any, Callable, NoReturn - -import numpy as np -import pandas as pd -import toolz -from multipledispatch import Dispatcher -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.analysis as an -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import ( - TimeContext, - construct_time_context_aware_series, - get_time_col, -) -from ibis.backends.pandas import aggcontext as agg_ctx -from ibis.backends.pandas.core import ( - compute_time_context, - date_types, - execute, - integer_types, - simple_types, - timedelta_types, - timestamp_types, -) -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution import util - -if TYPE_CHECKING: - from ibis.backends.pandas.aggcontext import AggregationContext - - -def _post_process_empty( - result: Any, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - # This is the post process of the no groupby nor orderby window - # `result` could be a Series, DataFrame, or a scalar. generated - # by `agg` method of class `Window`. For window without grouby or - # orderby, `agg` calls pands method directly. So if timecontext is - # present, we need to insert 'time' column into index for trimming the - # result. For cases when grouby or orderby is present, `agg` calls - # Ibis method `window_agg_built_in` and `window_agg_udf`, time - # context is already inserted there. - assert not order_by and not group_by - if isinstance(result, (pd.Series, pd.DataFrame)): - if timecontext: - result = construct_time_context_aware_series(result, parent) - return result - else: - # `result` is a scalar when a reduction operation is being - # applied over the window, since reduction operations are N->1 - # in this case we do not need to trim result by timecontext, - # just expand reduction result to be a Series with `index`. 
- index = parent.index - result = pd.Series([result]).repeat(len(index)) - result.index = index - return result - - -def _post_process_group_by( - series: pd.Series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - assert not order_by and group_by - return series - - -def _post_process_order_by( - series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - assert order_by and not group_by - indexed_parent = parent.set_index(order_by) - index = indexed_parent.index - - # get the names of the levels that will be in the result - series_index_names = frozenset(series.index.names) - - # get the levels common to series.index, in the order that they occur in - # the parent's index - reordered_levels = [name for name in index.names if name in series_index_names] - - if len(reordered_levels) > 1: - series = series.reorder_levels(reordered_levels) - - series = series.iloc[index.argsort(kind="mergesort")] - return series - - -def _post_process_group_by_order_by( - series: pd.Series, - parent: pd.DataFrame, - order_by: list[str], - group_by: list[str], - timecontext: TimeContext | None, -) -> pd.Series: - indexed_parent = parent.set_index(group_by + order_by, append=True) - index = indexed_parent.index - - # get the names of the levels that will be in the result - series_index_names = frozenset(series.index.names) - - # get the levels common to series.index, in the order that they occur in - # the parent's index - reordered_levels = [name for name in index.names if name in series_index_names] - - if len(reordered_levels) > 1: - series = series.reorder_levels(reordered_levels) - return series - - -get_aggcontext = Dispatcher("get_aggcontext") - - -@get_aggcontext.register(object) -def get_aggcontext_default( - window, - *, - scope, - operand, - parent, - group_by, - order_by, - **kwargs, -) -> NoReturn: - raise NotImplementedError( - f"get_aggcontext is not implemented for {type(window).__name__}" - ) - - -@get_aggcontext.register(ops.WindowFrame) -def get_aggcontext_window( - frame, - *, - scope, - operand, - parent, - group_by, - order_by, - **kwargs, -) -> AggregationContext: - # no order by or group by: default summarization aggcontext - # - # if we're reducing and we have an order by expression then we need to - # expand or roll. 
- # - # otherwise we're transforming - output_type = operand.dtype - - if not group_by and not order_by: - aggcontext = agg_ctx.Summarize(parent=parent, output_type=output_type) - elif group_by and not order_by: - # groupby transform (window with a partition by clause in SQL parlance) - aggcontext = agg_ctx.Transform( - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - elif frame.start is not None: - if isinstance(frame, ops.RowsWindowFrame): - max_lookback = frame.max_lookback - else: - max_lookback = None - - aggcontext = agg_ctx.Moving( - frame.start, - # FIXME(kszucs): I don't think that we have a proper max_lookback test - # case because passing None here is not braking anything - max_lookback=max_lookback, - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - else: - # expanding window - aggcontext = agg_ctx.Cumulative( - parent=parent, - group_by=group_by, - order_by=order_by, - output_type=output_type, - ) - - return aggcontext - - -def trim_window_result(data: pd.Series | pd.DataFrame, timecontext: TimeContext | None): - """Trim data within time range defined by timecontext. - - This is a util function used in ``execute_window_op``, where time - context might be adjusted for calculation. Data must be trimmed - within the original time context before return. - `data` is a pd.Series with Multiindex for most cases, for multi - column udf result, `data` could be a pd.DataFrame - - Params - ------ - data: pd.Series or pd.DataFrame - timecontext: Optional[TimeContext] - - Returns - ------- - a trimmed pd.Series or or pd.DataFrame with the same Multiindex - as data's - """ - # noop if timecontext is None - if not timecontext: - return data - assert isinstance( - data, (pd.Series, pd.DataFrame) - ), "window computed columns is not a pd.Series nor a pd.DataFrame" - - # reset multiindex, convert Series into a DataFrame - df = data.reset_index() - - # Filter the data, here we preserve the time index so that when user is - # computing a single column, the computation and the relevant time - # indexes are returned. 
- time_col = get_time_col() - if time_col not in df: - return data - - subset = df.loc[df[time_col].between(*timecontext)] - - # Get columns to set for index - if isinstance(data, pd.Series): - # if Series doesn't contain a name, reset_index will assign - # '0' as the column name for the column of value - name = data.name if data.name else 0 - index_columns = list(subset.columns.difference([name])) - else: - name = data.columns - index_columns = list(subset.columns.difference(name)) - - # set the correct index for return Series / DataFrame - indexed_subset = subset.set_index(index_columns) - return indexed_subset[name] - - -@execute_node.register(ops.WindowFunction, [pd.Series]) -def execute_window_op( - op, - *data, - scope: Scope | None = None, - timecontext: TimeContext | None = None, - aggcontext=None, - clients=None, - **kwargs, -): - func, frame = op.func, op.frame - - if frame.how == "range" and any( - not col.dtype.is_temporal() for col in frame.order_by - ): - raise NotImplementedError( - "The pandas backend only implements range windows with temporal " - "ordering keys" - ) - - # pre execute "manually" here because otherwise we wouldn't pickup - # relevant scope changes from the child operand since we're managing - # execution of that by hand - - adjusted_timecontext = None - if timecontext: - arg_timecontexts = compute_time_context( - op, timecontext=timecontext, clients=clients, scope=scope - ) - # timecontext is the original time context required by parent node - # of this Window, while adjusted_timecontext is the adjusted context - # of this Window, since we are doing a manual execution here, use - # adjusted_timecontext in later execution phases - adjusted_timecontext = arg_timecontexts[0] - - pre_executed_scope = pre_execute( - func, - *clients, - scope=scope, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - **kwargs, - ) - if scope is None: - scope = pre_executed_scope - else: - scope = scope.merge_scope(pre_executed_scope) - - root_table = an.find_first_base_table(op) - data = execute( - root_table, - scope=scope, - timecontext=adjusted_timecontext, - clients=clients, - aggcontext=aggcontext, - **kwargs, - ) - - grouping_keys = [ - key.name - if isinstance(key, ops.TableColumn) - else execute( - key, - scope=scope, - clients=clients, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - **kwargs, - ) - for key in frame.group_by - ] - - if not frame.order_by: - ordering_keys = [] - - post_process: Callable[ - [Any, pd.DataFrame, list[str], list[str], TimeContext | None], - pd.Series, - ] - if frame.group_by: - if frame.order_by: - sorted_df, grouping_keys, ordering_keys = util.compute_sorted_frame( - data, - frame.order_by, - group_by=frame.group_by, - timecontext=adjusted_timecontext, - **kwargs, - ) - source = sorted_df.groupby(grouping_keys, sort=True, group_keys=False) - post_process = _post_process_group_by_order_by - else: - source = data.groupby(grouping_keys, sort=False, group_keys=False) - post_process = _post_process_group_by - elif frame.order_by: - source, grouping_keys, ordering_keys = util.compute_sorted_frame( - data, frame.order_by, timecontext=adjusted_timecontext, **kwargs - ) - post_process = _post_process_order_by - else: - source = data - post_process = _post_process_empty - - # Here groupby object should be add to the corresponding node in scope - # for execution, data will be overwrite to a groupby object, so we - # force an update regardless of time context - new_scope = scope.merge_scopes( - [ - Scope({t: source}, 
adjusted_timecontext) - for t in an.find_immediate_parent_tables(func) - ], - overwrite=True, - ) - - aggcontext = get_aggcontext( - frame, - scope=scope, - operand=func, - parent=source, - group_by=grouping_keys, - order_by=ordering_keys, - **kwargs, - ) - result = execute( - func, - scope=new_scope, - timecontext=adjusted_timecontext, - aggcontext=aggcontext, - clients=clients, - **kwargs, - ) - result = post_process( - result, - data, - ordering_keys, - grouping_keys, - adjusted_timecontext, - ) - assert len(data) == len( - result - ), "input data source and computed column do not have the same length" - - # trim data to original time context - result = trim_window_result(result, timecontext) - return result - - -def post_lead_lag(result, default): - if not pd.isnull(default): - return result.fillna(default) - return result - - -@execute_node.register( - (ops.Lead, ops.Lag), - (pd.Series, SeriesGroupBy), - integer_types + (type(None),), - simple_types + (type(None),), -) -def execute_series_lead_lag(op, data, offset, default, **kwargs): - func = toolz.identity if isinstance(op, ops.Lag) else operator.neg - result = data.shift(func(1 if offset is None else offset)) - return post_lead_lag(result, default) - - -@execute_node.register( - (ops.Lead, ops.Lag), - (pd.Series, SeriesGroupBy), - timedelta_types, - date_types + timestamp_types + (str, type(None)), -) -def execute_series_lead_lag_timedelta( - op, data, offset, default, aggcontext=None, **kwargs -): - """Shift a column relative to another one in units of time instead of rows.""" - # lagging adds time (delayed), leading subtracts time (moved up) - func = operator.add if isinstance(op, ops.Lag) else operator.sub - group_by = aggcontext.group_by - order_by = aggcontext.order_by - - # get the parent object from which `data` originated - parent = aggcontext.parent - - # get the DataFrame from the parent object, handling the DataFrameGroupBy - # case - parent_df = getattr(parent, "obj", parent) - - # index our parent df by grouping and ordering keys - indexed_original_df = parent_df.set_index(group_by + order_by) - - # perform the time shift - adjusted_parent_df = parent_df.assign( - **{k: func(parent_df[k], offset) for k in order_by} - ) - - # index the parent *after* adjustment - adjusted_indexed_parent = adjusted_parent_df.set_index(group_by + order_by) - - # get the column we care about - result = adjusted_indexed_parent[getattr(data, "obj", data).name] - - # reindex the shifted data by the original frame's index - result = result.reindex(indexed_original_df.index) - - # add a default if necessary - return post_lead_lag(result, default) - - -@execute_node.register(ops.FirstValue, pd.Series) -def execute_series_first_value(op, data, **kwargs): - return data.iloc[np.repeat(0, len(data))] - - -def _getter(x: pd.Series | np.ndarray, idx: int): - return getattr(x, "values", x)[idx] - - -@execute_node.register(ops.FirstValue, SeriesGroupBy) -def execute_series_group_by_first_value(op, data, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: _getter(x, 0)) - - -@execute_node.register(ops.LastValue, pd.Series) -def execute_series_last_value(op, data, **kwargs): - return data.iloc[np.repeat(-1, len(data))] - - -@execute_node.register(ops.LastValue, SeriesGroupBy) -def execute_series_group_by_last_value(op, data, aggcontext=None, **kwargs): - return aggcontext.agg(data, lambda x: _getter(x, -1)) - - -@execute_node.register(ops.MinRank) -def execute_series_min_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - 
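(Editor's aside, not part of the diff: the rank arithmetic used by the MinRank/DenseRank kernels nearby, shown on a made-up series; the subtraction of 1 in those kernels suggests a 0-based rank convention, while pandas ranks are 1-based.)

import pandas as pd

s = pd.Series([10, 20, 20, 30])
min_rank = s.rank(method="min").astype("int64") - 1      # 0, 1, 1, 3
dense_rank = s.rank(method="dense").astype("int64") - 1  # 0, 1, 1, 2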
df = aggcontext.parent - data = df[key] - return data.rank(method="min", ascending=True).astype("int64") - 1 - - -@execute_node.register(ops.DenseRank) -def execute_series_dense_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - return data.rank(method="dense", ascending=True).astype("int64") - 1 - - -@execute_node.register(ops.PercentRank) -def execute_series_group_by_percent_rank(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - - result = data.rank(method="min", ascending=True) - 1 - - if isinstance(data, SeriesGroupBy): - nrows = data.transform("count") - else: - nrows = len(data) - - result /= nrows - 1 - return result - - -@execute_node.register(ops.CumeDist) -def execute_series_group_by_cume_dist(op, aggcontext=None, **kwargs): - (key,) = aggcontext.order_by - df = aggcontext.parent - data = df[key] - return data.rank(method="min", ascending=True, pct=True) diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py new file mode 100644 index 000000000000..f9dd69a3c027 --- /dev/null +++ b/ibis/backends/pandas/executor.py @@ -0,0 +1,761 @@ +from __future__ import annotations + +import operator +from functools import reduce + +import numpy as np +import pandas as pd + +import ibis.expr.operations as ops +from ibis.backends.pandas.convert import PandasConverter +from ibis.backends.pandas.helpers import ( + GroupedFrame, + RangeFrame, + RowsFrame, + UngroupedFrame, + agg, + asframe, + asseries, + columnwise, + elementwise, + rowwise, + serieswise, +) +from ibis.backends.pandas.kernels import pick_kernel +from ibis.backends.pandas.rewrites import ( + PandasAggregate, + PandasAsofJoin, + PandasJoin, + PandasLimit, + PandasRename, + PandasScalarSubquery, + plan, +) +from ibis.common.dispatch import Dispatched +from ibis.common.exceptions import OperationNotDefinedError, UnboundExpressionError +from ibis.formats.pandas import PandasData +from ibis.util import gen_name + +# ruff: noqa: F811 + + +_reduction_operations = { + ops.Min: lambda x: x.min(), + ops.Max: lambda x: x.max(), + ops.Sum: lambda x: x.sum(), + ops.Mean: lambda x: x.mean(), + ops.Count: lambda x: x.count(), + ops.Mode: lambda x: x.mode().iat[0], + ops.Any: lambda x: x.any(), + ops.All: lambda x: x.all(), + ops.Median: lambda x: x.median(), + ops.ApproxMedian: lambda x: x.median(), + ops.BitAnd: lambda x: np.bitwise_and.reduce(x.values), + ops.BitOr: lambda x: np.bitwise_or.reduce(x.values), + ops.BitXor: lambda x: np.bitwise_xor.reduce(x.values), + ops.Last: lambda x: x.iat[-1], + ops.First: lambda x: x.iat[0], + ops.CountDistinct: lambda x: x.nunique(), + ops.ApproxCountDistinct: lambda x: x.nunique(), + ops.ArrayCollect: lambda x: x.tolist(), +} + + +class Executor(Dispatched): + @classmethod + def visit(cls, op: ops.Node, **kwargs): + raise OperationNotDefinedError( + f"Operation {op!r} is not implemented for the pandas backend" + ) + + @classmethod + def visit(cls, op: ops.Literal, value, dtype): + if dtype.is_interval(): + value = pd.Timedelta(value, dtype.unit.short) + elif dtype.is_array(): + value = np.array(value) + elif dtype.is_date(): + value = pd.Timestamp(value, tz="UTC").tz_localize(None) + return value + + @classmethod + def visit(cls, op: ops.Field, rel, name): + return rel[name] + + @classmethod + def visit(cls, op: ops.Alias, arg, name): + try: + return arg.rename(name) + except AttributeError: + return arg + + @classmethod + def visit(cls, op: ops.SortKey, expr, 
ascending): + return expr + + @classmethod + def visit(cls, op: ops.Cast, arg, to): + if isinstance(arg, pd.Series): + return PandasConverter.convert_column(arg, to) + else: + return PandasConverter.convert_scalar(arg, to) + + @classmethod + def visit(cls, op: ops.TypeOf, arg): + raise OperationNotDefinedError("TypeOf is not implemented") + + @classmethod + def visit(cls, op: ops.RandomScalar): + raise OperationNotDefinedError("RandomScalar is not implemented") + + @classmethod + def visit(cls, op: ops.Greatest, arg): + return columnwise(lambda df: df.max(axis=1), arg) + + @classmethod + def visit(cls, op: ops.Least, arg): + return columnwise(lambda df: df.min(axis=1), arg) + + @classmethod + def visit(cls, op: ops.Coalesce, arg): + return columnwise(lambda df: df.bfill(axis=1).iloc[:, 0], arg) + + @classmethod + def visit(cls, op: ops.Value, **operands): + return pick_kernel(op, operands) + + @classmethod + def visit(cls, op: ops.IsNan, arg): + try: + return np.isnan(arg) + except (TypeError, ValueError): + # if `arg` contains `None` np.isnan will complain + # so we take advantage of NaN not equaling itself + # to do the correct thing + return arg != arg + + @classmethod + def visit(cls, op: ops.SearchedCase, cases, results, default): + cases, _ = asframe(cases, concat=False) + results, _ = asframe(results, concat=False) + out = np.select(cases, results, default) + return pd.Series(out) + + @classmethod + def visit(cls, op: ops.SimpleCase, base, cases, results, default): + if isinstance(default, pd.Series): + raise NotImplementedError( + "SimpleCase with a columnar shaped default value is not implemented" + ) + cases = tuple(base == case for case in cases) + cases, _ = asframe(cases, concat=False) + results, _ = asframe(results, concat=False) + out = np.select(cases, results, default) + return pd.Series(out) + + @classmethod + def visit(cls, op: ops.TimestampTruncate | ops.DateTruncate, arg, unit): + # TODO(kszucs): should use serieswise() + unit = {"m": "Min", "ms": "L"}.get(unit.short, unit.short) + try: + return arg.dt.floor(unit) + except ValueError: + return arg.dt.to_period(unit).dt.to_timestamp() + + @classmethod + def visit(cls, op: ops.IntervalFromInteger, unit, **kwargs): + if unit.short in {"Y", "Q", "M", "W"}: + return elementwise(lambda v: pd.DateOffset(**{unit.plural: v}), kwargs) + else: + return serieswise( + lambda arg: arg.astype(f"timedelta64[{unit.short}]"), kwargs + ) + + @classmethod + def visit(cls, op: ops.BetweenTime, arg, lower_bound, upper_bound): + idx = pd.DatetimeIndex(arg) + if idx.tz is not None: + idx = idx.tz_convert(None) # make naive because times are naive + indexer = idx.indexer_between_time(lower_bound, upper_bound) + result = np.zeros(len(arg), dtype=np.bool_) + result[indexer] = True + return pd.Series(result) + + @classmethod + def visit(cls, op: ops.FindInSet, needle, values): + (needle, *haystack), _ = asframe((needle, *values), concat=False) + condlist = [needle == col for col in haystack] + choicelist = [i for i, _ in enumerate(haystack)] + result = np.select(condlist, choicelist, default=-1) + return pd.Series(result, name=op.name) + + @classmethod + def visit(cls, op: ops.Array, exprs): + return rowwise(lambda row: np.array(row, dtype=object), exprs) + + @classmethod + def visit(cls, op: ops.ArrayConcat, arg): + return rowwise(lambda row: np.concatenate(row.values), arg) + + @classmethod + def visit(cls, op: ops.Unnest, arg): + arg = asseries(arg) + mask = arg.map(lambda v: bool(len(v)), na_action="ignore") + return arg[mask].explode() + + 
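(Editor's aside, not part of the diff: the column-wise coalesce trick used by the ops.Coalesce handler earlier in this file, namely back-fill across columns and then take the first column; the frame below is a made-up example.)

import pandas as pd

df = pd.DataFrame(
    {"_0": [None, 2.0, None], "_1": [1.0, None, None], "_2": [9.0, 9.0, 9.0]}
)
# back-fill row-wise, then the first column holds the first non-null value per row
coalesced = df.bfill(axis=1).iloc[:, 0]  # 1.0, 2.0, 9.0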
@classmethod + def visit( + cls, op: ops.ElementWiseVectorizedUDF, func, func_args, input_type, return_type + ): + """Execute an elementwise UDF.""" + + res = func(*func_args) + if isinstance(res, pd.DataFrame): + # it is important otherwise it is going to fill up the memory + res = res.apply(lambda row: row.to_dict(), axis=1) + + return res + + ############################# Reductions ################################## + + @classmethod + def visit(cls, op: ops.Reduction, arg, where): + func = _reduction_operations[type(op)] + return agg(func, arg, where) + + @classmethod + def visit(cls, op: ops.CountStar, arg, where): + def agg(df): + if where is None: + return len(df) + else: + return df[where.name].sum() + + return agg + + @classmethod + def visit(cls, op: ops.CountDistinctStar, arg, where): + def agg(df): + if where is None: + return df.nunique() + else: + return df[where.name].nunique() + + return agg + + @classmethod + def visit(cls, op: ops.Arbitrary, arg, where, how): + if how == "first": + return agg(lambda x: x.iat[0], arg, where) + elif how == "last": + return agg(lambda x: x.iat[-1], arg, where) + else: + raise OperationNotDefinedError(f"Arbitrary {how!r} is not supported") + + @classmethod + def visit(cls, op: ops.ArgMin | ops.ArgMax, arg, key, where): + func = operator.methodcaller(op.__class__.__name__.lower()) + + if where is None: + + def agg(df): + indices = func(df[key.name]) + return df[arg.name].iloc[indices] + else: + + def agg(df): + mask = df[where.name] + filtered = df[mask] + indices = func(filtered[key.name]) + return filtered[arg.name].iloc[indices] + + return agg + + @classmethod + def visit(cls, op: ops.Variance, arg, where, how): + ddof = {"pop": 0, "sample": 1}[how] + return agg(lambda x: x.var(ddof=ddof), arg, where) + + @classmethod + def visit(cls, op: ops.StandardDev, arg, where, how): + ddof = {"pop": 0, "sample": 1}[how] + return agg(lambda x: x.std(ddof=ddof), arg, where) + + @classmethod + def visit(cls, op: ops.Correlation, left, right, where, how): + if where is None: + + def agg(df): + return df[left.name].corr(df[right.name]) + else: + + def agg(df): + mask = df[where.name] + lhs = df[left.name][mask] + rhs = df[right.name][mask] + return lhs.corr(rhs) + + return agg + + @classmethod + def visit(cls, op: ops.Covariance, left, right, where, how): + ddof = {"pop": 0, "sample": 1}[how] + if where is None: + + def agg(df): + return df[left.name].cov(df[right.name], ddof=ddof) + else: + + def agg(df): + mask = df[where.name] + lhs = df[left.name][mask] + rhs = df[right.name][mask] + return lhs.cov(rhs, ddof=ddof) + + return agg + + @classmethod + def visit(cls, op: ops.GroupConcat, arg, sep, where): + if where is None: + + def agg(df): + return sep.join(df[arg.name].astype(str)) + else: + + def agg(df): + mask = df[where.name] + group = df[arg.name][mask] + if group.empty: + return pd.NA + return sep.join(group) + + return agg + + @classmethod + def visit(cls, op: ops.Quantile, arg, quantile, where): + return agg(lambda x: x.quantile(quantile), arg, where) + + @classmethod + def visit(cls, op: ops.MultiQuantile, arg, quantile, where): + return agg(lambda x: list(x.quantile(quantile)), arg, where) + + @classmethod + def visit( + cls, op: ops.ReductionVectorizedUDF, func, func_args, input_type, return_type + ): + def agg(df): + args = [df[col.name] for col in func_args] + return func(*args) + + return agg + + ############################# Analytic #################################### + + @classmethod + def visit(cls, op: ops.RowNumber): + def agg(df, 
order_keys): + return pd.Series(np.arange(len(df)), index=df.index) + + return agg + + @classmethod + def visit(cls, op: ops.Lag | ops.Lead, arg, offset, default): + if isinstance(op, ops.Lag): + sign = lambda x: x + else: + sign = lambda x: -x + + if op.offset is not None and op.offset.dtype.is_interval(): + + def agg(df, order_keys): + df = df.set_index(order_keys) + col = df[arg.name].shift(freq=sign(offset)) + return col.reindex(df.index, fill_value=default) + else: + offset = 1 if offset is None else offset + + def agg(df, order_keys): + return df[arg.name].shift(sign(offset), fill_value=default) + + return agg + + @classmethod + def visit(cls, op: ops.MinRank | ops.DenseRank): + method = "dense" if isinstance(op, ops.DenseRank) else "min" + + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method=method).astype("int64") - 1 + + return agg + + @classmethod + def visit(cls, op: ops.PercentRank): + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method="min").sub(1).div(len(df) - 1) + + return agg + + @classmethod + def visit(cls, op: ops.CumeDist): + def agg(df, order_keys): + if len(order_keys) == 0: + raise ValueError("order_by argument is required for rank functions") + elif len(order_keys) == 1: + s = df[order_keys[0]] + else: + s = df[order_keys].apply(tuple, axis=1) + + return s.rank(method="average", pct=True) + + return agg + + @classmethod + def visit(cls, op: ops.FirstValue | ops.LastValue, arg): + i = 0 if isinstance(op, ops.FirstValue) else -1 + + def agg(df, order_keys): + return df[arg.name].iat[i] + + return agg + + @classmethod + def visit( + cls, op: ops.AnalyticVectorizedUDF, func, func_args, input_type, return_type + ): + def agg(df, order_keys): + args = [df[col.name] for col in func_args] + return func(*args) + + return agg + + ############################ Window functions ############################# + + @classmethod + def visit(cls, op: ops.WindowBoundary, value, preceding): + return value + + @classmethod + def visit( + cls, op: ops.WindowFrame, table, start, end, group_by, order_by, **kwargs + ): + if start is not None: + start = asseries(start, len(table)) + if op.start.preceding: + start = -start + if end is not None: + end = asseries(end, len(table)) + if op.end.preceding: + end = -end + + table = table.assign(__start__=start, __end__=end) + + # TODO(kszucs): order by ibis.random() is not supported because it is + # excluded from the group by keys due to its scalar shape + group_keys = [group.name for group in op.group_by] + order_keys = [key.name for key in op.order_by if key.shape.is_columnar()] + ascending = [key.ascending for key in op.order_by if key.shape.is_columnar()] + + if order_by: + table = table.sort_values(order_keys, ascending=ascending, kind="mergesort") + + if group_by: + frame = GroupedFrame(df=table, group_keys=group_keys) + else: + frame = UngroupedFrame(df=table) + + if start is None and end is None: + return frame + elif op.how == "rows": + return RowsFrame(parent=frame) + elif op.how == "range": + if len(order_keys) != 1: + raise NotImplementedError( + "Only single column order by is supported for range window frames" + ) + return 
RangeFrame(parent=frame, order_key=order_keys[0]) + else: + raise NotImplementedError(f"Unsupported window frame type: {op.how}") + + @classmethod + def visit(cls, op: ops.WindowFunction, func, frame): + if isinstance(op.func, ops.Analytic): + order_keys = [key.name for key in op.frame.order_by] + return frame.apply_analytic(func, order_keys=order_keys) + else: + return frame.apply_reduction(func) + + ############################ Relational ################################### + + @classmethod + def visit(cls, op: ops.DatabaseTable, name, schema, source, namespace): + try: + return source.dictionary[name] + except KeyError: + raise UnboundExpressionError( + f"{name} is not a table in the {source.name!r} backend, you " + "probably tried to execute an expression without a data source" + ) + + @classmethod + def visit(cls, op: ops.InMemoryTable, name, schema, data): + return data.to_frame() + + @classmethod + def visit(cls, op: ops.DummyTable, values): + df, _ = asframe(values) + return df + + @classmethod + def visit(cls, op: ops.SelfReference | ops.JoinTable, parent, **kwargs): + return parent + + @classmethod + def visit(cls, op: PandasRename, parent, mapping): + return parent.rename(columns=mapping) + + @classmethod + def visit(cls, op: PandasLimit, parent, n, offset): + n = n.iat[0, 0] + offset = offset.iat[0, 0] + if n is None: + return parent.iloc[offset:] + else: + return parent.iloc[offset : offset + n] + + @classmethod + def visit(cls, op: ops.Sample, parent, fraction, method, seed): + return parent.sample(frac=fraction, random_state=seed) + + @classmethod + def visit(cls, op: ops.Project, parent, values): + df, all_scalars = asframe(values) + if all_scalars and len(parent) != len(df): + df = pd.concat([df] * len(parent)) + return df + + @classmethod + def visit(cls, op: ops.Filter, parent, predicates): + if predicates: + pred = reduce(operator.and_, predicates) + if len(pred) != len(parent): + raise RuntimeError( + "Selection predicate length does not match underlying table" + ) + parent = parent.loc[pred].reset_index(drop=True) + return parent + + @classmethod + def visit(cls, op: ops.Sort, parent, keys): + # 1. add sort key columns to the dataframe if they are not already present + # 2. sort the dataframe using those columns + # 3. 
drop the sort key columns + ascending = [key.ascending for key in op.keys] + newcols = {gen_name("sort_key"): col for col in keys} + names = list(newcols.keys()) + df = parent.assign(**newcols) + df = df.sort_values(by=names, ascending=ascending, ignore_index=True) + return df.drop(names, axis=1) + + @classmethod + def visit(cls, op: PandasAggregate, parent, groups, metrics): + if groups: + parent = parent.groupby([col.name for col in groups.values()]) + metrics = {k: parent.apply(v) for k, v in metrics.items()} + result = pd.concat(metrics, axis=1).reset_index() + renames = {v.name: k for k, v in op.groups.items()} + return result.rename(columns=renames) + else: + results = {k: v(parent) for k, v in metrics.items()} + combined, _ = asframe(results) + return combined + + @classmethod + def visit(cls, op: PandasJoin, how, left, right, left_on, right_on): + # broadcast predicates if they are scalar values + left_size = len(left) + left_on = [asseries(v, left_size) for v in left_on] + right_size = len(right) + right_on = [asseries(v, right_size) for v in right_on] + + if how == "cross": + assert not left_on and not right_on + return pd.merge(left, right, how="cross") + elif how == "anti": + df = pd.merge( + left, + right, + how="outer", + left_on=left_on, + right_on=right_on, + indicator=True, + ) + df = df[df["_merge"] == "left_only"] + return df.drop(columns=["_merge"]) + elif how == "semi": + mask = asseries(True, left_size) + for left_pred, right_pred in zip(left_on, right_on): + mask = mask & left_pred.isin(right_pred) + return left[mask] + else: + df = left.merge(right, how=how, left_on=left_on, right_on=right_on) + return df.drop(columns=[f"key_{i}" for i in range(len(left_on))]) + + @classmethod + def visit( + cls, + op: PandasAsofJoin, + how, + left, + right, + left_on, + right_on, + left_by, + right_by, + operator, + ): + # broadcast predicates if they are scalar values + left_size = len(left) + right_size = len(right) + left_on = [asseries(v, left_size) for v in left_on] + left_by = [asseries(v, left_size) for v in left_by] + right_on = [asseries(v, right_size) for v in right_on] + right_by = [asseries(v, right_size) for v in right_by] + + # merge_asof only works with column names not with series + left_on = {gen_name("left"): s for s in left_on} + left_by = {gen_name("left"): s for s in left_by} + right_on = {gen_name("right"): s for s in right_on} + right_by = {gen_name("right"): s for s in right_by} + + left = left.assign(**left_on, **left_by) + right = right.assign(**right_on, **right_by) + + # construct the appropriate flags for merge_asof + if operator == ops.LessEqual: + direction = "forward" + allow_exact_matches = True + elif operator == ops.GreaterEqual: + direction = "backward" + allow_exact_matches = True + elif operator == ops.Less: + direction = "forward" + allow_exact_matches = False + elif operator == ops.Greater: + direction = "backward" + allow_exact_matches = False + elif operator == ops.Equals: + direction = "nearest" + allow_exact_matches = True + else: + raise NotImplementedError( + f"Operator {operator} not supported for asof join" + ) + + # merge_asof requires the left side to be sorted by the join keys + left = left.sort_values(by=list(left_on.keys())) + df = pd.merge_asof( + left, + right, + left_on=list(left_on.keys()), + right_on=list(right_on.keys()), + left_by=list(left_by.keys()) or None, + right_by=list(right_by.keys()) or None, + direction=direction, + allow_exact_matches=allow_exact_matches, + ) + return df + + @classmethod + def visit(cls, op: 
ops.Union, left, right, distinct): + result = pd.concat([left, right], axis=0) + return result.drop_duplicates() if distinct else result + + @classmethod + def visit(cls, op: ops.Intersection, left, right, distinct): + if not distinct: + raise NotImplementedError( + "`distinct=False` is not supported by the pandas backend" + ) + return left.merge(right, on=list(left.columns), how="inner") + + @classmethod + def visit(cls, op: ops.Difference, left, right, distinct): + if not distinct: + raise NotImplementedError( + "`distinct=False` is not supported by the pandas backend" + ) + merged = left.merge(right, on=list(left.columns), how="outer", indicator=True) + result = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) + return result + + @classmethod + def visit(cls, op: ops.Distinct, parent): + return parent.drop_duplicates() + + @classmethod + def visit(cls, op: ops.DropNa, parent, how, subset): + if op.subset is not None: + subset = [col.name for col in op.subset] + else: + subset = None + return parent.dropna(how=how, subset=subset) + + @classmethod + def visit(cls, op: ops.FillNa, parent, replacements): + return parent.fillna(replacements) + + @classmethod + def visit(cls, op: ops.InValues, value, options): + if isinstance(value, pd.Series): + return value.isin(options) + else: + return value in options + + @classmethod + def visit(cls, op: ops.InSubquery, rel, needle): + first_column = rel.iloc[:, 0] + if isinstance(needle, pd.Series): + return needle.isin(first_column) + else: + return needle in first_column + + @classmethod + def visit(cls, op: PandasScalarSubquery, rel): + return rel.iat[0, 0] + + @classmethod + def execute(cls, node, backend, params): + def fn(node, _, **kwargs): + return cls.visit(node, **kwargs) + + original = node + node = node.to_expr().as_table().op() + node = plan(node, backend=backend, params=params) + df = node.map_clear(fn) + + # TODO(kszucs): add a flag to disable this conversion because it can be + # expensive for columns with object dtype + df = PandasData.convert_table(df, node.schema) + if isinstance(original, ops.Value): + if original.shape.is_scalar(): + return df.iloc[0, 0] + elif original.shape.is_columnar(): + return df.iloc[:, 0] + else: + raise TypeError(f"Unexpected shape: {original.shape}") + else: + return df diff --git a/ibis/backends/pandas/helpers.py b/ibis/backends/pandas/helpers.py new file mode 100644 index 000000000000..d8bc9efd54eb --- /dev/null +++ b/ibis/backends/pandas/helpers.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import itertools +from typing import Callable + +import numpy as np +import pandas as pd + +from ibis.util import gen_name + + +def asseries(value, size=1): + """Ensure that value is a pandas Series object, broadcast if necessary.""" + if isinstance(value, pd.Series): + return value + elif isinstance(value, (list, np.ndarray)): + return pd.Series(itertools.repeat(np.array(value), size)) + else: + return pd.Series(np.repeat(value, size)) + + +def asframe(values: dict | tuple, concat=True): + """Construct a DataFrame from a dict or tuple of Series objects.""" + if isinstance(values, dict): + names, values = zip(*values.items()) + elif isinstance(values, tuple): + names = [f"_{i}" for i in range(len(values))] + else: + raise TypeError(f"values must be a dict, or tuple; got {type(values)}") + + size = 1 + all_scalars = True + for v in values: + if isinstance(v, pd.Series): + size = len(v) + all_scalars = False + break + + columns = [asseries(v, size) for v in values] + if concat: + df = 
pd.concat(columns, axis=1, keys=names).reset_index(drop=True) + return df, all_scalars + else: + return columns, all_scalars + + +def generic(func: Callable, operands): + return func(*operands.values()) + + +def rowwise(func: Callable, operands): + # dealing with a collection of series objects + df, all_scalars = asframe(operands) + result = df.apply(func, axis=1) # , **kwargs) + return result.iat[0] if all_scalars else result + + +def columnwise(func: Callable, operands): + df, all_scalars = asframe(operands) + result = func(df) + return result.iat[0] if all_scalars else result + + +def serieswise(func, operands): + (key, value), *rest = operands.items() + if isinstance(value, pd.Series): + # dealing with a single series object + return func(**operands) + else: + # dealing with a single scalar object + value = pd.Series([value]) + operands = {key: value, **dict(rest)} + return func(**operands).iat[0] + + +def elementwise(func, operands): + value = operands.pop(next(iter(operands))) + if isinstance(value, pd.Series): + # dealing with a single series object + if operands: + return value.apply(func, **operands) + else: + return value.map(func, na_action="ignore") + else: + # dealing with a single scalar object + return func(value, **operands) + + +def agg(func, arg_column, where_column): + if where_column is None: + + def applier(df): + return func(df[arg_column.name]) + else: + + def applier(df): + mask = df[where_column.name] + col = df[arg_column.name][mask] + return func(col) + + return applier + + +class UngroupedFrame: + def __init__(self, df): + self.df = df + + def groups(self): + yield self.df + + def apply_reduction(self, func, **kwargs): + result = func(self.df, **kwargs) + data = [result] * len(self.df) + return pd.Series(data, index=self.df.index) + + def apply_analytic(self, func, **kwargs): + return func(self.df, **kwargs) + + +class GroupedFrame: + def __init__(self, df, group_keys): + self.df = df + self.group_keys = group_keys + self.groupby = df.groupby(group_keys, as_index=True) + + def groups(self): + for _, df in self.groupby: + yield df + + def apply_analytic(self, func, **kwargs): + results = [func(df, **kwargs) for df in self.groups()] + return pd.concat(results) + + def apply_reduction(self, func, **kwargs): + name = gen_name("result") + result = self.groupby.apply(func, **kwargs).rename(name) + df = self.df.merge(result, left_on=self.group_keys, right_index=True) + return df[name] + + +class RowsFrame: + def __init__(self, parent): + self.parent = parent + + @staticmethod + def adjust(length, index, start_offset, end_offset): + if start_offset is None: + start_index = 0 + else: + start_index = index + start_offset + if start_index < 0: + start_index = 0 + elif start_index > length: + start_index = length + + if end_offset is None: + end_index = length + else: + end_index = index + end_offset + 1 + if end_index < 0: + end_index = 0 + elif end_index > length: + end_index = length + + return (start_index, end_index) + + def apply_analytic(self, func, **kwargs): + return self.parent.apply_analytic(func, **kwargs) + + def apply_reduction(self, func, **kwargs): + results = {} + for df in self.parent.groups(): + for i, (ix, row) in enumerate(df.iterrows()): + # TODO(kszucs): use unique column names for _start, _end + start, end = row["__start__"], row["__end__"] + start_index, end_index = self.adjust(len(df), i, start, end) + subdf = df.iloc[start_index:end_index] + results[ix] = func(subdf, **kwargs) + + return pd.Series(results) + + +class RangeFrame: + def 
__init__(self, parent, order_key): + self.parent = parent + self.order_key = order_key + + @staticmethod + def predicate(col, i, start, end): + value = col.iat[i] + if start is None: + return col <= value + end + elif end is None: + return col >= value + start + else: + return (col >= value + start) & (col <= value + end) + + def apply_analytic(self, func, **kwargs): + return self.parent.apply_analytic(func, **kwargs) + + def apply_reduction(self, func, **kwargs): + results = {} + for df in self.parent.groups(): + for i, (ix, row) in enumerate(df.iterrows()): + start, end = row["__start__"], row["__end__"] + column = df[self.order_key] + predicate = self.predicate(column, i, start, end) + subdf = df[predicate] + results[ix] = func(subdf, **kwargs) + + return pd.Series(results) diff --git a/ibis/backends/pandas/kernels.py b/ibis/backends/pandas/kernels.py new file mode 100644 index 000000000000..1e28095c1ee2 --- /dev/null +++ b/ibis/backends/pandas/kernels.py @@ -0,0 +1,513 @@ +from __future__ import annotations + +import decimal +import json +import math +import operator + +try: + import regex as re +except ImportError: + import re +from functools import reduce +from urllib.parse import parse_qs, urlsplit + +import numpy as np +import pandas as pd +import toolz + +import ibis.expr.operations as ops +from ibis.backends.pandas.helpers import ( + columnwise, + elementwise, + generic, + rowwise, + serieswise, +) +from ibis.common.exceptions import OperationNotDefinedError +from ibis.util import any_of + + +def substring_rowwise(row): + arg, start, length = row["arg"], row["start"], row["length"] + if length is None: + return arg[start:] + else: + return arg[start : start + length] + + +def substring_serieswise(arg, start, length): + if length is None: + return arg.str[start:] + else: + return arg.str[start : start + length] + + +def _sql_like_to_regex(pattern, escape): + """Convert a SQL `LIKE` pattern to an equivalent Python regular expression. + + Parameters + ---------- + pattern + A LIKE pattern with the following semantics: + * `%` matches zero or more characters + * `_` matches exactly one character + * To escape `%` and `_` (or to match the `escape` parameter + itself), prefix the desired character with `escape`. + escape + Escape character + + Returns + ------- + str + A regular expression pattern equivalent to the input SQL `LIKE` pattern. + + Examples + -------- + >>> sql_like_to_regex("6%") # default is to not escape anything + '^6.*$' + >>> sql_like_to_regex("6^%", escape="^") + '^6%$' + >>> sql_like_to_regex("6_") + '^6.$' + >>> sql_like_to_regex("6/_", escape="/") + '^6_$' + >>> sql_like_to_regex("%abc") # any string ending with "abc" + '^.*abc$' + >>> sql_like_to_regex("abc%") # any string starting with "abc" + '^abc.*$' + """ + cur_i = 0 + pattern_length = len(pattern) + + while cur_i < pattern_length: + nxt_i = cur_i + 1 + + cur = pattern[cur_i] + nxt = pattern[nxt_i] if nxt_i < pattern_length else None + + skip = 1 + + if nxt is not None and escape is not None and cur == escape: + yield nxt + skip = 2 + elif cur == "%": + yield ".*" + elif cur == "_": + yield "." 
+ else: + yield cur + + cur_i += skip + + +def sql_like_to_regex(pattern, escape=None): + return f"^{''.join(_sql_like_to_regex(pattern, escape))}$" + + +def string_sqllike_serieswise(arg, pattern, escape): + pat = sql_like_to_regex(pattern, escape) + return arg.str.contains(pat, regex=True) + + +def string_sqlilike_serieswise(arg, pattern, escape): + pat = sql_like_to_regex(pattern, escape) + return arg.str.contains(pat, regex=True, flags=re.IGNORECASE) + + +def extract_userinfo_elementwise(x): + url_parts = urlsplit(x) + username = url_parts.username or "" + password = url_parts.password or "" + return f"{username}:{password}" + + +def extract_queryparam_rowwise(row): + query = urlsplit(row["arg"]).query + param_name = row["key"] + if param_name is not None: + value = parse_qs(query)[param_name] + return value if len(value) > 1 else value[0] + else: + return query + + +def array_index_rowwise(row): + try: + return row["arg"][row["index"]] + except IndexError: + return None + + +def array_position_rowwise(row): + try: + return row["arg"].index(row["other"]) + except ValueError: + return -1 + + +def integer_range_rowwise(row): + if not row["step"]: + return [] + return list(np.arange(row["start"], row["stop"], row["step"])) + + +def timestamp_range_rowwise(row): + if not row["step"]: + return [] + return list( + pd.date_range(row["start"], row["stop"], freq=row["step"], inclusive="left") + ) + + +def _safe_method(mapping, method, *args, **kwargs): + if mapping is None or mapping is pd.NA: + return None + try: + method = getattr(mapping, method) + except AttributeError: + return None + else: + result = method(*args, **kwargs) + return None if result is pd.NA else result + + +def safe_len(mapping): + return _safe_method(mapping, "__len__") + + +def safe_get(mapping, key, default=None): + return _safe_method(mapping, "get", key, default) + + +def safe_contains(mapping, key): + return _safe_method(mapping, "__contains__", key) + + +def safe_keys(mapping): + result = _safe_method(mapping, "keys") + if result is None: + return None + # list(...) to unpack iterable + return np.array(list(result)) + + +def safe_values(mapping): + result = _safe_method(mapping, "values") + if result is None or result is pd.NA: + return None + # list(...) 
to unpack iterable + return np.array(list(result), dtype="object") + + +def safe_merge(left, right): + if left is None or left is pd.NA: + return None + elif right is None or right is pd.NA: + return None + else: + return {**left, **right} + + +def safe_json_getitem(value, key): + try: + # try to deserialize the value -> return None if it's None + if (js := json.loads(value)) is None: + return None + except (json.JSONDecodeError, TypeError): + # if there's an error related to decoding or a type error return None + return None + + try: + # try to extract the value as an array element or mapping key + return js[key] + except (KeyError, IndexError, TypeError): + # KeyError: missing mapping key + # IndexError: missing sequence key + # TypeError: `js` doesn't implement __getitem__, either at all or for + # the type of `key` + return None + + +def safe_decimal(func): + def wrapper(x, **kwargs): + try: + return func(x, **kwargs) + except decimal.InvalidOperation: + return decimal.Decimal("NaN") + + return wrapper + + +def round_serieswise(arg, digits): + if digits is None: + return np.round(arg).astype("int64") + else: + return np.round(arg, digits).astype("float64") + + +_generic_impls = { + ops.Abs: abs, + ops.Acos: np.arccos, + ops.Add: operator.add, + ops.And: operator.and_, + ops.Asin: np.arcsin, + ops.Atan: np.arctan, + ops.Atan2: np.arctan2, + ops.BitwiseAnd: lambda x, y: np.bitwise_and(x, y), + ops.BitwiseLeftShift: lambda x, y: np.left_shift(x, y).astype("int64"), + ops.BitwiseNot: np.invert, + ops.BitwiseOr: lambda x, y: np.bitwise_or(x, y), + ops.BitwiseRightShift: lambda x, y: np.right_shift(x, y).astype("int64"), + ops.BitwiseXor: lambda x, y: np.bitwise_xor(x, y), + ops.Ceil: lambda x: np.ceil(x).astype("int64"), + ops.Cos: np.cos, + ops.Cot: lambda x: 1 / np.tan(x), + ops.DateAdd: operator.add, + ops.DateDiff: operator.sub, + ops.DateSub: operator.sub, + ops.Degrees: np.degrees, + ops.Divide: operator.truediv, + ops.Equals: operator.eq, + ops.Exp: np.exp, + ops.Floor: lambda x: np.floor(x).astype("int64"), + ops.FloorDivide: operator.floordiv, + ops.Greater: operator.gt, + ops.GreaterEqual: operator.ge, + ops.IdenticalTo: lambda x, y: (x == y) | (pd.isnull(x) & pd.isnull(y)), + ops.IntervalAdd: operator.add, + ops.IntervalFloorDivide: operator.floordiv, + ops.IntervalMultiply: operator.mul, + ops.IntervalSubtract: operator.sub, + ops.IsInf: np.isinf, + ops.IsNull: pd.isnull, + ops.Less: operator.lt, + ops.LessEqual: operator.le, + ops.Ln: np.log, + ops.Log10: np.log10, + ops.Log2: np.log2, + ops.Modulus: operator.mod, + ops.Multiply: operator.mul, + ops.Negate: lambda x: not x if isinstance(x, (bool, np.bool_)) else -x, + ops.Not: lambda x: not x if isinstance(x, (bool, np.bool_)) else ~x, + ops.NotEquals: operator.ne, + ops.NotNull: pd.notnull, + ops.Or: operator.or_, + ops.Power: operator.pow, + ops.Radians: np.radians, + ops.Sign: np.sign, + ops.Sin: np.sin, + ops.Sqrt: np.sqrt, + ops.Subtract: operator.sub, + ops.Tan: np.tan, + ops.TimestampAdd: operator.add, + ops.TimestampDiff: operator.sub, + ops.TimestampSub: operator.sub, + ops.Xor: operator.xor, + ops.E: lambda: np.e, + ops.Pi: lambda: np.pi, + ops.TimestampNow: lambda: pd.Timestamp("now", tz="UTC").tz_localize(None), + ops.StringConcat: lambda xs: reduce(operator.add, xs), + ops.StringJoin: lambda sep, xs: reduce(lambda x, y: x + sep + y, xs), + ops.Log: lambda x, base: np.log(x) if base is None else np.log(x) / np.log(base), +} + +_columnwise_impls = { + ops.Clip: lambda df: df["arg"].clip(lower=df["lower"], 
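+        # the clip bounds come from aligned columns of the same pre-assembled
+        # frame, which is why Clip is implemented as a columnwise kernel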
upper=df["upper"]), + ops.IfElse: lambda df: df["true_expr"].where( + df["bool_expr"], other=df["false_null_expr"] + ), + ops.NullIf: lambda df: df["arg"].where(df["arg"] != df["null_if_expr"]), + ops.Repeat: lambda df: df["arg"] * df["times"], +} + +_rowwise_impls = { + ops.ArrayContains: lambda row: row["other"] in row["arg"], + ops.ArrayIndex: array_index_rowwise, + ops.ArrayPosition: array_position_rowwise, + ops.ArrayRemove: lambda row: [x for x in row["arg"] if x != row["other"]], + ops.ArrayRepeat: lambda row: np.tile(row["arg"], max(0, row["times"])), + ops.ArraySlice: lambda row: row["arg"][row["start"] : row["stop"]], + ops.ArrayUnion: lambda row: toolz.unique(row["left"] + row["right"]), + ops.EndsWith: lambda row: row["arg"].endswith(row["end"]), + ops.IntegerRange: integer_range_rowwise, + ops.JSONGetItem: lambda row: safe_json_getitem(row["arg"], row["index"]), + ops.Map: lambda row: dict(zip(row["keys"], row["values"])), + ops.MapGet: lambda row: safe_get(row["arg"], row["key"], row["default"]), + ops.MapContains: lambda row: safe_contains(row["arg"], row["key"]), + ops.MapMerge: lambda row: safe_merge(row["left"], row["right"]), + ops.TimestampRange: timestamp_range_rowwise, + ops.LPad: lambda row: row["arg"].rjust(row["length"], row["pad"]), + ops.RegexExtract: lambda row: re.search(row["pattern"], row["arg"]).group( + row["index"] + ), + ops.RegexReplace: lambda row: re.sub( + row["pattern"], row["replacement"], row["arg"] + ), + ops.RegexSearch: lambda row: re.search(row["pattern"], row["arg"]) is not None, + ops.RPad: lambda row: row["arg"].ljust(row["length"], row["pad"]), + ops.StartsWith: lambda row: row["arg"].startswith(row["start"]), + ops.StringContains: lambda row: row["haystack"].contains(row["needle"]), + ops.StringFind: lambda row: row["arg"].find( + row["substr"], row["start"], row["end"] + ), + ops.StringReplace: lambda row: row["arg"].replace( + row["pattern"], row["replacement"] + ), + ops.StringSplit: lambda row: row["arg"].split(row["delimiter"]), + ops.StrRight: lambda row: row["arg"][-row["nchars"] :], + ops.Translate: lambda row: row["arg"].translate( + str.maketrans(row["from_str"], row["to_str"]) + ), + ops.Substring: substring_rowwise, + ops.ExtractQuery: extract_queryparam_rowwise, + ops.Strftime: lambda row: row["arg"].strftime(row["format_str"]), +} + +_serieswise_impls = { + ops.Between: lambda arg, lower_bound, upper_bound: arg.between( + lower_bound, upper_bound + ), + ops.Capitalize: lambda arg: arg.str.capitalize(), + ops.Date: lambda arg: arg.dt.floor("d"), + ops.DayOfWeekIndex: lambda arg: pd.to_datetime(arg).dt.dayofweek, + ops.DayOfWeekName: lambda arg: pd.to_datetime(arg).dt.day_name(), + ops.EndsWith: lambda arg, end: arg.str.endswith(end), + ops.ExtractDay: lambda arg: arg.dt.day, + ops.ExtractDayOfYear: lambda arg: arg.dt.dayofyear, + ops.ExtractEpochSeconds: lambda arg: arg.astype("datetime64[s]") + .astype("int64") + .astype("int32"), + ops.ExtractHour: lambda arg: arg.dt.hour, + ops.ExtractMicrosecond: lambda arg: arg.dt.microsecond, + ops.ExtractMillisecond: lambda arg: arg.dt.microsecond // 1000, + ops.ExtractMinute: lambda arg: arg.dt.minute, + ops.ExtractMonth: lambda arg: arg.dt.month, + ops.ExtractQuarter: lambda arg: arg.dt.quarter, + ops.ExtractSecond: lambda arg: arg.dt.second, + ops.ExtractWeekOfYear: lambda arg: arg.dt.isocalendar().week.astype("int32"), + ops.ExtractYear: lambda arg: arg.dt.year, + ops.Lowercase: lambda arg: arg.str.lower(), + ops.LPad: lambda arg, length, pad: arg.str.rjust(length, fillchar=pad), + 
ops.LStrip: lambda arg: arg.str.lstrip(), + ops.Repeat: lambda arg, times: arg.str.repeat(times), + ops.Reverse: lambda arg: arg.str[::-1], + ops.Round: round_serieswise, + ops.RPad: lambda arg, length, pad: arg.str.ljust(length, fillchar=pad), + ops.RStrip: lambda arg: arg.str.rstrip(), + ops.StartsWith: lambda arg, start: arg.str.startswith(start), + ops.StringAscii: lambda arg: arg.map(ord, na_action="ignore").astype("int32"), + ops.StringContains: lambda haystack, needle: haystack.str.contains( + needle, regex=False + ), + ops.StringFind: lambda arg, substr, start, end: arg.str.find(substr, start, end), + ops.StringLength: lambda arg: arg.str.len().astype("int32"), + ops.StringReplace: lambda arg, pattern, replacement: arg.str.replace( + pattern, replacement + ), + ops.StringSplit: lambda arg, delimiter: arg.str.split(delimiter), + ops.StringSQLLike: string_sqllike_serieswise, + ops.StringSQLILike: string_sqlilike_serieswise, + ops.Strip: lambda arg: arg.str.strip(), + ops.Strftime: lambda arg, format_str: arg.dt.strftime(format_str), + ops.StrRight: lambda arg, nchars: arg.str[-nchars:], + ops.Substring: substring_serieswise, + ops.Time: lambda arg: arg.dt.time, + ops.TimestampFromUNIX: lambda arg, unit: pd.to_datetime(arg, unit=unit.short), + ops.Translate: lambda arg, from_str, to_str: arg.str.translate( + str.maketrans(from_str, to_str) + ), + ops.Uppercase: lambda arg: arg.str.upper(), +} + +_elementwise_impls = { + ops.ExtractProtocol: lambda x: getattr(urlsplit(x), "scheme", ""), + ops.ExtractAuthority: lambda x: getattr(urlsplit(x), "netloc", ""), + ops.ExtractPath: lambda x: getattr(urlsplit(x), "path", ""), + ops.ExtractFragment: lambda x: getattr(urlsplit(x), "fragment", ""), + ops.ExtractHost: lambda x: getattr(urlsplit(x), "hostname", ""), + ops.ExtractUserInfo: extract_userinfo_elementwise, + ops.StructField: lambda x, field: safe_get(x, field), + ops.ArrayLength: len, + ops.ArrayFlatten: toolz.concat, + ops.ArraySort: sorted, + ops.ArrayDistinct: toolz.unique, + ops.MapLength: safe_len, + ops.MapKeys: safe_keys, + ops.MapValues: safe_values, +} + + +_elementwise_decimal_impls = { + ops.Round: lambda x, digits=0: round(x, digits), + ops.Log10: safe_decimal(lambda x: x.log10()), + ops.Ln: safe_decimal(lambda x: x.ln()), + ops.Exp: safe_decimal(lambda x: x.exp()), + ops.Floor: safe_decimal(math.floor), + ops.Ceil: safe_decimal(math.ceil), + ops.Sqrt: safe_decimal(lambda x: x.sqrt()), + ops.Log2: safe_decimal(lambda x: x.ln() / decimal.Decimal(2).ln()), + ops.Sign: safe_decimal(lambda x: math.copysign(1, x)), + ops.Log: safe_decimal(lambda x, base: x.ln() / decimal.Decimal(base).ln()), +} + + +def pick_kernel(op, operands): + typ = type(op) + + # decimal operations have special implementations + if op.dtype.is_decimal(): + func = _elementwise_decimal_impls[typ] + return elementwise(func, operands) + + # prefer generic implementations if available + if func := _generic_impls.get(typ): + return generic(func, operands) + + first, *rest = operands.values() + is_multi_arg = bool(rest) + is_multi_column = any_of(rest, pd.Series) + + if is_multi_column: + if func := _columnwise_impls.get(typ): + return columnwise(func, operands) + elif func := _rowwise_impls.get(typ): + return rowwise(func, operands) + else: + raise OperationNotDefinedError( + "No columnwise or rowwise implementation found for " + f"multi-column operation {typ}" + ) + elif is_multi_arg: + if func := _columnwise_impls.get(typ): + return columnwise(func, operands) + elif func := _serieswise_impls.get(typ): + return 
serieswise(func, operands) + elif func := _rowwise_impls.get(typ): + return rowwise(func, operands) + elif func := _elementwise_impls.get(typ): + return elementwise(func, operands) + else: + raise OperationNotDefinedError( + "No columnwise, serieswise, rowwise or elementwise " + f"implementation found for multi-argument operation {typ}" + ) + else: # noqa: PLR5501 + if func := _serieswise_impls.get(typ): + return serieswise(func, operands) + elif func := _elementwise_impls.get(typ): + return elementwise(func, operands) + else: + raise OperationNotDefinedError( + "No serieswise or elementwise implementation found for " + f"single-argument operation {typ}" + ) + + +supported_operations = ( + _generic_impls.keys() + | _columnwise_impls.keys() + | _rowwise_impls.keys() + | _serieswise_impls.keys() + | _elementwise_impls.keys() +) diff --git a/ibis/backends/pandas/rewrites.py b/ibis/backends/pandas/rewrites.py new file mode 100644 index 000000000000..7419f92d498d --- /dev/null +++ b/ibis/backends/pandas/rewrites.py @@ -0,0 +1,322 @@ +from __future__ import annotations + +from public import public + +import ibis +import ibis.expr.datashape as ds +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops +from ibis.common.annotations import attribute +from ibis.common.collections import FrozenDict +from ibis.common.patterns import replace +from ibis.common.typing import VarTuple # noqa: TCH001 +from ibis.expr.schema import Schema +from ibis.util import gen_name + + +class PandasRelation(ops.Relation): + pass + + +class PandasValue(ops.Value): + pass + + +@public +class PandasRename(PandasRelation): + parent: ops.Relation + mapping: FrozenDict[str, str] + + @classmethod + def from_prefix(cls, parent, prefix): + mapping = {k: f"{prefix}_{k}" for k in parent.schema} + return cls(parent, mapping) + + @attribute + def values(self): + return FrozenDict( + {to: ops.Field(self.parent, from_) for from_, to in self.mapping.items()} + ) + + @attribute + def schema(self): + return Schema( + {self.mapping[name]: dtype for name, dtype in self.parent.schema.items()} + ) + + +@public +class PandasJoin(PandasRelation): + left: ops.Relation + right: ops.Relation + left_on: VarTuple[ops.Value] + right_on: VarTuple[ops.Value] + how: str + + @attribute + def values(self): + return FrozenDict({**self.left.values, **self.right.values}) + + @attribute + def schema(self): + return self.left.schema | self.right.schema + + +@public +class PandasAsofJoin(PandasJoin): + left_by: VarTuple[ops.Value] + right_by: VarTuple[ops.Value] + operator: type + + +@public +class PandasAggregate(PandasRelation): + parent: ops.Relation + groups: FrozenDict[str, ops.Field] + metrics: FrozenDict[str, ops.Reduction] + + @attribute + def values(self): + return FrozenDict({**self.groups, **self.metrics}) + + @attribute + def schema(self): + return Schema({k: v.dtype for k, v in self.values.items()}) + + +@public +class PandasLimit(PandasRelation): + parent: ops.Relation + n: ops.Relation + offset: ops.Relation + + @attribute + def values(self): + return self.parent.values + + @attribute + def schema(self): + return self.parent.schema + + +@public +class PandasScalarSubquery(PandasValue): + # variant with no integrity checks + rel: ops.Relation + + shape = ds.scalar + + @attribute + def dtype(self): + return self.rel.schema.types[0] + + +def is_columnar(node): + return isinstance(node, ops.Value) and node.shape.is_columnar() + + +@replace(ops.Project) +def rewrite_project(_, **kwargs): + winfuncs = [] + for v in _.values.values(): + 
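+        # collect the window functions referenced by this projected value
+        # (the ops.Value filter keeps the search within value expressions)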
winfuncs.extend(v.find(ops.WindowFunction, ops.Value)) + + if not winfuncs: + return _ + + selects = {ops.Field(_.parent, k): k for k in _.parent.schema} + for node in winfuncs: + # add computed values from the window function + values = list(node.func.__args__) + # add computed values from the window frame + values += node.frame.group_by + values += [key.expr for key in node.frame.order_by] + if node.frame.start is not None: + values.append(node.frame.start.value) + if node.frame.end is not None: + values.append(node.frame.end.value) + + for v in values: + if is_columnar(v) and v not in selects: + selects[v] = gen_name("value") + + # STEP 1: construct the pre-projection + proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) + subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} + + # STEP 2: construct new window function nodes + metrics = {} + for node in winfuncs: + frame = node.frame + start = None if frame.start is None else frame.start.replace(subs) + end = None if frame.end is None else frame.end.replace(subs) + order_by = [key.replace(subs) for key in frame.order_by] + group_by = [key.replace(subs) for key in frame.group_by] + frame = frame.__class__( + proj, start=start, end=end, group_by=group_by, order_by=order_by + ) + metrics[node] = ops.WindowFunction(node.func.replace(subs), frame) + + # STEP 3: reconstruct the current projection with the window functions + subs.update(metrics) + values = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} + return ops.Project(proj, values) + + +@replace(ops.Aggregate) +def rewrite_aggregate(_, **kwargs): + selects = {ops.Field(_.parent, k): k for k in _.parent.schema} + for v in _.groups.values(): + if v not in selects: + selects[v] = gen_name("group") + + reductions = {} + for v in _.metrics.values(): + for reduction in v.find_topmost(ops.Reduction): + for arg in reduction.__args__: + if is_columnar(arg) and arg not in selects: + selects[arg] = gen_name("value") + if reduction not in reductions: + reductions[reduction] = gen_name("reduction") + + # STEP 1: construct the pre-projection + proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) + + # STEP 2: construct the pandas aggregation + subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} + groups = {name: ops.Field(proj, selects[node]) for name, node in _.groups.items()} + metrics = {name: node.replace(subs) for node, name in reductions.items()} + agg = PandasAggregate(proj, groups, metrics) + + # STEP 3: construct the post-projection + subs = {node: ops.Field(agg, name) for node, name in reductions.items()} + values = {name: ops.Field(agg, name) for name, node in _.groups.items()} + values.update({name: node.replace(subs) for name, node in _.metrics.items()}) + return ops.Project(agg, values) + + +def split_join_predicates(left, right, predicates, only_equality=True): + left_on = [] + right_on = [] + for pred in predicates: + if left not in pred.relations or right not in pred.relations: + # not a usual join predicate, so apply a trick by placing the + # predicate to the left side and adding a literal True to the right + # which the left side must be equal to + left_on.append(pred) + right_on.append(ops.Literal(True, dtype=dt.boolean)) + elif isinstance(pred, ops.Binary): + if only_equality and not isinstance(pred, ops.Equals): + raise TypeError("Only equality join predicates supported with pandas") + if left in pred.left.relations and right in pred.right.relations: + left_on.append(pred.left) + 
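+                # operands already line up as (left table, right table);
+                # the elif branch below handles the swapped orientation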
right_on.append(pred.right) + elif left in pred.right.relations and right in pred.left.relations: + left_on.append(pred.right) + right_on.append(pred.left) + else: + raise ValueError("Join predicate does not reference both tables") + else: + raise TypeError(f"Unsupported join predicate {pred}") + + return left_on, right_on + + +@replace(ops.JoinChain) +def rewrite_join(_, **kwargs): + prefixes = {} + prefixes[_.first] = prefix = str(len(prefixes)) + left = PandasRename.from_prefix(_.first, prefix) + + for link in _.rest: + prefixes[link.table] = prefix = str(len(prefixes)) + right = PandasRename.from_prefix(link.table, prefix) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + subs.update({v: ops.Field(right, k) for k, v in right.values.items()}) + preds = [pred.replace(subs, filter=ops.Value) for pred in link.predicates] + + # separate ASOF from the rest of the joins + if link.how == "asof": + on, *by = preds + left_on, right_on = split_join_predicates( + left, right, [on], only_equality=False + ) + left_by, right_by = split_join_predicates(left, right, by) + left = PandasAsofJoin( + how="asof", + left=left, + right=right, + left_on=left_on, + right_on=right_on, + left_by=left_by, + right_by=right_by, + operator=type(on), + ) + else: + # need to replace the fields in the predicates + left_on, right_on = split_join_predicates(left, right, preds) + left = PandasJoin( + how=link.how, + left=left, + right=right, + left_on=left_on, + right_on=right_on, + ) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + fields = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} + return ops.Project(left, fields) + + +@replace(ops.Limit) +def rewrite_limit(_, **kwargs): + if isinstance(_.n, ops.Value): + n = _.n.to_expr() + else: + n = ibis.literal(_.n) + + if isinstance(_.offset, ops.Value): + offset = _.offset.to_expr() + else: + offset = ibis.literal(_.offset) + + n = n.as_table().op() + if isinstance(n, ops.Aggregate): + n = rewrite_aggregate.match(n, context={}) + + offset = offset.as_table().op() + if isinstance(offset, ops.Aggregate): + offset = rewrite_aggregate.match(offset, context={}) + + return PandasLimit(_.parent, n, offset) + + +@replace(ops.ScalarSubquery) +def rewrite_scalar_subquery(_, **kwargs): + return PandasScalarSubquery(_.rel) + + +@replace(ops.ScalarParameter) +def replace_parameter(_, params, **kwargs): + return ops.Literal(value=params[_], dtype=_.dtype) + + +@replace(ops.UnboundTable) +def bind_unbound_table(_, backend, **kwargs): + return ops.DatabaseTable(name=_.name, schema=_.schema, source=backend) + + +def plan(node, backend, params): + ctx = {"params": params, "backend": backend} + node = node.replace(rewrite_scalar_subquery) + node = node.replace( + rewrite_project + | rewrite_aggregate + | rewrite_join + | rewrite_limit + | replace_parameter + | bind_unbound_table, + context=ctx, + ) + return node diff --git a/ibis/backends/pandas/tests/conftest.py b/ibis/backends/pandas/tests/conftest.py index 8aa998871d2a..41fcc924ed2c 100644 --- a/ibis/backends/pandas/tests/conftest.py +++ b/ibis/backends/pandas/tests/conftest.py @@ -1,9 +1,16 @@ from __future__ import annotations +import decimal from typing import Any +import numpy as np +import pandas as pd +import pytest + import ibis +import ibis.expr.datatypes as dt from ibis.backends.conftest import TEST_TABLES +from ibis.backends.pandas import Backend from ibis.backends.tests.base import BackendTest from ibis.backends.tests.data import array_types, json_types, struct_types, 
win @@ -32,3 +39,282 @@ def _load_data(self, **_: Any) -> None: @staticmethod def connect(*, tmpdir, worker_id, **kw): return ibis.pandas.connect(**kw) + + +@pytest.fixture(scope="module") +def df(): + return pd.DataFrame( + { + "plain_int64": list(range(1, 4)), + "plain_strings": list("abc"), + "plain_float64": [4.0, 5.0, 6.0], + "plain_datetimes_naive": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ), + "plain_datetimes_ny": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).dt.tz_localize("America/New_York"), + "plain_datetimes_utc": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).dt.tz_localize("UTC"), + "plain_uint64": pd.Series(range(1, 4), dtype=np.dtype("uint64")), + "dup_strings": list("dad"), + "dup_ints": [1, 2, 1], + "float64_as_strings": ["100.01", "234.23", "-999.34"], + "int64_as_strings": list(map(str, range(1, 4))), + "strings_with_space": [" ", "abab", "ddeeffgg"], + "translate_from_strings": ["rmz", "abc", "ghj"], + "translate_to_strings": ["lns", "ovk", "jfr"], + "int64_with_zeros": [0, 1, 0], + "float64_with_zeros": [1.0, 0.0, 1.0], + "float64_positive": [1.0, 2.0, 1.0], + "strings_with_nulls": ["a", None, "b"], + "datetime_strings_naive": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ).astype(str), + "datetime_strings_ny": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ) + .dt.tz_localize("America/New_York") + .astype(str), + "datetime_strings_utc": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values + ) + .dt.tz_localize("UTC") + .astype(str), + "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), + "array_of_float64": [ + np.array([1.0, 2.0], dtype="float64"), + np.array([3.0], dtype="float64"), + np.array([], dtype="float64"), + ], + "array_of_int64": [ + np.array([1, 2], dtype="int64"), + np.array([], dtype="int64"), + np.array([3], dtype="int64"), + ], + "array_of_strings": [ + np.array(["a", "b"], dtype="object"), + np.array([], dtype="object"), + np.array(["c"], dtype="object"), + ], + "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], + "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], + "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], + } + ) + + +@pytest.fixture(scope="module") +def batting_df(data_dir): + num_rows = 1000 + start_index = 30 + df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ + start_index : start_index + num_rows + ] + return df.reset_index(drop=True) + + +@pytest.fixture(scope="module") +def awards_players_df(data_dir): + return pd.read_parquet(data_dir / "parquet" / "awards_players.parquet") + + +@pytest.fixture(scope="module") +def df1(): + return pd.DataFrame( + {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} + ) + + +@pytest.fixture(scope="module") +def df2(): + return pd.DataFrame( + {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} + ) + + +@pytest.fixture(scope="module") +def intersect_df2(): + return pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) + + +@pytest.fixture(scope="module") +def time_df1(): + return pd.DataFrame( + {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} + ) + + +@pytest.fixture(scope="module") +def time_df2(): + return pd.DataFrame({"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]}) + + +@pytest.fixture(scope="module") +def time_df3(): + return pd.DataFrame( + { + "time": pd.Series( 
+ pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values + ), + "id": list(range(1, 5)) * 2, + "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], + } + ) + + +@pytest.fixture(scope="module") +def time_keyed_df1(): + return pd.DataFrame( + { + "time": pd.Series( + pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values + ), + "key": [1, 2, 3, 1, 2, 3], + "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], + } + ) + + +@pytest.fixture(scope="module") +def time_keyed_df2(): + return pd.DataFrame( + { + "time": pd.Series( + pd.date_range( + start="2017-01-02 01:02:03.234", freq="3D", periods=3 + ).values + ), + "key": [1, 2, 3], + "other_value": [1.1, 1.2, 2.2], + } + ) + + +@pytest.fixture(scope="module") +def client( + df, + df1, + df2, + df3, + time_df1, + time_df2, + time_df3, + time_keyed_df1, + time_keyed_df2, + intersect_df2, +): + return Backend().connect( + { + "df": df, + "df1": df1, + "df2": df2, + "df3": df3, + "left": df1, + "right": df2, + "time_df1": time_df1, + "time_df2": time_df2, + "time_df3": time_df3, + "time_keyed_df1": time_keyed_df1, + "time_keyed_df2": time_keyed_df2, + "intersect_df2": intersect_df2, + } + ) + + +@pytest.fixture(scope="module") +def df3(): + return pd.DataFrame( + { + "key": list("ac"), + "other_value": [4.0, 6.0], + "key2": list("ae"), + "key3": list("fe"), + } + ) + + +t_schema = { + "decimal": dt.Decimal(4, 3), + "array_of_float64": dt.Array(dt.double), + "array_of_int64": dt.Array(dt.int64), + "array_of_strings": dt.Array(dt.string), + "map_of_strings_integers": dt.Map(dt.string, dt.int64), + "map_of_integers_strings": dt.Map(dt.int64, dt.string), + "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), +} + + +@pytest.fixture(scope="module") +def t(client): + return client.table("df", schema=t_schema) + + +@pytest.fixture(scope="module") +def lahman(batting_df, awards_players_df): + return Backend().connect( + {"batting": batting_df, "awards_players": awards_players_df} + ) + + +@pytest.fixture(scope="module") +def left(client): + return client.table("left") + + +@pytest.fixture(scope="module") +def right(client): + return client.table("right") + + +@pytest.fixture(scope="module") +def time_left(client): + return client.table("time_df1") + + +@pytest.fixture(scope="module") +def time_right(client): + return client.table("time_df2") + + +@pytest.fixture(scope="module") +def time_table(client): + return client.table("time_df3") + + +@pytest.fixture(scope="module") +def time_keyed_left(client): + return client.table("time_keyed_df1") + + +@pytest.fixture(scope="module") +def time_keyed_right(client): + return client.table("time_keyed_df2") + + +@pytest.fixture(scope="module") +def batting(lahman): + return lahman.table("batting") + + +@pytest.fixture(scope="module") +def sel_cols(batting): + cols = batting.columns + start, end = cols.index("AB"), cols.index("H") + 1 + return ["playerID", "yearID", "teamID", "G"] + cols[start:end] + + +@pytest.fixture(scope="module") +def players_base(batting, sel_cols): + return batting[sel_cols].order_by(sel_cols[:3]) + + +@pytest.fixture(scope="module") +def players(players_base): + return players_base.group_by("playerID") + + +@pytest.fixture(scope="module") +def players_df(players_base): + return players_base.execute().reset_index(drop=True) diff --git a/ibis/backends/pandas/tests/execution/__init__.py b/ibis/backends/pandas/tests/execution/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/ibis/backends/pandas/tests/execution/conftest.py 
b/ibis/backends/pandas/tests/execution/conftest.py deleted file mode 100644 index 32d5efad67d2..000000000000 --- a/ibis/backends/pandas/tests/execution/conftest.py +++ /dev/null @@ -1,289 +0,0 @@ -from __future__ import annotations - -import decimal - -import numpy as np -import pandas as pd -import pytest - -import ibis.expr.datatypes as dt -from ibis.backends.pandas import Backend - - -@pytest.fixture(scope="module") -def df(): - return pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "plain_float64": [4.0, 5.0, 6.0], - "plain_datetimes_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ), - "plain_datetimes_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("America/New_York"), - "plain_datetimes_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("UTC"), - "plain_uint64": pd.Series(range(1, 4), dtype=np.dtype("uint64")), - "dup_strings": list("dad"), - "dup_ints": [1, 2, 1], - "float64_as_strings": ["100.01", "234.23", "-999.34"], - "int64_as_strings": list(map(str, range(1, 4))), - "strings_with_space": [" ", "abab", "ddeeffgg"], - "translate_from_strings": ["rmz", "abc", "ghj"], - "translate_to_strings": ["lns", "ovk", "jfr"], - "int64_with_zeros": [0, 1, 0], - "float64_with_zeros": [1.0, 0.0, 1.0], - "float64_positive": [1.0, 2.0, 1.0], - "strings_with_nulls": ["a", None, "b"], - "datetime_strings_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).astype(str), - "datetime_strings_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("America/New_York") - .astype(str), - "datetime_strings_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("UTC") - .astype(str), - "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), - "array_of_float64": [ - np.array([1.0, 2.0], dtype="float64"), - np.array([3.0], dtype="float64"), - np.array([], dtype="float64"), - ], - "array_of_int64": [ - np.array([1, 2], dtype="int64"), - np.array([], dtype="int64"), - np.array([3], dtype="int64"), - ], - "array_of_strings": [ - np.array(["a", "b"], dtype="object"), - np.array([], dtype="object"), - np.array(["c"], dtype="object"), - ], - "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], - "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], - "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], - } - ) - - -@pytest.fixture(scope="module") -def batting_df(data_dir): - num_rows = 1000 - start_index = 30 - df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ - start_index : start_index + num_rows - ] - return df.reset_index(drop=True) - - -@pytest.fixture(scope="module") -def awards_players_df(data_dir): - return pd.read_parquet(data_dir / "parquet" / "awards_players.parquet") - - -@pytest.fixture(scope="module") -def df1(): - return pd.DataFrame( - {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} - ) - - -@pytest.fixture(scope="module") -def df2(): - return pd.DataFrame( - {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} - ) - - -@pytest.fixture(scope="module") -def intersect_df2(): - return pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) - - -@pytest.fixture(scope="module") -def time_df1(): - return pd.DataFrame( - {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} - ) - - 
-@pytest.fixture(scope="module") -def time_df2(): - return pd.DataFrame({"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]}) - - -@pytest.fixture(scope="module") -def time_df3(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values - ), - "id": list(range(1, 5)) * 2, - "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df1(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values - ), - "key": [1, 2, 3, 1, 2, 3], - "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df2(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range( - start="2017-01-02 01:02:03.234", freq="3D", periods=3 - ).values - ), - "key": [1, 2, 3], - "other_value": [1.1, 1.2, 2.2], - } - ) - - -@pytest.fixture(scope="module") -def client( - df, - df1, - df2, - df3, - time_df1, - time_df2, - time_df3, - time_keyed_df1, - time_keyed_df2, - intersect_df2, -): - return Backend().connect( - { - "df": df, - "df1": df1, - "df2": df2, - "df3": df3, - "left": df1, - "right": df2, - "time_df1": time_df1, - "time_df2": time_df2, - "time_df3": time_df3, - "time_keyed_df1": time_keyed_df1, - "time_keyed_df2": time_keyed_df2, - "intersect_df2": intersect_df2, - } - ) - - -@pytest.fixture(scope="module") -def df3(): - return pd.DataFrame( - { - "key": list("ac"), - "other_value": [4.0, 6.0], - "key2": list("ae"), - "key3": list("fe"), - } - ) - - -t_schema = { - "decimal": dt.Decimal(4, 3), - "array_of_float64": dt.Array(dt.double), - "array_of_int64": dt.Array(dt.int64), - "array_of_strings": dt.Array(dt.string), - "map_of_strings_integers": dt.Map(dt.string, dt.int64), - "map_of_integers_strings": dt.Map(dt.int64, dt.string), - "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), -} - - -@pytest.fixture(scope="module") -def t(client): - return client.table("df", schema=t_schema) - - -@pytest.fixture(scope="module") -def lahman(batting_df, awards_players_df): - return Backend().connect( - {"batting": batting_df, "awards_players": awards_players_df} - ) - - -@pytest.fixture(scope="module") -def left(client): - return client.table("left") - - -@pytest.fixture(scope="module") -def right(client): - return client.table("right") - - -@pytest.fixture(scope="module") -def time_left(client): - return client.table("time_df1") - - -@pytest.fixture(scope="module") -def time_right(client): - return client.table("time_df2") - - -@pytest.fixture(scope="module") -def time_table(client): - return client.table("time_df3") - - -@pytest.fixture(scope="module") -def time_keyed_left(client): - return client.table("time_keyed_df1") - - -@pytest.fixture(scope="module") -def time_keyed_right(client): - return client.table("time_keyed_df2") - - -@pytest.fixture(scope="module") -def batting(lahman): - return lahman.table("batting") - - -@pytest.fixture(scope="module") -def sel_cols(batting): - cols = batting.columns - start, end = cols.index("AB"), cols.index("H") + 1 - return ["playerID", "yearID", "teamID", "G"] + cols[start:end] - - -@pytest.fixture(scope="module") -def players_base(batting, sel_cols): - return batting[sel_cols].order_by(sel_cols[:3]) - - -@pytest.fixture(scope="module") -def players(players_base): - return players_base.group_by("playerID") - - -@pytest.fixture(scope="module") -def players_df(players_base): - return players_base.execute().reset_index(drop=True) diff --git 
a/ibis/backends/pandas/tests/execution/test_timecontext.py b/ibis/backends/pandas/tests/execution/test_timecontext.py deleted file mode 100644 index 5a96cf33888f..000000000000 --- a/ibis/backends/pandas/tests/execution/test_timecontext.py +++ /dev/null @@ -1,399 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from packaging.version import parse as vparse - -import ibis -import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope -from ibis.backends.base.df.timecontext import ( - TimeContext, - TimeContextRelation, - adjust_context, - compare_timecontext, - construct_time_context_aware_series, -) -from ibis.backends.pandas.execution import execute -from ibis.backends.pandas.execution.window import trim_window_result -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -class CustomAsOfJoin(ops.AsOfJoin): - pass - - -def test_execute_with_timecontext(time_table): - expr = time_table - # define a time context for time-series data - context = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - - # without time context, execute produces every row - df_all = expr.execute() - assert len(df_all["time"]) == 8 - - # with context set, execute produces only rows within context - df_within_context = expr.execute(timecontext=context) - assert len(df_within_context["time"]) == 1 - - -def test_bad_timecontext(time_table, t): - expr = time_table - - # define context with illegal string - with pytest.raises(com.IbisError, match=r".*type pd.Timestamp.*"): - context = ("bad", "context") - expr.execute(timecontext=context) - - # define context with unsupported type int - with pytest.raises(com.IbisError, match=r".*type pd.Timestamp.*"): - context = (20091010, 20100101) - expr.execute(timecontext=context) - - # define context with too few values - with pytest.raises(com.IbisError, match=r".*should specify.*"): - context = pd.Timestamp("20101010") - expr.execute(timecontext=context) - - # define context with begin value later than end - with pytest.raises(com.IbisError, match=r".*before or equal.*"): - context = (pd.Timestamp("20101010"), pd.Timestamp("20090101")) - expr.execute(timecontext=context) - - # execute context with a table without TIME_COL - with pytest.raises(com.IbisError, match=r".*must have a time column.*"): - context = (pd.Timestamp("20090101"), pd.Timestamp("20100101")) - t.execute(timecontext=context) - - -def test_bad_call_to_adjust_context(): - op = "not_a_node" - context = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - scope = Scope() - with pytest.raises( - com.IbisError, match=r".*Unsupported input type for adjust context.*" - ): - adjust_context(op, scope, context) - - -def test_compare_timecontext(): - c1 = (pd.Timestamp("20170101"), pd.Timestamp("20170103")) - c2 = (pd.Timestamp("20170101"), pd.Timestamp("20170111")) - c3 = (pd.Timestamp("20160101"), pd.Timestamp("20160103")) - c4 = (pd.Timestamp("20161215"), pd.Timestamp("20170102")) - assert compare_timecontext(c1, c2) == TimeContextRelation.SUBSET - assert compare_timecontext(c2, c1) == TimeContextRelation.SUPERSET - assert compare_timecontext(c1, c4) == TimeContextRelation.OVERLAP - assert compare_timecontext(c1, c3) == TimeContextRelation.NONOVERLAP - - -def test_context_adjustment_asof_join( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - expr = time_keyed_left.asof_join( - time_keyed_right, "time", by="key", tolerance=4 * ibis.interval(days=1) - )[time_keyed_left, 
time_keyed_right.other_value] - context = (pd.Timestamp("20170105"), pd.Timestamp("20170111")) - result = expr.execute(timecontext=context) - - # compare with asof_join of manually trimmed tables - trimmed_df1 = time_keyed_df1[time_keyed_df1["time"] >= context[0]][ - time_keyed_df1["time"] < context[1] - ] - trimmed_df2 = time_keyed_df2[ - time_keyed_df2["time"] >= context[0] - pd.Timedelta(days=4) - ][time_keyed_df2["time"] < context[1]] - expected = pd.merge_asof( - trimmed_df1, - trimmed_df2, - on="time", - by="key", - tolerance=pd.Timedelta("4D"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ["interval_ibis", "interval_pd"], - [ - (ibis.interval(days=1), "1d"), - (3 * ibis.interval(days=1), "3d"), - (5 * ibis.interval(days=1), "5d"), - ], -) -def test_context_adjustment_window(time_table, time_df3, interval_ibis, interval_pd): - # trim data manually - expected = ( - time_df3.set_index("time").value.rolling(interval_pd, closed="both").mean() - ) - expected = expected[expected.index >= pd.Timestamp("20170105")].reset_index( - drop=True - ) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - window = ibis.trailing_window(interval_ibis, order_by=time_table.time) - expr = time_table["value"].mean().over(window) - # result should adjust time context accordingly - result = expr.execute(timecontext=context) - tm.assert_series_equal(result, expected) - - -def test_trim_window_result(time_df3): - """Unit test `trim_window_result` in Window execution.""" - df = time_df3.copy() - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - # trim_window_result takes a MultiIndex Series as input - series = df["value"] - time_index = df.set_index("time").index - series.index = pd.MultiIndex.from_arrays( - [series.index, time_index], - names=series.index.names + ["time"], - ) - result = trim_window_result(series, context) - expected = df["time"][df["time"] >= pd.Timestamp("20170105")].reset_index(drop=True) - - # result should adjust time context accordingly - tm.assert_series_equal(result.reset_index()["time"], expected) - - # trim with a non-datetime type of 'time' throws Exception - wrong_series = df["id"] - df["time"] = df["time"].astype(str) - time_index = df.set_index("time").index - wrong_series.index = pd.MultiIndex.from_arrays( - [wrong_series.index, time_index], - names=wrong_series.index.names + ["time"], - ) - with pytest.raises(TypeError, match=r".*not supported between instances.*"): - trim_window_result(wrong_series, context) - - # column is ignored and series is not trimmed - no_context_result = trim_window_result(series, None) - tm.assert_series_equal(no_context_result, series) - - -def test_setting_timecontext_in_scope(time_table, time_df3): - expected_win_1 = ( - time_df3.set_index("time").value.rolling("3d", closed="both").mean() - ) - expected_win_1 = expected_win_1[ - expected_win_1.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - window1 = ibis.trailing_window(3 * ibis.interval(days=1), order_by=time_table.time) - """In the following expression, Selection node will be executed first and - get table in context ('20170105', '20170101'). - - Then in window execution table will be executed again with a larger - context adjusted by window preceding days ('20170102', '20170111'). - To get the correct result, the cached table result with a smaller - context must be discard and updated to a larger time range. 
- """ - expr = time_table.mutate(value=time_table["value"].mean().over(window1)) - result = expr.execute(timecontext=context) - tm.assert_series_equal(result["value"], expected_win_1) - - -def test_context_adjustment_multi_window(time_table, time_df3): - expected_win_1 = ( - time_df3.set_index("time") - .rename(columns={"value": "v1"})["v1"] - .rolling("3d", closed="both") - .mean() - ) - expected_win_1 = expected_win_1[ - expected_win_1.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - expected_win_2 = ( - time_df3.set_index("time") - .rename(columns={"value": "v2"})["v2"] - .rolling("2d", closed="both") - .mean() - ) - expected_win_2 = expected_win_2[ - expected_win_2.index >= pd.Timestamp("20170105") - ].reset_index(drop=True) - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - window1 = ibis.trailing_window(3 * ibis.interval(days=1), order_by=time_table.time) - window2 = ibis.trailing_window(2 * ibis.interval(days=1), order_by=time_table.time) - expr = time_table.mutate( - v1=time_table["value"].mean().over(window1), - v2=time_table["value"].mean().over(window2), - ) - result = expr.execute(timecontext=context) - - tm.assert_series_equal(result["v1"], expected_win_1) - tm.assert_series_equal(result["v2"], expected_win_2) - - -@pytest.mark.xfail( - condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"), - raises=ValueError, - reason="https://github.com/pandas-dev/pandas/pull/44068", -) -def test_context_adjustment_window_groupby_id(time_table, time_df3): - """This test case is meant to test trim_window_result method in - pandas/execution/window.py to see if it could trim Series correctly with - groupby params.""" - expected = ( - time_df3.set_index("time") - .groupby("id") - .value.rolling("3d", closed="both") - .mean() - ) - # This is a MultiIndexed Series - expected = expected.reset_index() - expected = expected[expected.time >= pd.Timestamp("20170105")].reset_index( - drop=True - )["value"] - - context = pd.Timestamp("20170105"), pd.Timestamp("20170111") - - # expected.index.name = None - window = ibis.trailing_window( - 3 * ibis.interval(days=1), group_by="id", order_by=time_table.time - ) - expr = time_table["value"].mean().over(window) - # result should adjust time context accordingly - result = expr.execute(timecontext=context) - tm.assert_series_equal(result, expected) - - -def test_adjust_context_scope(time_keyed_left, time_keyed_right): - """Test that `adjust_context` has access to `scope` by default.""" - - @adjust_context.register(CustomAsOfJoin) - def adjust_context_custom_asof_join( - op: ops.AsOfJoin, - scope: Scope, - timecontext: TimeContext, - ) -> TimeContext: - """Confirms that `scope` is passed in.""" - assert scope is not None - return timecontext - - expr = CustomAsOfJoin( - left=time_keyed_left, - right=time_keyed_right, - predicates="time", - by="key", - tolerance=ibis.interval(days=4), - ).to_expr() - expr = expr[time_keyed_left, time_keyed_right.other_value] - context = (pd.Timestamp("20170105"), pd.Timestamp("20170111")) - expr.execute(timecontext=context) - - -def test_adjust_context_complete_shift( - time_keyed_left, - time_keyed_right, - time_keyed_df1, - time_keyed_df2, -): - """Test `adjust_context` function that completely shifts the context. - - This results in an adjusted context that is NOT a subset of the - original context. This is unlike an `adjust_context` function - that only expands the context. - - See #3104 - """ - - # Create a contrived `adjust_context` function for - # CustomAsOfJoin to mock this. 
- - @adjust_context.register(CustomAsOfJoin) - def adjust_context_custom_asof_join( - op: ops.AsOfJoin, - scope: Scope, - timecontext: TimeContext, - ) -> TimeContext: - """Shifts both the begin and end in the same direction.""" - - begin, end = timecontext - timedelta = execute(op.tolerance) - return (begin - timedelta, end - timedelta) - - expr = CustomAsOfJoin( - left=time_keyed_left, - right=time_keyed_right, - predicates="time", - by="key", - tolerance=ibis.interval(days=4), - ).to_expr() - expr = expr[time_keyed_left, time_keyed_right.other_value] - context = (pd.Timestamp("20170101"), pd.Timestamp("20170111")) - result = expr.execute(timecontext=context) - - # Compare with asof_join of manually trimmed tables - # Left table: No shift for context - # Right table: Shift both begin and end of context by 4 days - trimmed_df1 = time_keyed_df1[time_keyed_df1["time"] >= context[0]][ - time_keyed_df1["time"] < context[1] - ] - trimmed_df2 = time_keyed_df2[ - time_keyed_df2["time"] >= context[0] - pd.Timedelta(days=4) - ][time_keyed_df2["time"] < context[1] - pd.Timedelta(days=4)] - expected = pd.merge_asof( - trimmed_df1, - trimmed_df2, - on="time", - by="key", - tolerance=pd.Timedelta("4D"), - ) - - tm.assert_frame_equal(result, expected) - - -def test_construct_time_context_aware_series(time_df3): - """Unit test for `construct_time_context_aware_series`""" - # Series without 'time' index will result in a MultiIndex with 'time' - df = time_df3 - expected = df["value"] - time_index = pd.Index(df["time"]) - expected.index = pd.MultiIndex.from_arrays( - [expected.index, time_index], - names=expected.index.names + ["time"], - ) - result = construct_time_context_aware_series(df["value"], df) - tm.assert_series_equal(result, expected) - - # Series with 'time' as index will not change - time_indexed_df = time_df3.set_index("time") - expected_time_aware = time_indexed_df["value"] - result_time_aware = construct_time_context_aware_series( - time_indexed_df["value"], time_indexed_df - ) - tm.assert_series_equal(result_time_aware, expected_time_aware) - - # Series with a MultiIndex, where 'time' is in the MultiIndex, - # will not change - multi_index_time_aware_series = result_time_aware - expected_multi_index_time_aware = result_time_aware - result_multi_index_time_aware = construct_time_context_aware_series( - multi_index_time_aware_series, time_indexed_df - ) - tm.assert_series_equal( - result_multi_index_time_aware, expected_multi_index_time_aware - ) - - # Series with a MultiIndex, where 'time' is NOT in the MultiIndex, - # 'time' will be added into the MultiIndex - multi_index_series = df["id"] - expected_multi_index = df["id"].copy() - other_index = pd.Index(df["value"]) - expected_multi_index.index = pd.MultiIndex.from_arrays( - [expected_multi_index.index, other_index, time_index], - names=expected_multi_index.index.names + ["value", "time"], - ) - multi_index_series.index = pd.MultiIndex.from_arrays( - [multi_index_series.index, other_index], - names=multi_index_series.index.names + ["value"], - ) - result_multi_index = construct_time_context_aware_series(multi_index_series, df) - tm.assert_series_equal(result_multi_index, expected_multi_index) diff --git a/ibis/backends/pandas/tests/test_aggcontext.py b/ibis/backends/pandas/tests/test_aggcontext.py deleted file mode 100644 index 8fc7281a6fa7..000000000000 --- a/ibis/backends/pandas/tests/test_aggcontext.py +++ /dev/null @@ -1,167 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest -from 
pandas import testing as tm -from pytest import param - -from ibis.backends.pandas.aggcontext import Summarize, window_agg_udf - -df = pd.DataFrame( - { - "id": [1, 2, 1, 2], - "v1": [1.0, 2.0, 3.0, 4.0], - "v2": [10.0, 20.0, 30.0, 40.0], - } -) - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1: v1.mean(), - lambda df: df["v1"].mean(), - id="udf", - ), - param( - "mean", - lambda df: df["v1"].mean(), - id="string", - ), - ], -) -def test_summarize_single_series(agg_fn, expected_fn): - """Test Summarize.agg operating on a single Series.""" - - aggcontext = Summarize() - - result = aggcontext.agg(df["v1"], agg_fn) - expected = expected_fn(df) - - assert result == expected - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1: v1.mean(), - lambda df: df["v1"].mean(), - id="udf", - ), - param( - "mean", - lambda df: df["v1"].mean(), - id="string", - ), - ], -) -def test_summarize_single_seriesgroupby(agg_fn, expected_fn): - """Test Summarize.agg operating on a single SeriesGroupBy.""" - - aggcontext = Summarize() - - df_grouped = df.sort_values("id").groupby("id") - result = aggcontext.agg(df_grouped["v1"], agg_fn) - - expected = expected_fn(df_grouped) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("agg_fn", "expected_fn"), - [ - param( - lambda v1, v2: v1.mean() - v2.mean(), - lambda df: df["v1"].mean() - df["v2"].mean(), - id="two-column", - ), - # Two columns, but only the second one is actually used in UDF - param( - lambda v1, v2: v2.mean(), - lambda df: df["v2"].mean(), - id="redundant-column", - ), - ], -) -def test_summarize_multiple_series(agg_fn, expected_fn): - """Test Summarize.agg operating on many Series.""" - - aggcontext = Summarize() - - args = [df["v1"], df["v2"]] - result = aggcontext.agg(args[0], agg_fn, *args[1:]) - - expected = expected_fn(df) - - assert result == expected - - -@pytest.mark.parametrize( - "param", - [ - ( - pd.Series([True, True, True, True]), - pd.Series([1.0, 2.0, 2.0, 3.0]), - ), - ( - pd.Series([False, True, True, False]), - pd.Series([np.NaN, 2.0, 2.0, np.NaN]), - ), - ], -) -def test_window_agg_udf(param): - """Test passing custom window indices for window aggregation.""" - - mask, expected = param - - grouped_data = df.sort_values("id").groupby("id")["v1"] - result_index = grouped_data.obj.index - - window_lower_indices = pd.Series([0, 0, 2, 2]) - window_upper_indices = pd.Series([1, 2, 3, 4]) - - result = window_agg_udf( - grouped_data, - lambda s: s.mean(), - window_lower_indices, - window_upper_indices, - mask, - result_index, - dtype="float", - max_lookback=None, - ) - - expected.index = grouped_data.obj.index - - tm.assert_series_equal(result, expected) - - -def test_window_agg_udf_different_freq(): - """Test that window_agg_udf works when the window series and data series - have different frequencies.""" - - time = pd.Series([pd.Timestamp("20200101"), pd.Timestamp("20200201")]) - data = pd.Series([1, 2, 3, 4, 5, 6]) - window_lower_indices = pd.Series([0, 4]) - window_upper_indices = pd.Series([5, 7]) - mask = pd.Series([True, True]) - result_index = time.index - - result = window_agg_udf( - data, - lambda s: s.mean(), - window_lower_indices, - window_upper_indices, - mask, - result_index, - "float", - None, - ) - - expected = pd.Series([data.iloc[0:5].mean(), data.iloc[4:7].mean()]) - - tm.assert_series_equal(result, expected) diff --git a/ibis/backends/pandas/tests/execution/test_arrays.py b/ibis/backends/pandas/tests/test_arrays.py similarity 
index 96% rename from ibis/backends/pandas/tests/execution/test_arrays.py rename to ibis/backends/pandas/tests/test_arrays.py index 00e873715224..98d1bb6fcd8d 100644 --- a/ibis/backends/pandas/tests/execution/test_arrays.py +++ b/ibis/backends/pandas/tests/test_arrays.py @@ -36,6 +36,13 @@ def test_array_length(t): tm.assert_frame_equal(result, expected) +def test_array_slice_using_column(t): + expr = t.array_of_int64[t.plain_int64 :] + result = expr.execute() + expected = pd.Series([[2], [], []]) + tm.assert_series_equal(result, expected) + + def test_array_length_scalar(client): raw_value = np.array([1, 2, 4]) value = ibis.array(raw_value) diff --git a/ibis/backends/pandas/tests/execution/test_cast.py b/ibis/backends/pandas/tests/test_cast.py similarity index 80% rename from ibis/backends/pandas/tests/execution/test_cast.py rename to ibis/backends/pandas/tests/test_cast.py index bc2d8a60f974..7ca38a675261 100644 --- a/ibis/backends/pandas/tests/execution/test_cast.py +++ b/ibis/backends/pandas/tests/test_cast.py @@ -5,14 +5,10 @@ import numpy as np import pandas as pd import pytest -import pytz -from pytest import param import ibis import ibis.expr.datatypes as dt -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.common.exceptions import OperationNotDefinedError TIMESTAMP = "2022-03-13 06:59:10.467417" @@ -63,7 +59,9 @@ def test_cast_array(t, from_, to, expected): # One of the arrays in the Series res = result[0] assert isinstance(res, list) - assert [ibis.literal(v).type() for v in res] == [expected] * len(res) + + for v in result: + assert v == [dt.normalize(expected, x) for x in v] @pytest.mark.parametrize( @@ -71,7 +69,7 @@ def test_cast_array(t, from_, to, expected): [ ("string", "object"), ("int64", "int64"), - param("double", "float64", marks=pytest.mark.xfail(raises=TypeError)), + ("double", "float64"), ( dt.Timestamp("America/Los_Angeles"), "datetime64[ns, America/Los_Angeles]", @@ -97,22 +95,18 @@ def test_cast_timestamp_column(t, df, column, to, expected): [ ("string", str), ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - param( - "double", - float, - marks=pytest.mark.xfail(raises=OperationNotDefinedError), - ), + ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), ( dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz=pytz.timezone("America/Los_Angeles")), + lambda x: x.tz_localize(tz="America/Los_Angeles"), ), ], ) -def test_cast_timestamp_scalar_naive(to, expected): +def test_cast_timestamp_scalar_naive(client, to, expected): literal_expr = ibis.literal(pd.Timestamp(TIMESTAMP)) value = literal_expr.cast(to) - result = execute(value.op()) - raw = execute(literal_expr.op()) + result = client.execute(value) + raw = client.execute(literal_expr) assert result == expected(raw) @@ -121,23 +115,19 @@ def test_cast_timestamp_scalar_naive(to, expected): [ ("string", str), ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - param( - "double", - float, - marks=pytest.mark.xfail(raises=OperationNotDefinedError), - ), + ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), ( dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz=pytz.timezone("America/Los_Angeles")), + lambda x: x.astimezone(tz="America/Los_Angeles"), ), ], ) @pytest.mark.parametrize("tz", ["UTC", "America/New_York"]) -def test_cast_timestamp_scalar(to, expected, tz): +def test_cast_timestamp_scalar(client, to, expected, tz): literal_expr = 
ibis.literal(pd.Timestamp(TIMESTAMP).tz_localize(tz)) value = literal_expr.cast(to) - result = execute(value.op()) - raw = execute(literal_expr.op()) + result = client.execute(value) + raw = client.execute(literal_expr) assert result == expected(raw) @@ -158,7 +148,7 @@ def test_cast_date(t, df, column): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("type", [dt.Decimal(9, 0), dt.Decimal(12, 3)]) +@pytest.mark.parametrize("type", [dt.Decimal(9, 2), dt.Decimal(12, 3)]) def test_cast_to_decimal(t, df, type): expr = t.float64_as_strings.cast(type) result = expr.execute() diff --git a/ibis/backends/pandas/tests/test_core.py b/ibis/backends/pandas/tests/test_core.py index eb980c6cf7e9..45e3a3a02b94 100644 --- a/ibis/backends/pandas/tests/test_core.py +++ b/ibis/backends/pandas/tests/test_core.py @@ -6,11 +6,7 @@ import ibis import ibis.common.exceptions as com -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope from ibis.backends.pandas import Backend -from ibis.backends.pandas.dispatch import post_execute, pre_execute -from ibis.backends.pandas.execution import execute @pytest.fixture @@ -50,59 +46,24 @@ def test_from_dataframe(dataframe, ibis_table, core_client): tm.assert_frame_equal(result, expected) -def test_pre_execute_basic(): - """Test that pre_execute has intercepted execution and provided its own - scope dict.""" - - @pre_execute.register(ops.Add) - def pre_execute_test(op, *clients, scope=None, **kwargs): - return Scope({op: 4}, None) - - one = ibis.literal(1) - expr = one + one - result = execute(expr.op()) - assert result == 4 - - del pre_execute.funcs[(ops.Add,)] - pre_execute.reorder() - pre_execute._cache.clear() - - def test_execute_parameter_only(): param = ibis.param("int64") - result = execute(param.op(), params={param.op(): 42}) + con = ibis.pandas.connect() + result = con.execute(param, params={param.op(): 42}) assert result == 42 def test_missing_data_sources(): - t = ibis.table([("a", "string")]) + t = ibis.table([("a", "string")], name="t") expr = t.a.length() + con = ibis.pandas.connect() with pytest.raises(com.UnboundExpressionError): - execute(expr.op()) - - -def test_post_execute_called_on_joins(dataframe, core_client, ibis_table): - count = [0] - - @post_execute.register(ops.InnerJoin, pd.DataFrame) - def tmp_left_join_exe(op, lhs, **kwargs): - count[0] += 1 - return lhs - - left = ibis_table - right = left.view() - join = left.join(right, "plain_strings")[left.plain_int64] - result = join.execute() - assert result is not None - assert not result.empty - assert count[0] == 1 - - -def test_scope_look_up(): - # test if scope could lookup items properly - scope = Scope() - one_day = ibis.interval(days=1).op() - one_hour = ibis.interval(hours=1).op() - scope = scope.merge_scope(Scope({one_day: 1}, None)) - assert scope.get_value(one_hour) is None - assert scope.get_value(one_day) is not None + con.execute(expr) + + +def test_unbound_table_execution(): + t = ibis.table([("a", "string")], name="t") + expr = t.a.length() + con = ibis.pandas.connect({"t": pd.DataFrame({"a": ["a", "ab", "abc"]})}) + result = con.execute(expr) + assert result.tolist() == [1, 2, 3] diff --git a/ibis/backends/pandas/tests/test_dispatcher.py b/ibis/backends/pandas/tests/test_dispatcher.py deleted file mode 100644 index 27916fd112e0..000000000000 --- a/ibis/backends/pandas/tests/test_dispatcher.py +++ /dev/null @@ -1,143 +0,0 @@ -from __future__ import annotations - -import pytest -from multipledispatch import Dispatcher - -from 
ibis.backends.pandas.dispatcher import TwoLevelDispatcher - - -class A1: - pass - - -class A2(A1): - pass - - -class A3(A2): - pass - - -class B1: - pass - - -class B2(B1): - pass - - -class B3(B2): - pass - - -@pytest.fixture -def foo_dispatchers(): - foo = TwoLevelDispatcher("foo", doc="Test dispatcher foo") - foo_m = Dispatcher("foo_m", doc="Control dispatcher foo_m") - - @foo.register(A1, B1) - @foo_m.register(A1, B1) - def foo0(x, y): - return 0 - - @foo.register(A1, B2) - @foo_m.register(A1, B2) - def foo1(x, y): - return 1 - - @foo.register(A2, B1) - @foo_m.register(A2, B1) - def foo2(x, y): - return 2 - - @foo.register(A2, B2) - @foo_m.register(A2, B2) - def foo3(x, y): - return 3 - - @foo.register( - (A1, A2), - ) - @foo_m.register( - (A1, A2), - ) - def foo4(x): - return 4 - - return foo, foo_m - - -@pytest.fixture -def foo(foo_dispatchers): - return foo_dispatchers[0] - - -@pytest.fixture -def foo_m(foo_dispatchers): - return foo_dispatchers[1] - - -def test_cache(foo, mocker): - """Test that cache is properly set after calling with args.""" - - spy = mocker.spy(foo, "dispatch") - a1, b1 = A1(), B1() - - assert (A1, B1) not in foo._cache - foo(a1, b1) - assert (A1, B1) in foo._cache - foo(a1, b1) - spy.assert_called_once_with(A1, B1) - - -def test_dispatch(foo, mocker): - """Test that calling dispatcher with a signature that is registered does - not trigger a linear search through dispatch_iter.""" - - spy = mocker.spy(foo, "dispatch_iter") - - # This should not trigger a linear search - foo(A1(), B1()) - assert not spy.called, ( - "Calling dispatcher with registered signature should " - "not trigger linear search" - ) - - foo(A3(), B3()) - spy.assert_called_once_with(A3, B3) - - -@pytest.mark.parametrize( - "args", - [ - (A1(), B1()), - (A1(), B2()), - (A1(), B3()), - (A2(), B1()), - (A2(), B2()), - (A2(), B3()), - (A3(), B1()), - (A3(), B2()), - (A3(), B3()), - (A1(),), - (A2(),), - (A3(),), - ], -) -def test_registered(foo_dispatchers, args): - foo, foo_m = foo_dispatchers - assert foo(*args) == foo_m(*args) - - -def test_ordering(foo, foo_m): - assert foo.ordering == foo_m.ordering - - -def test_funcs(foo, foo_m): - assert foo.funcs == foo_m.funcs - - -@pytest.mark.parametrize("args", [(B1(),), (B2(),), (A1(), A1()), (A1(), A2(), A3())]) -def test_unregistered(foo, args): - with pytest.raises(NotImplementedError, match="Could not find signature for foo.*"): - foo(*args) diff --git a/ibis/backends/pandas/tests/execution/test_functions.py b/ibis/backends/pandas/tests/test_functions.py similarity index 92% rename from ibis/backends/pandas/tests/execution/test_functions.py rename to ibis/backends/pandas/tests/test_functions.py index 2b3851675858..9ef36b23ffb6 100644 --- a/ibis/backends/pandas/tests/execution/test_functions.py +++ b/ibis/backends/pandas/tests/test_functions.py @@ -13,7 +13,6 @@ import ibis import ibis.expr.datatypes as dt -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.backends.pandas.udf import udf @@ -74,7 +73,6 @@ def wrapper(*args, **kwargs): param( methodcaller("floor"), lambda x: decimal.Decimal(math.floor(x)), id="floor" ), - param(methodcaller("exp"), methodcaller("exp"), id="exp"), param( methodcaller("sign"), lambda x: x if not x else decimal.Decimal(1).copy_sign(x), @@ -97,19 +95,21 @@ def wrapper(*args, **kwargs): ) def test_math_functions_decimal(t, df, ibis_func, pandas_func): dtype = dt.Decimal(12, 3) - expr = ibis_func(t.float64_as_strings.cast(dtype)) - result = 
expr.execute() context = decimal.Context(prec=dtype.precision) - expected = df.float64_as_strings.apply( - lambda x: context.create_decimal(x).quantize( - decimal.Decimal( - f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}" - ) + + def normalize(x): + x = context.create_decimal(x) + p = decimal.Decimal( + f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}" ) - ).apply(pandas_func) + return x.quantize(p) + + expr = ibis_func(t.float64_as_strings.cast(dtype)) + result = expr.execute() - result[result.apply(math.isnan)] = -99999 - expected[expected.apply(math.isnan)] = -99999 + expected = ( + df.float64_as_strings.apply(normalize).apply(pandas_func).apply(normalize) + ) tm.assert_series_equal(result, expected.astype(expr.type().to_pandas())) @@ -221,10 +221,11 @@ def my_func(x, _): return x df = pd.DataFrame({"left": [left], "right": [right]}) - table = ibis.pandas.connect().from_dataframe(df) + con = ibis.pandas.connect() + table = con.from_dataframe(df) expr = my_func(table.left, table.right) - result = execute(expr.op()) + result = con.execute(expr) assert isinstance(result, pd.Series) result = result.tolist() @@ -238,8 +239,8 @@ def test_ifelse_returning_bool(): true = ibis.literal(True) false = ibis.literal(False) expr = ibis.ifelse(one + one == two, true, false) - result = execute(expr.op()) - assert result is True + result = ibis.pandas.connect().execute(expr) + assert result is True or result is np.True_ @pytest.mark.parametrize( @@ -261,7 +262,7 @@ def func(x): df = pd.DataFrame({"col": [value]}) table = ibis.pandas.connect().from_dataframe(df) - result = execute(table.col.op()) + result = table.col.execute() assert isinstance(result, pd.Series) result = result.tolist() diff --git a/ibis/backends/pandas/tests/test_helpers.py b/ibis/backends/pandas/tests/test_helpers.py new file mode 100644 index 000000000000..4814a0d85376 --- /dev/null +++ b/ibis/backends/pandas/tests/test_helpers.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import pytest + +from ibis.backends.pandas.helpers import RowsFrame + +lst = list(range(10)) + + +@pytest.mark.parametrize( + ("ix", "start", "end", "expected"), + [ + (0, None, None, lst), + (0, 0, None, lst), + (0, None, 0, [0]), + (0, 0, 0, [0]), + (0, 0, 1, [0, 1]), + (0, 1, 1, [1]), + (0, 1, 2, [1, 2]), + (0, 1, None, lst[1:]), + (0, None, 1, [0, 1]), + (0, -1, None, lst), + (0, None, -1, []), + (0, -1, -1, []), + (0, -2, -1, []), + (0, -2, None, lst), + (0, None, -2, []), + (0, -1, 1, [0, 1]), + (0, 1, -1, []), + (0, -1, 2, [0, 1, 2]), + (1, None, None, lst), + (1, 0, None, lst[1:]), + (1, None, 0, [0, 1]), + (1, 0, 0, [1]), + (1, 0, 1, [1, 2]), + (1, 1, 1, [2]), + (1, 1, 2, [2, 3]), + (1, 1, None, lst[2:]), + (1, None, 1, [0, 1, 2]), + (1, -1, None, lst), + (1, None, -1, [0]), + (1, -1, -1, [0]), + (1, -2, -1, [0]), + (1, -2, None, lst), + (1, None, -2, []), + (1, -1, 1, [0, 1, 2]), + (1, 1, -1, []), + (1, -1, 2, [0, 1, 2, 3]), + (2, None, None, lst), + (2, 0, None, lst[2:]), + (2, None, 0, [0, 1, 2]), + (2, 0, 0, [2]), + (2, 0, 1, [2, 3]), + (2, 1, 1, [3]), + (2, 1, 2, [3, 4]), + (2, 1, None, lst[3:]), + (2, None, 1, [0, 1, 2, 3]), + (2, -1, None, lst[1:]), + (2, None, -1, [0, 1]), + (2, -1, -1, [1]), + (2, -2, -1, [0, 1]), + (2, -2, None, lst), + (2, None, -2, [0]), + (2, -1, 1, [1, 2, 3]), + (2, 1, -1, []), + (2, -1, 2, [1, 2, 3, 4]), + (3, None, None, lst), + ], +) +def test_rows_frame_adjustment(ix, start, end, expected): + start_index, end_index = RowsFrame.adjust(len(lst), ix, start, end) + assert 
lst[start_index:end_index] == expected diff --git a/ibis/backends/pandas/tests/execution/test_join.py b/ibis/backends/pandas/tests/test_join.py similarity index 89% rename from ibis/backends/pandas/tests/execution/test_join.py rename to ibis/backends/pandas/tests/test_join.py index 8fd990ea86e1..a9acaad3ed6e 100644 --- a/ibis/backends/pandas/tests/execution/test_join.py +++ b/ibis/backends/pandas/tests/test_join.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pandas as pd import pandas.testing as tm import pytest @@ -57,9 +58,26 @@ def test_join_with_multiple_predicates(how, left, right, df1, df2): ] result = expr.execute() expected = pd.merge( - df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] + df1, + df2, + how=how, + left_on=["key", "key2"], + right_on=["key", "key3"], + suffixes=("_left", "_right"), ).reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) + + expected_columns = ["key", "value", "key2", "key3", "other_value"] + expected = expected[expected_columns] + if how == "right": + # the ibis expression references the `key` column from the left table + # which is not present in the result of the right join, but pandas + # includes the column from the right table + expected["key"] = pd.Series([np.nan, np.nan, np.nan], dtype=object) + elif how == "outer": + expected["key"] = pd.Series(["a", "b", "c", "d", np.nan, np.nan], dtype=object) + + assert list(result.columns) == expected_columns + tm.assert_frame_equal(result, expected) @mutating_join_type @@ -70,6 +88,12 @@ def test_join_with_multiple_predicates_written_as_one(how, left, right, df1, df2 expected = pd.merge( df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] ).reset_index(drop=True) + + if how == "right": + expected["key"] = pd.Series([np.nan, np.nan], dtype=object) + elif how == "outer": + expected["key"] = pd.Series(["a", "b", "c", "d", np.nan, np.nan], dtype=object) + tm.assert_frame_equal(result[expected.columns], expected) @@ -270,7 +294,9 @@ def test_asof_join(time_left, time_right, time_df1, time_df2): def test_asof_join_predicate(time_left, time_right, time_df1, time_df2): expr = time_left.asof_join(time_right, time_left.time == time_right.time) result = expr.execute() - expected = pd.merge_asof(time_df1, time_df2, on="time") + expected = pd.merge_asof( + time_df1, time_df2, on="time", direction="nearest", allow_exact_matches=True + ) tm.assert_frame_equal(result[expected.columns], expected) with pytest.raises(AssertionError): tm.assert_series_equal(result["time"], result["time_right"]) @@ -281,13 +307,10 @@ def test_keyed_asof_join( time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 ): expr = time_keyed_left.asof_join(time_keyed_right, "time", by="key") + expr = expr.select(time_keyed_left, time_keyed_right.other_value) result = expr.execute() expected = pd.merge_asof(time_keyed_df1, time_keyed_df2, on="time", by="key") tm.assert_frame_equal(result[expected.columns], expected) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["time"], result["time_right"]) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["key"], result["key_right"]) @merge_asof_minversion @@ -327,7 +350,7 @@ def test_asof_join_overlapping_non_predicate( time_keyed_df2.assign(collide=time_keyed_df2["key"] + time_keyed_df2["other_value"]) expr = time_keyed_left.asof_join( - time_keyed_right, predicates=[("time", "time")], by=[("key", "key")] + time_keyed_right, on=("time", "time"), 
predicates=[("key", "key")] ) result = expr.execute() expected = pd.merge_asof( @@ -595,3 +618,33 @@ def test_multijoin(tracts_df, fields_df, harvest_df): ) tm.assert_frame_equal(result, expected) + + +def test_chain_join(): + test_df1 = pd.DataFrame({"id": ["1", "1"], "value": ["a", "a"]}) + test_df2 = pd.DataFrame({"id": ["1", "1"], "value": ["z", "z"]}) + test_df3 = pd.DataFrame({"id": ["1", "1"], "value": ["z1", "z1"]}) + + conn = ibis.pandas.connect({"df1": test_df1, "df2": test_df2, "df3": test_df3}) + + t1 = conn.table("df1") + t2 = conn.table("df2") + t3 = conn.table("df3") + + expr = ( + t1.join(t2, t1.id == t2.id) + .join(t3, t1.id == t3.id) + .select(t1.id, t1.value, t2.value.name("value2"), t3.value.name("value3")) + ) + result = expr.execute() + + n = len(test_df1) * len(test_df2) * len(test_df3) + expected = pd.DataFrame( + { + "id": ["1"] * n, + "value": ["a"] * n, + "value2": ["z"] * n, + "value3": ["z1"] * n, + } + ) + tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/pandas/tests/execution/test_maps.py b/ibis/backends/pandas/tests/test_maps.py similarity index 100% rename from ibis/backends/pandas/tests/execution/test_maps.py rename to ibis/backends/pandas/tests/test_maps.py diff --git a/ibis/backends/pandas/tests/execution/test_operations.py b/ibis/backends/pandas/tests/test_operations.py similarity index 99% rename from ibis/backends/pandas/tests/execution/test_operations.py rename to ibis/backends/pandas/tests/test_operations.py index 54877d1ce4d0..3d6e78d9d2c6 100644 --- a/ibis/backends/pandas/tests/execution/test_operations.py +++ b/ibis/backends/pandas/tests/test_operations.py @@ -13,7 +13,6 @@ import ibis.expr.datatypes as dt from ibis import _ from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm @@ -183,7 +182,6 @@ def test_group_by_rename_key(t, df): expr = t.group_by(t.dup_strings.name("foo")).aggregate( dup_string_count=t.dup_strings.count() ) - assert "foo" in expr.schema() result = expr.execute() assert "foo" in result.columns @@ -281,7 +279,7 @@ def test_nullif_zero(t, df, column): param( lambda t: ibis.literal("a"), lambda t: t.dup_strings, - lambda _: pd.Series(["d", np.nan, "d"], name="dup_strings"), + lambda _: pd.Series(["a", np.nan, "a"], name="dup_strings"), tm.assert_series_equal, id="literal_series", ), @@ -289,7 +287,7 @@ def test_nullif_zero(t, df, column): ) def test_nullif(t, df, left, right, expected, compare): expr = left(t).nullif(right(t)) - result = execute(expr.op()) + result = Backend().execute(expr) compare(result, expected(df)) diff --git a/ibis/backends/pandas/tests/execution/test_strings.py b/ibis/backends/pandas/tests/test_strings.py similarity index 89% rename from ibis/backends/pandas/tests/execution/test_strings.py rename to ibis/backends/pandas/tests/test_strings.py index 27f603903cd6..e583cb53437e 100644 --- a/ibis/backends/pandas/tests/execution/test_strings.py +++ b/ibis/backends/pandas/tests/test_strings.py @@ -7,7 +7,9 @@ import pytest from pytest import param -from ibis.backends.pandas.execution.strings import sql_like_to_regex +import ibis +from ibis.backends.pandas import Backend +from ibis.backends.pandas.kernels import sql_like_to_regex @pytest.mark.parametrize( @@ -165,3 +167,23 @@ def test_translate( table = str.maketrans(from_str, to_str) series = df.strings_with_space.str.translate(table) tm.assert_series_equal(result, series, check_names=False) + + +def test_string_repeat(t): + int_col = 
t.plain_int64 + int_lit = ibis.literal(3) + string_col = t.strings_with_space + string_lit = ibis.literal("abc") + + expr1 = string_col.repeat(int_col) + expr2 = string_col.repeat(int_lit) + expr3 = string_lit.repeat(int_col) + expr4 = string_lit.repeat(int_lit) + + con = Backend() + con.execute(expr1) + con.execute(expr2) + con.execute(expr3) + con.execute(expr4) + + # TODO(kszucs): add assertions or rather parametrize the tests above diff --git a/ibis/backends/pandas/tests/execution/test_structs.py b/ibis/backends/pandas/tests/test_structs.py similarity index 95% rename from ibis/backends/pandas/tests/execution/test_structs.py rename to ibis/backends/pandas/tests/test_structs.py index 203d3e961b19..bf9647f73a62 100644 --- a/ibis/backends/pandas/tests/execution/test_structs.py +++ b/ibis/backends/pandas/tests/test_structs.py @@ -8,7 +8,6 @@ import ibis import ibis.expr.datatypes as dt from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm @@ -48,13 +47,14 @@ def test_struct_field_literal(value): assert struct.type() == dt.Struct.from_tuples( [("fruit", dt.string), ("weight", dt.int8)] ) + con = ibis.pandas.connect() expr = struct["fruit"] - result = execute(expr.op()) + result = con.execute(expr) assert result == "pear" expr = struct["weight"] - result = execute(expr.op()) + result = con.execute(expr) assert result == 0 diff --git a/ibis/backends/pandas/tests/execution/test_temporal.py b/ibis/backends/pandas/tests/test_temporal.py similarity index 98% rename from ibis/backends/pandas/tests/execution/test_temporal.py rename to ibis/backends/pandas/tests/test_temporal.py index cd9a1e98384b..f8cf670e99f1 100644 --- a/ibis/backends/pandas/tests/execution/test_temporal.py +++ b/ibis/backends/pandas/tests/test_temporal.py @@ -9,9 +9,9 @@ from packaging.version import parse as parse_version from pytest import param +import ibis from ibis import literal as L from ibis.backends.pandas import Backend -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.expr import datatypes as dt @@ -44,6 +44,7 @@ ], ) def test_timestamp_functions(case_func, expected_func): + con = ibis.pandas.connect() v = L("2015-09-01 14:48:05.359").cast("timestamp") vt = datetime.datetime( year=2015, @@ -56,7 +57,7 @@ def test_timestamp_functions(case_func, expected_func): ) result = case_func(v) expected = expected_func(vt) - assert execute(result.op()) == expected + assert con.execute(result) == expected @pytest.mark.parametrize( diff --git a/ibis/backends/pandas/tests/test_udf.py b/ibis/backends/pandas/tests/test_udf.py index f310db217413..df6917aa2b25 100644 --- a/ibis/backends/pandas/tests/test_udf.py +++ b/ibis/backends/pandas/tests/test_udf.py @@ -364,26 +364,28 @@ def my_wm(v, w): tm.assert_frame_equal(result, expected) -def test_udaf_window_nan(): - df = pd.DataFrame( - { - "a": np.arange(10, dtype=float), - "b": [3.0, np.NaN] * 5, - "key": list("ddeefffggh"), - } - ) - con = Backend().connect({"df": df}) - t = con.table("df") - window = ibis.trailing_window(2, order_by="a", group_by="key") - expr = t.mutate(rolled=my_mean(t.b).over(window)) - result = expr.execute().sort_values(["key", "a"]) - expected = df.sort_values(["key", "a"]).assign( - rolled=lambda d: d.groupby("key") - .b.rolling(3, min_periods=1) - .apply(lambda x: x.mean(), raw=True) - .reset_index(level=0, drop=True) - ) - tm.assert_frame_equal(result, expected) +# TODO(kszucs): revisit 
this, duckdb produces the same result as the pandas +# backend, but the expected result is different +# def test_udaf_window_nan(): +# df = pd.DataFrame( +# { +# "a": np.arange(10, dtype=float), +# "b": [3.0, np.NaN] * 5, +# "key": list("ddeefffggh"), +# } +# ) +# con = Backend().connect({"df": df}) +# t = con.table("df") +# window = ibis.trailing_window(2, order_by="a", group_by="key") +# expr = t.mutate(rolled=my_mean(t.b).over(window)) +# result = expr.execute().sort_values(["key", "a"]) +# expected = df.sort_values(["key", "a"]).assign( +# rolled=lambda d: d.groupby("key") +# .b.rolling(3, min_periods=1) +# .apply(lambda x: x.mean(), raw=True) +# .reset_index(level=0, drop=True) +# ) +# tm.assert_frame_equal(result, expected) @pytest.fixture(params=[[0.25, 0.75], [0.01, 0.99]]) diff --git a/ibis/backends/pandas/tests/execution/test_window.py b/ibis/backends/pandas/tests/test_window.py similarity index 93% rename from ibis/backends/pandas/tests/execution/test_window.py rename to ibis/backends/pandas/tests/test_window.py index 905dd833c775..0f46a4a987b4 100644 --- a/ibis/backends/pandas/tests/execution/test_window.py +++ b/ibis/backends/pandas/tests/test_window.py @@ -11,11 +11,7 @@ import ibis import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.backends.base.df.scope import Scope from ibis.backends.pandas import Backend -from ibis.backends.pandas.dispatch import pre_execute -from ibis.backends.pandas.execution import execute from ibis.backends.pandas.tests.conftest import TestConf as tm from ibis.common.annotations import ValidationError from ibis.legacy.udf.vectorized import reduction @@ -51,58 +47,63 @@ def range_window(): @default @row_offset def test_lead(t, df, row_offset, default, row_window): + con = ibis.pandas.connect() expr = t.dup_strings.lead(row_offset, default=default).over(row_window) result = expr.execute() - expected = df.dup_strings.shift(execute((-row_offset).op())) + expected = df.dup_strings.shift(con.execute(-row_offset)) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @row_offset def test_lag(t, df, row_offset, default, row_window): + con = ibis.pandas.connect() expr = t.dup_strings.lag(row_offset, default=default).over(row_window) result = expr.execute() - expected = df.dup_strings.shift(execute(row_offset.op())) + expected = df.dup_strings.shift(con.execute(row_offset)) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @range_offset def test_lead_delta(t, df, range_offset, default, range_window): + con = ibis.pandas.connect() expr = t.dup_strings.lead(range_offset, default=default).over(range_window) result = expr.execute() expected = ( df[["plain_datetimes_naive", "dup_strings"]] .set_index("plain_datetimes_naive") .squeeze() - .shift(freq=execute((-range_offset).op())) + .shift(freq=con.execute(-range_offset)) .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @default @range_offset def test_lag_delta(t, df, range_offset, default, range_window): + con = ibis.pandas.connect() expr = t.dup_strings.lag(range_offset, default=default).over(range_window) result = 
expr.execute() + expected = ( df[["plain_datetimes_naive", "dup_strings"]] .set_index("plain_datetimes_naive") .squeeze() - .shift(freq=execute(range_offset.op())) + .shift(freq=con.execute(range_offset)) .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) if default is not ibis.NA: - expected = expected.fillna(execute(default.op())) + expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @@ -510,29 +511,6 @@ def test_window_with_mlb(): ) -def test_window_has_pre_execute_scope(): - called = [0] - - @pre_execute.register(ops.Lag, Backend) - def test_pre_execute(op, client, **kwargs): - called[0] += 1 - return Scope() - - data = {"key": list("abc"), "value": [1, 2, 3], "dup": list("ggh")} - df = pd.DataFrame(data, columns=["key", "value", "dup"]) - client = ibis.pandas.connect({"df": df}) - t = client.table("df") - window = ibis.window(order_by="value") - expr = t.key.lag(1).over(window).name("foo") - result = expr.execute() - assert result is not None - - # once in window op at the top to pickup any scope changes before computing - # twice in window op when calling execute on the ops.Lag node at the - # beginning of execute and once before the actual computation - assert called[0] == 3 - - def test_window_grouping_key_has_scope(t, df): param = ibis.param(dt.string) window = ibis.window(group_by=t.dup_strings + param) diff --git a/ibis/backends/pandas/trace.py b/ibis/backends/pandas/trace.py deleted file mode 100644 index 2350e8957930..000000000000 --- a/ibis/backends/pandas/trace.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Module that adds tracing to pandas execution. - -With tracing enabled, this module will log time and call stack information of -the executed expression. Call stack information is presented with indentation -level. 
- -For example: - -import pandas as pd -import logging - -import ibis.expr.datatypes as dt -import ibis.backends.pandas -from ibis.legacy.udf.vectorized import elementwise -from ibis.backends.pandas import trace - -logging.basicConfig() -trace.enable() - -df = pd.DataFrame( - { - 'a': [1, 2, 3] - } -) - -con = ibis.pandas.connect({"table1": df}) - -@elementwise( - input_type=[dt.double], - output_type=dt.double -) -def add_one(v): - import time - time.sleep(5) - return v + 1 - -table = con.table("table1") -table = table.mutate(b=add_one(table['a'])) -table.execute() - -Output: - -DEBUG:ibis.backends.pandas.trace: main_execute Selection -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope Selection -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable -DEBUG:ibis.backends.pandas.trace: execute_database_table_client PandasTable -DEBUG:ibis.backends.pandas.trace: execute_database_table_client PandasTable 0:00:00.000085 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable 0:00:00.000362 -DEBUG:ibis.backends.pandas.trace: execute_selection_dataframe Selection -DEBUG:ibis.backends.pandas.trace: main_execute ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope TableColumn -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope PandasTable 0:00:00.000061 -DEBUG:ibis.backends.pandas.trace: execute_table_column_df_or_df_groupby TableColumn -DEBUG:ibis.backends.pandas.trace: execute_table_column_df_or_df_groupby TableColumn 0:00:00.000304 # noqa: E501 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope TableColumn 0:00:00.000584 -DEBUG:ibis.backends.pandas.trace: execute_udf_node ElementWiseVectorizedUDF -DEBUG:ibis.backends.pandas.trace: execute_udf_node ElementWiseVectorizedUDF 0:00:05.019173 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope ElementWiseVectorizedUDF 0:00:05.052604 # noqa: E501 -DEBUG:ibis.backends.pandas.trace: main_execute ElementWiseVectorizedUDF 0:00:05.052819 -DEBUG:ibis.backends.pandas.trace: execute_selection_dataframe Selection 0:00:05.054894 -DEBUG:ibis.backends.pandas.trace: execute_until_in_scope Selection 0:00:05.055662 -DEBUG:ibis.backends.pandas.trace: main_execute Selection 0:00:05.056556 -""" - -from __future__ import annotations - -import functools -import logging -import traceback -from datetime import datetime - -import ibis -from ibis.backends.pandas.dispatcher import TwoLevelDispatcher -from ibis.config import options -from ibis.expr import types as ir - -_logger = logging.getLogger("ibis.backends.pandas.trace") - -# A list of funcs that is traced -_trace_funcs = set() - - -def enable(): - """Enable tracing.""" - if options.pandas is None: - # pandas options haven't been registered yet - force module __getattr__ - ibis.pandas # noqa: B018 - options.pandas.enable_trace = True - logging.getLogger("ibis.backends.pandas.trace").setLevel(logging.DEBUG) - - -def _log_trace(func, start=None): - level = 0 - current_frame = None - - # Increase the current level for each traced function in the stackframe - # This way we can visualize the call stack. 
- for frame, _ in traceback.walk_stack(None): - current_frame = current_frame if current_frame is not None else frame - func_name = frame.f_code.co_name - if func_name in _trace_funcs: - level += 1 - - # We can assume we have 'args' because we only call _log_trace inside - # trace or TraceDispatcher.register - current_op = current_frame.f_locals["args"][0] - - # If the first argument is a Expr, we print its op because it's more - # informative. - if isinstance(current_op, ir.Expr): - current_op = current_op.op() - - _logger.debug( - "%s %s %s %s", - " " * level, - func.__name__, - type(current_op).__qualname__, - f"{datetime.now() - start}" if start else "", - ) - - -def trace(func): - """Return a function decorator that wraps `func` with tracing.""" - _trace_funcs.add(func.__name__) - - @functools.wraps(func) - def traced_func(*args, **kwargs): - # Unfortunately, this function can be called before the `ibis.pandas` - # attribute has ever been accessed, which means the trace configuration - # option might never get registered and will raise an error. Accessing - # the pandas attribute here forces the option initialization - import ibis - - ibis.pandas # noqa: B018 - - if not options.pandas.enable_trace: - return func(*args, **kwargs) - else: - start = datetime.now() - _log_trace(func) - res = func(*args, **kwargs) - _log_trace(func, start) - return res - - return traced_func - - -class TraceTwoLevelDispatcher(TwoLevelDispatcher): - """A Dispatcher that also wraps the registered function with tracing.""" - - def __init__(self, name, doc=None): - super().__init__(name, doc) - - def register(self, *types, **kwargs): - """Register a function with this Dispatcher. - - The function will also be wrapped with tracing information. - """ - - def _(func): - trace_func = trace(func) - TwoLevelDispatcher.register(self, *types, **kwargs)(trace_func) - # return func instead trace_func here so that - # chained register didn't get wrapped multiple - # times - return func - - return _ diff --git a/ibis/backends/pandas/udf.py b/ibis/backends/pandas/udf.py index 561aca6987d6..3168d348f67d 100644 --- a/ibis/backends/pandas/udf.py +++ b/ibis/backends/pandas/udf.py @@ -2,35 +2,7 @@ from __future__ import annotations -import itertools - -import pandas as pd -from pandas.core.groupby import SeriesGroupBy - -import ibis.expr.operations as ops import ibis.legacy.udf.vectorized -from ibis.backends.base import BaseBackend -from ibis.backends.pandas.aggcontext import Transform -from ibis.backends.pandas.dispatch import execute_node, pre_execute -from ibis.backends.pandas.execution.util import get_grouping - - -def create_gens_from_args_groupby(*args: tuple[SeriesGroupBy, ...]): - """Create generators for each of `args` for groupby UDAF. - - Returns a generator that outputs each group. 
- - Parameters - ---------- - *args - A tuple of group by objects - - Returns - ------- - Tuple[Generator] - Generators of group by data - """ - return ((data for _, data in arg) for arg in args) class udf: @@ -49,120 +21,3 @@ def reduction(input_type, output_type): def analytic(input_type, output_type): """Alias for ibis.legacy.udf.vectorized.analytic.""" return ibis.legacy.udf.vectorized.analytic(input_type, output_type) - - -@pre_execute.register(ops.ElementWiseVectorizedUDF) -@pre_execute.register(ops.ElementWiseVectorizedUDF, BaseBackend) -def pre_execute_elementwise_udf(op, *clients, scope=None, **kwargs): - """Register execution rules for elementwise UDFs.""" - input_type = op.input_type - - # definitions - - # Define an execution rule for elementwise operations on a - # grouped Series - nargs = len(input_type) - - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(SeriesGroupBy, nargs)) - ) - def execute_udf_node_groupby(op, *args, **kwargs): - func = op.func - - groupers = [ - grouper - for grouper in (getattr(arg, "grouper", None) for arg in args) - if grouper is not None - ] - - # all grouping keys must be identical - assert all(groupers[0] == grouper for grouper in groupers[1:]) - - # we're performing a scalar operation on grouped column, so - # perform the operation directly on the underlying Series - # and regroup after it's finished - args = [getattr(arg, "obj", arg) for arg in args] - groupings = get_grouping(groupers[0].groupings) - return func(*args).groupby(groupings, group_keys=False) - - # Define an execution rule for a simple elementwise Series - # function - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(pd.Series, nargs)) - ) - @execute_node.register( - ops.ElementWiseVectorizedUDF, *(itertools.repeat(object, nargs)) - ) - def execute_udf_node(op, *args, cache=None, timecontext=None, **kwargs): - # We have rewritten op.func to be a closure enclosing - # the kwargs, and therefore, we do not need to pass - # kwargs here. This is true for all udf execution in this - # file. 
- # See ibis.legacy.udf.vectorized.UserDefinedFunction - - # prevent executing UDFs multiple times on different execution branches - try: - result = cache[(op, timecontext)] - except KeyError: - result = cache[(op, timecontext)] = op.func(*args) - - return result - - return scope - - -@pre_execute.register(ops.AnalyticVectorizedUDF) -@pre_execute.register(ops.AnalyticVectorizedUDF, BaseBackend) -@pre_execute.register(ops.ReductionVectorizedUDF) -@pre_execute.register(ops.ReductionVectorizedUDF, BaseBackend) -def pre_execute_analytic_and_reduction_udf(op, *clients, scope=None, **kwargs): - input_type = op.input_type - nargs = len(input_type) - - # An execution rule to handle analytic and reduction UDFs over - # 1) an ungrouped window, - # 2) an ungrouped Aggregate node, or - # 3) an ungrouped custom aggregation context - @execute_node.register(type(op), *(itertools.repeat(pd.Series, nargs))) - def execute_udaf_node_no_groupby(op, *args, aggcontext, **kwargs): - func = op.func - return aggcontext.agg(args[0], func, *args[1:]) - - # An execution rule to handle analytic and reduction UDFs over - # 1) a grouped window, - # 2) a grouped Aggregate node, or - # 3) a grouped custom aggregation context - @execute_node.register(type(op), *(itertools.repeat(SeriesGroupBy, nargs))) - def execute_udaf_node_groupby(op, *args, aggcontext, **kwargs): - func = op.func - if isinstance(aggcontext, Transform): - # We are aggregating over an unbounded (and GROUPED) window, - # which uses a Transform aggregation context. - # We need to do some pre-processing to func and args so that - # Transform can pull data out of the SeriesGroupBys in args. - - # Construct a generator that yields the next group of data - # for every argument excluding the first (pandas performs - # the iteration for the first argument) for each argument - # that is a SeriesGroupBy. - iters = create_gens_from_args_groupby(*args[1:]) - - # TODO: Unify calling convention here to be more like - # window - def aggregator(first, *rest): - # map(next, *rest) gets the inputs for the next group - # TODO: might be inefficient to do this on every call - return func(first, *map(next, rest)) - - return aggcontext.agg(args[0], aggregator, *iters) - else: - # We are either: - # 1) Aggregating over a bounded window, which uses a Window - # aggregation context - # 2) Aggregating over a custom aggregation context - # 3) Aggregating using an Aggregate node (with GROUPING), which - # uses a Summarize aggregation context - # No pre-processing to be done for any case. 
- return aggcontext.agg(args[0], func, *args[1:]) - - return scope diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 4b0a4f7cc056..be97ad419d92 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -1022,7 +1022,7 @@ def test_quantile( id="covar_pop", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1042,7 +1042,7 @@ def test_quantile( id="covar_samp", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1062,7 +1062,7 @@ def test_quantile( id="corr_pop", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1092,7 +1092,7 @@ def test_quantile( id="corr_samp", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1132,7 +1132,7 @@ def test_quantile( id="covar_pop_bool", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars", "druid"], + ["dask", "polars", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1156,7 +1156,7 @@ def test_quantile( id="corr_pop_bool", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "druid"], + ["dask", "druid"], raises=com.OperationNotDefinedError, ), pytest.mark.notyet( @@ -1325,9 +1325,6 @@ def test_string_quantile(alltypes, func): @pytest.mark.notimpl(["dask"], raises=(AssertionError, NotImplementedError, TypeError)) @pytest.mark.notyet(["polars"], raises=PolarsInvalidOperationError) @pytest.mark.notyet(["datafusion"], raises=Exception, reason="not supported upstream") -@pytest.mark.broken( - ["pandas"], raises=AssertionError, reason="possibly incorrect results" -) @pytest.mark.parametrize( "func", [ @@ -1686,8 +1683,8 @@ def test_grouped_case(backend, con): ["datafusion", "mssql", "polars", "exasol"], raises=com.OperationNotDefinedError ) @pytest.mark.broken( - ["dask", "pandas"], - reason="Dask and Pandas do not windowize this operation correctly", + ["dask"], + reason="Dask does not windowize this operation correctly", raises=AssertionError, ) @pytest.mark.notyet(["impala", "flink"], raises=com.UnsupportedOperationError) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index ea41cbb89956..e2063b94354d 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -342,6 +342,11 @@ def test_unnest_no_nulls(backend): raises=ValueError, reason="ValueError: Do not nest ARRAY types; ARRAY(basetype) handles multi-dimensional arrays of basetype", ) +@pytest.mark.broken( + ["pandas"], + raises=ValueError, + reason="all the input arrays must have same number of dimensions", +) def test_unnest_default_name(backend): array_types = backend.array_types df = array_types.execute() @@ -531,7 +536,7 @@ def test_array_filter(con, input, output): @builtin_array @pytest.mark.notimpl( - ["mssql", "pandas", "polars", "postgres"], + ["mssql", "polars", "postgres"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl(["dask"], raises=com.OperationNotDefinedError) @@ -588,7 +593,7 @@ def test_array_contains(backend, con): ) @builtin_array @pytest.mark.notimpl( - ["dask", "impala", "mssql", "pandas", "polars"], + ["dask", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, 
) def test_array_position(backend, con, a, expected_array): @@ -602,7 +607,7 @@ def test_array_position(backend, con, a, expected_array): @builtin_array @pytest.mark.notimpl( - ["dask", "impala", "mssql", "pandas", "polars"], + ["dask", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( @@ -639,7 +644,7 @@ def test_array_remove(con, a): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "impala", "mssql", "pandas", "polars", "mysql"], + ["dask", "datafusion", "impala", "mssql", "polars", "mysql"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( @@ -693,7 +698,7 @@ def test_array_unique(con, input, expected): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "flink", "impala", "mssql", "pandas", "polars"], + ["dask", "datafusion", "flink", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( @@ -714,7 +719,7 @@ def test_array_sort(con): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "impala", "mssql", "pandas", "polars"], + ["dask", "datafusion", "impala", "mssql", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize( @@ -978,7 +983,7 @@ def test_array_flatten(backend, flatten_data, column, expected): reason="range isn't implemented upstream", raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.parametrize("n", [-2, 0, 2]) def test_range_single_argument(con, n): expr = ibis.range(n) @@ -992,9 +997,7 @@ def test_range_single_argument(con, n): raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize("n", [-2, 0, 2]) -@pytest.mark.notimpl( - ["polars", "flink", "pandas", "dask"], raises=com.OperationNotDefinedError -) +@pytest.mark.notimpl(["polars", "flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.skip("risingwave") def test_range_single_argument_unnest(backend, con, n): expr = ibis.range(n).unnest() @@ -1026,7 +1029,7 @@ def test_range_single_argument_unnest(backend, con, n): reason="range and unnest aren't implemented upstream", raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) def test_range_start_stop_step(con, start, stop, step): expr = ibis.range(start, stop, step) result = con.execute(expr) @@ -1041,7 +1044,7 @@ def test_range_start_stop_step(con, start, stop, step): @pytest.mark.notyet( ["datafusion"], raises=com.OperationNotDefinedError, reason="not supported upstream" ) -@pytest.mark.notimpl(["flink", "pandas", "dask"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["flink", "dask"], raises=com.OperationNotDefinedError) @pytest.mark.never( ["risingwave"], raises=sa.exc.InternalError, @@ -1222,7 +1225,7 @@ def swap(token): ) @timestamp_range_tzinfos @pytest.mark.notimpl( - ["pandas", "dask", "flink", "datafusion"], raises=com.OperationNotDefinedError + ["dask", "flink", "datafusion"], raises=com.OperationNotDefinedError ) def test_timestamp_range(con, start, stop, step, freq, tzinfo): start = start.replace(tzinfo=tzinfo) @@ -1273,7 +1276,7 @@ def test_timestamp_range(con, start, stop, step, freq, tzinfo): ) @timestamp_range_tzinfos @pytest.mark.notimpl( - ["pandas", "dask", "flink", "datafusion"], raises=com.OperationNotDefinedError + ["dask", "flink", "datafusion"], 
raises=com.OperationNotDefinedError ) def test_timestamp_range_zero_step(con, start, stop, step, tzinfo): start = start.replace(tzinfo=tzinfo) @@ -1293,7 +1296,7 @@ def test_repr_timestamp_array(con, monkeypatch): @pytest.mark.notyet( - ["dask", "datafusion", "flink", "pandas", "polars"], + ["dask", "datafusion", "flink", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.broken( diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index e84a5eb97f02..0e2d41fabdf1 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -313,12 +313,14 @@ def test_filter(backend, alltypes, sorted_df, predicate_fn, expected_fn): "druid", "oracle", "exasol", + "pandas", ] ) @pytest.mark.never( ["flink"], reason="Flink engine does not support generic window clause with no order by", ) +# TODO(kszucs): this is not supported at the expression level def test_filter_with_window_op(backend, alltypes, sorted_df): sorted_alltypes = alltypes.order_by("id") table = sorted_alltypes @@ -1154,7 +1156,7 @@ def test_pivot_wider(backend): reason="backend doesn't implement window functions", ) @pytest.mark.notimpl( - ["pandas", "polars"], + ["polars"], raises=com.OperationNotDefinedError, reason="backend doesn't implement ops.WindowFunction", ) @@ -1232,7 +1234,7 @@ def test_distinct_on_keep(backend, on, keep): reason="backend doesn't implement window functions", ) @pytest.mark.notimpl( - ["pandas", "polars"], + ["polars"], raises=com.OperationNotDefinedError, reason="backend doesn't implement ops.WindowFunction", ) diff --git a/ibis/backends/tests/test_interactive.py b/ibis/backends/tests/test_interactive.py index bfa3f6adffe1..704e17019c6e 100644 --- a/ibis/backends/tests/test_interactive.py +++ b/ibis/backends/tests/test_interactive.py @@ -33,6 +33,7 @@ def table(backend): return backend.functional_alltypes +@pytest.mark.notimpl(["pandas"]) def test_interactive_execute_on_repr(table, queries, snapshot): repr(table.bigint_col.sum()) snapshot.assert_match(queries[0], "out.sql") @@ -52,18 +53,21 @@ def test_repr_png_is_not_none_in_not_interactive(table): assert table._repr_png_() is not None +@pytest.mark.notimpl(["pandas"]) def test_default_limit(table, snapshot, queries): repr(table.select("id", "bool_col")) snapshot.assert_match(queries[0], "out.sql") +@pytest.mark.notimpl(["pandas"]) def test_respect_set_limit(table, snapshot, queries): repr(table.select("id", "bool_col").limit(10)) snapshot.assert_match(queries[0], "out.sql") +@pytest.mark.notimpl(["pandas"]) def test_disable_query_limit(table, snapshot, queries): assert ibis.options.sql.default_limit is None diff --git a/ibis/backends/tests/test_param.py b/ibis/backends/tests/test_param.py index 8266186481b2..b7aa81c43dd1 100644 --- a/ibis/backends/tests/test_param.py +++ b/ibis/backends/tests/test_param.py @@ -65,9 +65,7 @@ def test_timestamp_accepts_date_literals(alltypes): assert expr.compile(params=params) is not None -@pytest.mark.notimpl( - ["dask", "impala", "pandas", "pyspark", "druid", "oracle", "exasol"] -) +@pytest.mark.notimpl(["dask", "impala", "pyspark", "druid", "oracle", "exasol"]) @pytest.mark.never( ["mysql", "sqlite", "mssql"], reason="backend will never implement array types" ) diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index d441b39896f2..cde2dc86d1bc 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -1098,7 +1098,7 @@ def test_no_conditional_percent_escape(con, expr): 
@pytest.mark.notimpl( - ["dask", "pandas", "mssql", "oracle", "exasol"], raises=com.OperationNotDefinedError + ["dask", "mssql", "oracle", "exasol"], raises=com.OperationNotDefinedError ) def test_non_match_regex_search_is_false(con): expr = ibis.literal("foo").re_search("bar") diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index baff2a018e18..4878ad46a287 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -1028,7 +1028,6 @@ def convert_to_offset(x): "dask", "impala", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -1644,13 +1643,6 @@ def test_interval_add_cast_column(backend, alltypes, df): ), "%Y%m%d", marks=[ - pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError), - pytest.mark.notimpl( - [ - "pandas", - ], - raises=com.OperationNotDefinedError, - ), pytest.mark.notimpl( [ "pyspark", @@ -2254,7 +2246,7 @@ def test_time_literal(con, backend): @pytest.mark.broken( ["sqlite"], raises=AssertionError, reason="SQLite returns Timedelta from execution" ) -@pytest.mark.notimpl(["dask", "pandas"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["dask"], raises=com.OperationNotDefinedError) @pytest.mark.notyet(["oracle"], raises=sa.exc.DatabaseError) @pytest.mark.parametrize( "microsecond", diff --git a/ibis/backends/tests/test_timecontext.py b/ibis/backends/tests/test_timecontext.py index 72e78065640e..50b181728d7e 100644 --- a/ibis/backends/tests/test_timecontext.py +++ b/ibis/backends/tests/test_timecontext.py @@ -54,7 +54,7 @@ def filter_by_time_context(df, context): ) -@pytest.mark.notimpl(["dask", "duckdb"]) +@pytest.mark.notimpl(["dask", "duckdb", "pandas"]) @pytest.mark.notimpl( ["flink"], raises=com.OperationNotDefinedError, @@ -91,7 +91,7 @@ def test_context_adjustment_window_udf(backend, alltypes, context, window, monke backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl(["dask", "duckdb"]) +@pytest.mark.notimpl(["dask", "duckdb", "pandas"]) @pytest.mark.broken( # TODO (mehmet): Check with the team. 
["flink"], diff --git a/ibis/backends/tests/test_vectorized_udf.py b/ibis/backends/tests/test_vectorized_udf.py index f130b5b60154..c1c85326f52e 100644 --- a/ibis/backends/tests/test_vectorized_udf.py +++ b/ibis/backends/tests/test_vectorized_udf.py @@ -570,7 +570,8 @@ def test_elementwise_udf_named_destruct(udf_alltypes): add_one_struct_udf = create_add_one_struct_udf( result_formatter=lambda v1, v2: (v1, v2) ) - with pytest.raises(com.IbisTypeError, match=r"Unable to infer"): + msg = "Duplicate column name 'new_struct' in result set" + with pytest.raises(com.IntegrityError, match=msg): udf_alltypes.mutate( new_struct=add_one_struct_udf(udf_alltypes["double_col"]).destructure() ) diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py index 28ae24cfd19c..e7968831330f 100644 --- a/ibis/backends/tests/test_window.py +++ b/ibis/backends/tests/test_window.py @@ -247,7 +247,6 @@ def calc_zscore(s): id="row_number", marks=[ pytest.mark.notimpl(["dask"], raises=NotImplementedError), - pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError), ], ), param( @@ -469,7 +468,6 @@ def test_ungrouped_bounded_expanding_window( ) @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["dask"], raises=NotImplementedError) -@pytest.mark.notimpl(["pandas"], raises=AssertionError) @pytest.mark.notimpl( ["flink"], raises=com.UnsupportedOperationError, @@ -652,7 +650,7 @@ def test_grouped_unbounded_window( ], ) @pytest.mark.broken(["snowflake"], raises=AssertionError) -@pytest.mark.broken(["dask", "pandas", "mssql"], raises=AssertionError) +@pytest.mark.broken(["dask", "mssql"], raises=AssertionError) @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( ["flink"], @@ -683,7 +681,7 @@ def test_simple_ungrouped_unbound_following_window( reason="OVER RANGE FOLLOWING windows are not supported in Flink yet", ) @pytest.mark.notimpl( - ["pandas", "dask"], + ["dask"], raises=NotImplementedError, reason="support scalar sorting keys are not yet implemented", ) @@ -719,7 +717,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): True, id="ordered-mean", marks=[ - pytest.mark.broken(["pandas"], raises=AssertionError), pytest.mark.notimpl( ["dask"], raises=NotImplementedError, @@ -796,7 +793,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): ], raises=com.OperationNotDefinedError, ), - pytest.mark.broken(["pandas"], raises=AssertionError), pytest.mark.broken( ["dask"], raises=ValueError, @@ -963,11 +959,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): ], raises=com.OperationNotDefinedError, ), - pytest.mark.notimpl( - ["pandas"], - raises=RuntimeWarning, - reason="invalid value encountered in divide", - ), pytest.mark.broken( ["dask"], raises=ValueError, @@ -1042,11 +1033,6 @@ def test_ungrouped_unbounded_window( ["impala"], raises=ImpalaHiveServer2Error, reason="limited RANGE support" ) @pytest.mark.notimpl(["dask"], raises=NotImplementedError) -@pytest.mark.notimpl( - ["pandas"], - raises=NotImplementedError, - reason="The pandas backend only implements range windows with temporal ordering keys", -) @pytest.mark.notimpl( ["flink"], raises=com.UnsupportedOperationError, @@ -1295,9 +1281,6 @@ def test_range_expression_bounds(backend): reason="clickhouse doesn't implement percent_rank", raises=com.OperationNotDefinedError, ) -@pytest.mark.broken( - ["pandas"], reason="missing column during execution", raises=KeyError -) @pytest.mark.broken( ["mssql"], 
reason="lack of support for booleans", raises=sa.exc.ProgrammingError ) @@ -1328,7 +1311,7 @@ def test_rank_followed_by_over_call_merge_frames(backend, alltypes, df): @pytest.mark.notyet( - ["pandas", "dask"], + ["dask"], reason="multiple ordering keys in a window function not supported for ranking", raises=ValueError, ) @@ -1342,6 +1325,11 @@ def test_rank_followed_by_over_call_merge_frames(backend, alltypes, df): @pytest.mark.broken( ["pyspark"], reason="pyspark requires CURRENT ROW", raises=PySparkAnalysisException ) +@pytest.mark.broken( + ["pandas"], + raises=TypeError, + reason="'<' not supported between instances of 'bool' and 'NoneType'", +) @pytest.mark.notimpl( ["risingwave"], raises=sa.exc.InternalError, diff --git a/ibis/expr/operations/reductions.py b/ibis/expr/operations/reductions.py index 2a85dbfcbab5..597f42107f35 100644 --- a/ibis/expr/operations/reductions.py +++ b/ibis/expr/operations/reductions.py @@ -17,6 +17,7 @@ class Reduction(Value): shape = ds.scalar + # TODO(kszucs): remove this @property def __window_op__(self): return self diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py index 4f83af05d320..b18fc2bf106e 100644 --- a/ibis/formats/pandas.py +++ b/ibis/formats/pandas.py @@ -2,6 +2,7 @@ import contextlib import datetime +import decimal import warnings from importlib.util import find_spec as _find_spec @@ -117,8 +118,10 @@ def convert_table(cls, df, schema): "schema column count does not match input data column count" ) - for (name, series), dtype in zip(df.items(), schema.types): - df[name] = cls.convert_column(series, dtype) + columns = [] + for (_, series), dtype in zip(df.items(), schema.types): + columns.append(cls.convert_column(series, dtype)) + df = pd.concat(columns, axis=1) # return data with the schema's columns which may be different than the # input columns @@ -250,6 +253,23 @@ def convert_Interval(cls, s, dtype, pandas_type): def convert_String(cls, s, dtype, pandas_type): return s.astype(pandas_type, errors="ignore") + @classmethod + def convert_Decimal(cls, s, dtype, pandas_type): + context = decimal.Context(prec=dtype.precision) + + if dtype.scale is None: + normalize = context.create_decimal + else: + exponent = decimal.Decimal(10) ** -dtype.scale + + def normalize(x, exponent=exponent): + try: + return context.create_decimal(x).quantize(exponent) + except decimal.InvalidOperation: + return x + + return s.map(normalize, na_action="ignore").astype(pandas_type) + @classmethod def convert_UUID(cls, s, dtype, pandas_type): return s.map(cls.get_element_converter(dtype), na_action="ignore") diff --git a/ibis/formats/tests/test_dask.py b/ibis/formats/tests/test_dask.py index 89ce6c59198a..2dbe9b61ad7d 100644 --- a/ibis/formats/tests/test_dask.py +++ b/ibis/formats/tests/test_dask.py @@ -199,12 +199,3 @@ def test_schema_infer_exhaustive_dataframe(): ] assert DaskData.infer_table(df) == ibis.schema(expected) - - -def test_convert_dataframe_with_timezone(): - data = {"time": pd.date_range("2018-01-01", "2018-01-02", freq="H")} - df = dd.from_pandas(pd.DataFrame(data), npartitions=2) - expected = df.assign(time=df.time.dt.tz_localize("EST")) - desired_schema = ibis.schema([("time", 'timestamp("EST")')]) - result = DaskData.convert_table(df.copy(), desired_schema) - tm.assert_frame_equal(result.compute(), expected.compute())