From a6ef75f586038364654d8bfe9a29d3c9e220bb0e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 11 Sep 2023 12:28:16 +0200 Subject: [PATCH] if it accepts expression, it should accept column --- .../API_specification/dataframe_api/_types.py | 8 ---- .../dataframe_api/dataframe_object.py | 10 ++-- .../dataframe_api/eagercolumn_object.py | 4 +- .../dataframe_api/eagerframe_object.py | 18 +++---- .../dataframe_api/expression_object.py | 47 +++++++++---------- spec/conf.py | 1 - spec/design_topics/python_builtin_types.md | 2 +- 7 files changed, 40 insertions(+), 50 deletions(-) diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index db2bb49a..2b6d7d08 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -17,17 +17,9 @@ TypeVar, Union, Protocol, - TYPE_CHECKING, - TypeAlias ) from enum import Enum -if TYPE_CHECKING: - from .expression_object import Expression - from .eagercolumn_object import EagerColumn - - IntoExpression: TypeAlias = Expression | EagerColumn - # Type alias: Mypy needs Any, but for readability we need to make clear this # is a Python scalar (i.e., an instance of `bool`, `int`, `float`, `str`, etc.) Scalar = Any diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index e7092678..315e448d 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -8,7 +8,7 @@ from .eagerframe_object import EagerFrame from .eagercolumn_object import EagerColumn from .groupby_object import GroupBy - from ._types import NullType, Scalar, IntoExpression + from ._types import NullType, Scalar __all__ = ["DataFrame"] @@ -92,7 +92,7 @@ def groupby(self, *keys: str) -> GroupBy: """ ... - def select(self, *names: str | Expression) -> DataFrame: + def select(self, *names: str | Expression | EagerColumn[Any]) -> DataFrame: """ Select multiple columns, either by name or by expressions. @@ -137,7 +137,7 @@ def slice_rows( """ ... - def filter(self, mask: IntoExpression) -> DataFrame: + def filter(self, mask: Expression | EagerColumn[bool]) -> DataFrame: """ Select a subset of rows corresponding to a mask. @@ -216,7 +216,7 @@ def update_columns(self, *columns: Expression | EagerColumn[Any]) -> DataFrame: Parameters ---------- - columns : Expression, EagerColumn, or sequence of either + columns : Expression | EagerColumn Column(s) to update. If updating multiple columns, they must all have different names. @@ -273,7 +273,7 @@ def column_names(self) -> list[str]: def sort( self, - *keys: str | Expression, + *keys: str | Expression | EagerColumn[Any], ascending: Sequence[bool] | bool = True, nulls_position: Literal['first', 'last'] = 'last', ) -> DataFrame: diff --git a/spec/API_specification/dataframe_api/eagercolumn_object.py b/spec/API_specification/dataframe_api/eagercolumn_object.py index f4a101ee..58dbe8e2 100644 --- a/spec/API_specification/dataframe_api/eagercolumn_object.py +++ b/spec/API_specification/dataframe_api/eagercolumn_object.py @@ -17,7 +17,7 @@ class EagerColumn(Generic[DType]): """ EagerColumn object - Instantiate via :meth:`EagerFrame.get_column_by_name`. + Instantiate via :meth:`EagerFrame.get_column`. If you need to use this within the context of a :class`DataFrame` operation (such as `:meth:`DataFrame.filter`) then you can convert it to an expression @@ -106,7 +106,7 @@ def slice_rows( ... - def filter(self: EagerColumn[DType], mask: EagerColumn[Bool]) -> EagerColumn[DType]: + def filter(self: EagerColumn[DType], mask: Expression | EagerColumn[Bool]) -> EagerColumn[DType]: """ Select a subset of rows corresponding to a mask. diff --git a/spec/API_specification/dataframe_api/eagerframe_object.py b/spec/API_specification/dataframe_api/eagerframe_object.py index 2c810b63..55095364 100644 --- a/spec/API_specification/dataframe_api/eagerframe_object.py +++ b/spec/API_specification/dataframe_api/eagerframe_object.py @@ -8,7 +8,7 @@ from .expression_object import Expression from .dataframe_object import DataFrame from .groupby_object import GroupBy - from ._types import NullType, Scalar, IntoExpression + from ._types import NullType, Scalar __all__ = ["EagerFrame"] @@ -89,7 +89,7 @@ def get_column(self, name: str, /) -> EagerColumn[Any]: """ ... - def select(self, *columns: str | Expression) -> EagerFrame: + def select(self, *columns: str | Expression | EagerColumn[Any]) -> EagerFrame: """ Select multiple columns by name. @@ -115,7 +115,7 @@ def select(self, *columns: str | Expression) -> EagerFrame: """ ... - def get_rows(self, indices: Expression) -> EagerFrame: + def get_rows(self, indices: Expression | EagerColumn[Any]) -> EagerFrame: """ Select a subset of rows, similar to `ndarray.take`. @@ -148,7 +148,7 @@ def slice_rows( """ ... - def filter(self, mask: IntoExpression) -> EagerFrame: + def filter(self, mask: Expression | EagerColumn[bool]) -> EagerFrame: """ Select a subset of rows corresponding to a mask. @@ -173,7 +173,7 @@ def insert_columns(self, *columns: Expression | EagerColumn[Any]) -> EagerFrame: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + new_column = df.get_column('a') + 1 df = df.insert_columns(new_column.rename('a_plus_1')) If you need to insert the column at a different location, combine with @@ -181,7 +181,7 @@ def insert_columns(self, *columns: Expression | EagerColumn[Any]) -> EagerFrame: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + new_column = df.get_column('a') + 1 new_columns_names = ['a_plus_1'] + df.get_column_names() df = df.insert_columns(new_column.rename('a_plus_1')) df = df.select(new_column_names) @@ -203,12 +203,12 @@ def update_columns(self, *columns: Expression | EagerColumn[Any]) -> EagerFrame: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + new_column = df.get_column('a') + 1 df = df.update_column(new_column.rename('b').to_expression()) Parameters ---------- - columns : IntoExpression | Sequence[IntoExpression] + columns : Expression | EagerColumn Column(s) to update. If updating multiple columns, they must all have different names. @@ -265,7 +265,7 @@ def column_names(self) -> list[str]: def sort( self, - *keys: str | Expression, + *keys: str | Expression | EagerColumn[Any], ascending: Sequence[bool] | bool = True, nulls_position: Literal['first', 'last'] = 'last', ) -> EagerFrame: diff --git a/spec/API_specification/dataframe_api/expression_object.py b/spec/API_specification/dataframe_api/expression_object.py index 8bc6584e..adb37164 100644 --- a/spec/API_specification/dataframe_api/expression_object.py +++ b/spec/API_specification/dataframe_api/expression_object.py @@ -4,9 +4,8 @@ if TYPE_CHECKING: - from ._types import DType - from . import Bool from ._types import NullType, Scalar + from .eagercolumn_object import EagerColumn __all__ = ['Expression'] @@ -92,7 +91,7 @@ def len(self) -> Expression: Return the number of rows. """ - def get_rows(self: Expression, indices: Expression) -> Expression: + def get_rows(self, indices: Expression | EagerColumn[Any]) -> Expression: """ Select a subset of rows, similar to `ndarray.take`. @@ -104,7 +103,7 @@ def get_rows(self: Expression, indices: Expression) -> Expression: ... def slice_rows( - self: Expression, start: int | None, stop: int | None, step: int | None + self, start: int | None, stop: int | None, step: int | None ) -> Expression: """ Select a subset of rows corresponding to a slice. @@ -121,7 +120,7 @@ def slice_rows( """ ... - def filter(self, mask: Expression) -> Expression: + def filter(self, mask: Expression | EagerColumn[bool]) -> Expression: """ Select a subset of rows corresponding to a mask. @@ -225,7 +224,7 @@ def __eq__(self, other: Expression | Scalar) -> Expression: # type: ignore[over Expression """ - def __ne__(self: Expression, other: Expression | Scalar) -> Expression: # type: ignore[override] + def __ne__(self, other: Expression | Scalar) -> Expression: # type: ignore[override] """ Compare for non-equality. @@ -243,7 +242,7 @@ def __ne__(self: Expression, other: Expression | Scalar) -> Expression: # type: Expression """ - def __ge__(self: Expression, other: Expression | Scalar) -> Expression: + def __ge__(self, other: Expression | Scalar) -> Expression: """ Compare for "greater than or equal to" `other`. @@ -259,7 +258,7 @@ def __ge__(self: Expression, other: Expression | Scalar) -> Expression: Expression """ - def __gt__(self: Expression, other: Expression | Scalar) -> Expression: + def __gt__(self, other: Expression | Scalar) -> Expression: """ Compare for "greater than" `other`. @@ -275,7 +274,7 @@ def __gt__(self: Expression, other: Expression | Scalar) -> Expression: Expression """ - def __le__(self: Expression, other: Expression | Scalar) -> Expression: + def __le__(self, other: Expression | Scalar) -> Expression: """ Compare for "less than or equal to" `other`. @@ -291,7 +290,7 @@ def __le__(self: Expression, other: Expression | Scalar) -> Expression: Expression """ - def __lt__(self: Expression, other: Expression | Scalar) -> Expression: + def __lt__(self, other: Expression | Scalar) -> Expression: """ Compare for "less than" `other`. @@ -307,7 +306,7 @@ def __lt__(self: Expression, other: Expression | Scalar) -> Expression: Expression """ - def __and__(self: Expression, other: Expression | bool) -> Expression: + def __and__(self, other: Expression | bool) -> Expression: """ Apply logical 'and' to `other` expression (or scalar) and this expression. @@ -328,7 +327,7 @@ def __and__(self: Expression, other: Expression | bool) -> Expression: If `self` or `other` is not boolean. """ - def __or__(self: Expression, other: Expression | bool) -> Expression: + def __or__(self, other: Expression | bool) -> Expression: """ Apply logical 'or' to `other` expression (or scalar) and this expression. @@ -349,7 +348,7 @@ def __or__(self: Expression, other: Expression | bool) -> Expression: If `self` or `other` is not boolean. """ - def __add__(self: Expression, other: Expression | Scalar) -> Expression: + def __add__(self, other: Expression | Scalar) -> Expression: """ Add `other` expression or scalar to this expression. @@ -365,7 +364,7 @@ def __add__(self: Expression, other: Expression | Scalar) -> Expression: Expression """ - def __sub__(self: Expression, other: Expression | Scalar) -> Expression: + def __sub__(self, other: Expression | Scalar) -> Expression: """ Subtract `other` expression or scalar from this expression. @@ -481,7 +480,7 @@ def __divmod__(self, other: Expression | Scalar) -> tuple[Expression, Expression tuple[Expression, Expression] """ - def __invert__(self: Expression) -> Expression: + def __invert__(self) -> Expression: """ Invert truthiness of (boolean) elements. @@ -491,7 +490,7 @@ def __invert__(self: Expression) -> Expression: If any of the expression's expressions is not boolean. """ - def any(self: Expression, *, skip_nulls: bool = True) -> Expression: + def any(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a bool. @@ -501,7 +500,7 @@ def any(self: Expression, *, skip_nulls: bool = True) -> Expression: If expression is not boolean. """ - def all(self: Expression, *, skip_nulls: bool = True) -> Expression: + def all(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a bool. @@ -595,26 +594,26 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Expres Whether to skip null values. """ - def cumulative_max(self: Expression) -> Expression: + def cumulative_max(self) -> Expression: """ Reduction returns a expression. Any data type that supports comparisons must be supported. The returned value has the same dtype as the expression. """ - def cumulative_min(self: Expression) -> Expression: + def cumulative_min(self) -> Expression: """ Reduction returns a expression. Any data type that supports comparisons must be supported. The returned value has the same dtype as the expression. """ - def cumulative_sum(self: Expression) -> Expression: + def cumulative_sum(self) -> Expression: """ Reduction returns a expression. Must be supported for numerical and datetime data types. The returned value has the same dtype as the expression. """ - def cumulative_prod(self: Expression) -> Expression: + def cumulative_prod(self) -> Expression: """ Reduction returns a expression. Must be supported for numerical and datetime data types. The returned value has the same dtype as the @@ -659,7 +658,7 @@ def is_nan(self) -> Expression: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self: Expression, values: Expression) -> Expression: + def is_in(self, values: Expression | EagerColumn[Any]) -> Expression: """ Indicate whether the value at each row matches any value in `values`. @@ -698,7 +697,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Expression: """ ... - def fill_nan(self: Expression, value: float | NullType, /) -> Expression: + def fill_nan(self, value: float | NullType, /) -> Expression: """ Fill floating point ``nan`` values with the given fill value. @@ -712,7 +711,7 @@ def fill_nan(self: Expression, value: float | NullType, /) -> Expression: """ ... - def fill_null(self: Expression, value: Scalar, /) -> Expression: + def fill_null(self, value: Scalar, /) -> Expression: """ Fill null values with the given fill value. diff --git a/spec/conf.py b/spec/conf.py index 98d1eedf..61f32560 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -85,7 +85,6 @@ ('py:class', 'optional'), ('py:class', 'NullType'), ('py:class', 'GroupBy'), - ('py:class', 'IntoExpression'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future # need based on dataframe API aliases. diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 3bc6956b..18e638c7 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -21,7 +21,7 @@ class EagerColumn: def mean(self, skip_nulls: bool = True) -> float | NullType: ... -larger = df2 > df1.get_column_by_name('foo').mean() +larger = df2 > df1.get_column('foo').mean() ``` For a GPU dataframe library, it is desirable for all data to reside on the GPU,