diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 4829eb48..adf9dc40 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -5,7 +5,9 @@ from typing import Mapping, Sequence, Any, Literal, TYPE_CHECKING -from .column_object import * +from .permissivecolumn_object import PermissiveColumn +from .permissiveframe_object import PermissiveFrame +from .column_object import Column from .dataframe_object import DataFrame from .groupby_object import * from .dtypes import * @@ -16,11 +18,17 @@ __all__ = [ "__dataframe_api_version__", "DataFrame", + "PermissiveFrame", + "PermissiveColumn", "Column", + "GroupBy", "column_from_sequence", "column_from_1d_array", + "col", "concat", "dataframe_from_dict", + "sorted_indices", + "unique_indices", "dataframe_from_2d_array", "is_null", "null", @@ -40,6 +48,8 @@ "Duration", "String", "is_dtype", + "any_rowwise", + "all_rowwise", ] @@ -50,6 +60,21 @@ implementation of the dataframe API standard. """ +def col(name: str) -> Column: + """ + Instantiate an Column which selects given column by name. + + For example, to select column 'species' and then use it to filter + a DataFrame, you could do: + + .. code-block::python + + df: DataFrame + namespace = df.__dataframe_namespace__() + df.filter(namespace.col('species') == 'setosa') + """ + ... + def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ Concatenate DataFrames vertically. @@ -70,9 +95,9 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ ... -def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> Column: +def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> PermissiveColumn: """ - Construct Column from sequence of elements. + Construct PermissiveColumn from sequence of elements. 
Parameters ---------- @@ -87,18 +112,18 @@ def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = ' Returns ------- - Column + PermissiveColumn """ ... -def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame: +def dataframe_from_dict(data: Mapping[str, PermissiveColumn]) -> DataFrame: """ - Construct DataFrame from map of column names to Columns. + Construct DataFrame from map of column names to PermissiveColumns. Parameters ---------- - data : Mapping[str, Column] - Column must be of the corresponding type of the DataFrame. + data : Mapping[str, PermissiveColumn] + PermissiveColumn must be of the corresponding type of the DataFrame. For example, it is only supported to build a ``LibraryXDataFrame`` using ``LibraryXColumn`` instances. @@ -116,9 +141,9 @@ def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame: ... -def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column: +def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> PermissiveColumn: """ - Construct Column from 1D array. + Construct PermissiveColumn from 1D array. See `dataframe_from_2d_array` for related 2D function. @@ -137,7 +162,7 @@ def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column: Returns ------- - Column + PermissiveColumn """ ... @@ -166,11 +191,117 @@ def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping """ ... +def any_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column: + """ + Reduction returns a Column. + + Differs from ``DataFrame.any`` in that the reduction happens + for each row, rather than for each column. + + Parameters + ---------- + columns : str | Column | PermissiveColumn + Columns to consider. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... 
+ +def all_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column: + """ + Reduction returns a Column. + + Differs from ``DataFrame.all`` in that the reduction happens + for each row, rather than for each column. + + Parameters + ---------- + columns : str | Column | PermissiveColumn + Columns to consider. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + +def sorted_indices( + *columns: str | Column | PermissiveColumn, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal['first', 'last'] = 'last', +) -> Column: + """ + Return row numbers which would sort according to given columns. + + If you need to sort the DataFrame, use :meth:`DataFrame.sort`. + + Parameters + ---------- + columns : str | Column | PermissiveColumn + Column(s) to sort by. + ascending : Sequence[bool] or bool + If `True`, sort by all keys in ascending order. + If `False`, sort by all keys in descending order. + If a sequence, it must be the same length as `columns`, + and determines the direction with which to use each + key to sort by. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + Column + + Raises + ------ + ValueError + If `columns` and `ascending` are sequences of different lengths. + """ + ... + + +def unique_indices( + *columns: str | Column | PermissiveColumn, + skip_nulls: bool = True, + ) -> Column: + """ + Return indices corresponding to unique values across selected columns. + + Parameters + ---------- + columns : str | Column | PermissiveColumn + Columns to consider when finding unique values. + + Returns + ------- + Column + Indices corresponding to unique values. + + Notes + ----- + There are no ordering guarantees. 
In particular, if there are multiple + indices corresponding to the same unique value(s), there is no guarantee + about which one will appear in the result. + If the original column(s) contain multiple `'NaN'` values, then + only a single index corresponding to those values will be returned. + Likewise for null values (if ``skip_nulls=False``). + """ + ... + + + class null: """ A `null` object to represent missing data. - ``null`` is a scalar, and may be used when constructing a `Column` from a + ``null`` is a scalar, and may be used when constructing a `PermissiveColumn` from a Python sequence with `column_from_sequence`. It does not support ``is``, ``==`` or ``bool``. diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 90b894cf..c745e3af 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -18,7 +18,9 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame as DataFrameType + from .permissivecolumn_object import PermissiveColumn as PermissiveColumnType from .column_object import Column as ColumnType + from .permissiveframe_object import PermissiveFrame as PermissiveFrameType if TYPE_CHECKING: from .dtypes import ( @@ -51,6 +53,9 @@ class Namespace(Protocol): __dataframe_api_version__: str + @staticmethod + def col(name: str) -> ColumnType: ... + @staticmethod def DataFrame() -> DataFrameType: ... @@ -59,6 +64,14 @@ def DataFrame() -> DataFrameType: def Column() -> ColumnType: ... + @staticmethod + def PermissiveFrame() -> PermissiveFrameType: + ... + + @staticmethod + def PermissiveColumn() -> PermissiveColumnType: + ... + @staticmethod def Int64() -> Int64: ... @@ -123,22 +136,21 @@ def concat(dataframes: Sequence[DataFrameType]) -> DataFrameType: def column_from_sequence( sequence: Sequence[Any], *, - dtype: Any, + dtype: DType, name: str = "", - api_version: str | None = None, - ) -> ColumnType: + ) -> PermissiveColumnType: ... 
@staticmethod def dataframe_from_dict( - data: Mapping[str, ColumnType], *, api_version: str | None = None + data: Mapping[str, PermissiveColumnType] ) -> DataFrameType: ... @staticmethod def column_from_1d_array( - array: Any, *, dtype: Any, name: str = "", api_version: str | None = None - ) -> ColumnType: + array: Any, *, dtype: DType, name: str = "" + ) -> PermissiveColumnType: ... @staticmethod @@ -146,8 +158,7 @@ def dataframe_from_2d_array( array: Any, *, names: Sequence[str], - dtypes: Mapping[str, Any], - api_version: str | None = None, + dtypes: Mapping[str, DType], ) -> DataFrameType: ... @@ -156,7 +167,7 @@ def is_null(value: object, /) -> bool: ... @staticmethod - def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool: + def is_dtype(dtype: DType, kind: str | tuple[str, ...]) -> bool: ... @@ -169,7 +180,7 @@ def __dataframe_consortium_standard__( class SupportsColumnAPI(Protocol): def __column_consortium_standard__( self, *, api_version: str | None = None - ) -> ColumnType: + ) -> PermissiveColumnType: ... diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index ff4d7ba3..a906ba5d 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -2,8 +2,10 @@ from typing import Any,NoReturn, TYPE_CHECKING, Literal, Generic + if TYPE_CHECKING: - from ._types import NullType, Scalar, DType, Namespace + from ._types import NullType, Scalar, Namespace + from .permissivecolumn_object import PermissiveColumn __all__ = ['Column'] @@ -11,12 +13,57 @@ class Column: """ - Column object + Column object, which maps a DataFrame to a column derived from it. 
+ + Not meant to be instantiated directly - instead, use one of: + + - :func:`dataframe_api.col` + - :func:`dataframe_api.any_rowwise` + - :func:`dataframe_api.all_rowwise` + - :func:`dataframe_api.sorted_indices` + - :func:`dataframe_api.unique_indices` + + A column is lazy and only takes effect when passed to one of the following: - Note that this column object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via + - :meth:`DataFrame.select` + - :meth:`DataFrame.assign` + - :meth:`DataFrame.filter` + - :meth:`PermissiveFrame.select` + - :meth:`PermissiveFrame.assign` + - :meth:`PermissiveFrame.filter` + For example: + + .. code-block::python + + df: DataFrame + col = df.__dataframe_namespace__().col + df = df.filter(col('a') > col('b')*2) + + acts like (pandas syntax): + + .. code-block::python + + df: pd.DataFrame + col_a = lambda df: df.loc[:, 'a'] + col_b = lambda df: df.loc[:, 'b'] + col_b_doubled = lambda df: col_b(df) * 2 + mask = lambda df: col_a(df) > col_b_doubled(df) + df = df.loc[mask(df)] + + Notes + ----- + Binary operations between columns require that they resolve to columns of the + same length (unless one of them is of length-1, in which case it is broadcast to + the same length as the other one). + For example, the output column resulting from + + .. code-block::python + + col('a') - col('a').mean() + + will be the same length as column `'a'` (where its mean will have been subtracted from + each element). """ def __column_namespace__(self) -> Namespace: @@ -25,7 +72,7 @@ def __column_namespace__(self) -> Namespace: Returns ------- - namespace: Any + namespace: Namespace An object representing the dataframe API namespace. It should have every top-level function defined in the specification as an attribute. 
It may contain other public names as well, but it is @@ -33,44 +80,46 @@ def __column_namespace__(self) -> Namespace: specification. """ - - @property - def column(self) -> Any: - """ - Return underlying (not-necessarily-Standard-compliant) column. - If a library only implements the Standard, then this can return `self`. + def root_names(self) -> list[str]: """ - ... - - @property - def name(self) -> str: - """Return name of column.""" + Subset of column names to consider from original dataframe when building new column. - def __len__(self) -> int: - """ - Return the number of rows. - """ + Returns + ------- + list[str] + Column names - def __iter__(self) -> NoReturn: + Examples + -------- + >>> col('a').root_names() + ['a'] + >>> ((col('a') + 1) > col('b')).root_names() + ['a', 'b'] + >>> any_rowwise('a', 'b').root_names() + ['a', 'b'] """ - Iterate over elements. - - This is intentionally "poisoned" to discourage inefficient code patterns. - Raises - ------ - NotImplementedError + def output_name(self) -> str: + """ + Name of resulting column. + + Examples + -------- + >>> col('a').output_name() + 'a' + >>> col('a').rename('b').output_name() + 'b' + >>> df.select(col('a').rename('b')).column_names + ['b'] """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") - @property - def dtype(self) -> Any: + def len(self) -> Column: """ - Return data type of column. + Return the number of rows. """ - def get_rows(self: Column, indices: Column) -> Column: + def get_rows(self, indices: Column | PermissiveColumn) -> Column: """ Select a subset of rows, similar to `ndarray.take`. @@ -81,9 +130,8 @@ def get_rows(self: Column, indices: Column) -> Column: """ ... - def slice_rows( - self: Column, start: int | None, stop: int | None, step: int | None + self, start: int | None, stop: int | None, step: int | None ) -> Column: """ Select a subset of rows corresponding to a slice. @@ -100,8 +148,7 @@ def slice_rows( """ ... 
- - def filter(self: Column, mask: Column) -> Column: + def filter(self, mask: Column | PermissiveColumn) -> Column: """ Select a subset of rows corresponding to a mask. @@ -112,16 +159,10 @@ def filter(self: Column, mask: Column) -> Column: Returns ------- Column - - Notes - ----- - Some participants preferred a weaker type Arraylike[bool] for mask, - where 'Arraylike' denotes an object adhering to the Array API standard. """ ... - - def get_value(self, row_number: int) -> Scalar: + def get_value(self, row_number: int) -> Column: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -132,9 +173,7 @@ def get_value(self, row_number: int) -> Scalar: Returns ------- - Scalar - Depends on the dtype of the Column, and may vary - across implementations. + Column """ ... @@ -148,7 +187,7 @@ def sort( Sort column. If you need the indices which would sort the column, - use :meth:`sorted_indices`. + use :func:`sorted_indices`. Parameters ---------- @@ -176,7 +215,7 @@ def sorted_indices( """ Return row numbers which would sort column. - If you need to sort the Column, use :meth:`sort`. + If you need to sort the column, use :meth:`sort`. Parameters ---------- @@ -204,7 +243,7 @@ def __eq__(self, other: Column | Scalar) -> Column: # type: ignore[override] Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -213,7 +252,7 @@ def __eq__(self, other: Column | Scalar) -> Column: # type: ignore[override] Column """ - def __ne__(self: Column, other: Column | Scalar) -> Column: # type: ignore[override] + def __ne__(self, other: Column | Scalar) -> Column: # type: ignore[override] """ Compare for non-equality. 
@@ -222,7 +261,7 @@ def __ne__(self: Column, other: Column | Scalar) -> Column: # type: ignore[over Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -231,14 +270,14 @@ def __ne__(self: Column, other: Column | Scalar) -> Column: # type: ignore[over Column """ - def __ge__(self: Column, other: Column | Scalar) -> Column: + def __ge__(self, other: Column | Scalar) -> Column: """ Compare for "greater than or equal to" `other`. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -247,14 +286,14 @@ def __ge__(self: Column, other: Column | Scalar) -> Column: Column """ - def __gt__(self: Column, other: Column | Scalar) -> Column: + def __gt__(self, other: Column | Scalar) -> Column: """ Compare for "greater than" `other`. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -263,14 +302,14 @@ def __gt__(self: Column, other: Column | Scalar) -> Column: Column """ - def __le__(self: Column, other: Column | Scalar) -> Column: + def __le__(self, other: Column | Scalar) -> Column: """ Compare for "less than or equal to" `other`. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. 
@@ -279,14 +318,14 @@ def __le__(self: Column, other: Column | Scalar) -> Column: Column """ - def __lt__(self: Column, other: Column | Scalar) -> Column: + def __lt__(self, other: Column | Scalar) -> Column: """ Compare for "less than" `other`. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -295,16 +334,16 @@ def __lt__(self: Column, other: Column | Scalar) -> Column: Column """ - def __and__(self: Column, other: Column | bool) -> Column: + def __and__(self, other: Column | bool) -> Column: """ - Apply logical 'and' to `other` Column (or scalar) and this Column. + Apply logical 'and' to `other` column (or scalar) and this column. Nulls should follow Kleene Logic. Parameters ---------- other : Column or bool - If Column, must have same length. + If Column, must have same length or have length 1. Returns ------- @@ -316,16 +355,16 @@ def __and__(self: Column, other: Column | bool) -> Column: If `self` or `other` is not boolean. """ - def __or__(self: Column, other: Column | bool) -> Column: + def __or__(self, other: Column | bool) -> Column: """ - Apply logical 'or' to `other` Column (or scalar) and this column. + Apply logical 'or' to `other` column (or scalar) and this column. Nulls should follow Kleene Logic. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. Returns ------- @@ -337,14 +376,14 @@ def __or__(self: Column, other: Column | bool) -> Column: If `self` or `other` is not boolean. """ - def __add__(self: Column, other: Column | Scalar) -> Column: + def __add__(self, other: Column | Scalar) -> Column: """ Add `other` column or scalar to this column. Parameters ---------- other : Column or Scalar - If Column, must have same length. 
+ If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -353,14 +392,14 @@ def __add__(self: Column, other: Column | Scalar) -> Column: Column """ - def __sub__(self: Column, other: Column | Scalar) -> Column: + def __sub__(self, other: Column | Scalar) -> Column: """ Subtract `other` column or scalar from this column. Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -376,7 +415,7 @@ def __mul__(self, other: Column | Scalar) -> Column: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -392,7 +431,7 @@ def __truediv__(self, other: Column | Scalar) -> Column: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -408,7 +447,7 @@ def __floordiv__(self, other: Column | Scalar) -> Column: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -428,7 +467,7 @@ def __pow__(self, other: Column | Scalar) -> Column: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. 
@@ -444,7 +483,7 @@ def __mod__(self, other: Column | Scalar) -> Column: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. @@ -460,71 +499,71 @@ def __divmod__(self, other: Column | Scalar) -> tuple[Column, Column]: Parameters ---------- other : Column or Scalar - If Column, must have same length. + If Column, must have same length or have length 1. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + tuple[Column, Column] """ - def __invert__(self: Column) -> Column: + def __invert__(self) -> Column: """ Invert truthiness of (boolean) elements. Raises ------ ValueError - If any of the Column's columns is not boolean. + If the column is not boolean. """ - def any(self: Column, *, skip_nulls: bool = True) -> bool | NullType: + def any(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a bool. Raises ------ ValueError - If column is not boolean. + If Column is not boolean. """ - def all(self: Column, *, skip_nulls: bool = True) -> bool | NullType: + def all(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a bool. Raises ------ ValueError - If column is not boolean. + If Column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def min(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def max(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. 
""" - def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def sum(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def prod(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def median(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -532,7 +571,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: dtypes. """ - def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def mean(self, *, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -540,7 +579,7 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: dtypes. """ - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -566,7 +605,7 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar Whether to skip null values. """ - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Column: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. 
Returns a float for numerical data types, and @@ -578,33 +617,33 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar correction Correction to apply to the result. For example, ``0`` for sample standard deviation and ``1`` for population standard deviation. - See `Column.std` for a more detailed description. + See :meth:`Column.std` for a more detailed description. skip_nulls Whether to skip null values. """ - def cumulative_max(self: Column) -> Column: + def cumulative_max(self) -> Column: """ - Reduction returns a Column. Any data type that supports comparisons + Reduction returns a column. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def cumulative_min(self: Column) -> Column: + def cumulative_min(self) -> Column: """ - Reduction returns a Column. Any data type that supports comparisons + Reduction returns a column. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def cumulative_sum(self: Column) -> Column: + def cumulative_sum(self) -> Column: """ - Reduction returns a Column. Must be supported for numerical and + Reduction returns a column. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def cumulative_prod(self: Column) -> Column: + def cumulative_prod(self) -> Column: """ - Reduction returns a Column. Must be supported for numerical and + Reduction returns a column. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ @@ -647,7 +686,7 @@ def is_nan(self) -> Column: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self: Column, values: Column) -> Column: + def is_in(self, values: Column | PermissiveColumn) -> Column: """ Indicate whether the value at each row matches any value in `values`. 
@@ -667,7 +706,7 @@ def is_in(self: Column, values: Column) -> Column: def unique_indices(self, *, skip_nulls: bool = True) -> Column: """ - Return indices corresponding to unique values in Column. + Return indices corresponding to unique values in column. Returns ------- @@ -679,14 +718,14 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column: There are no ordering guarantees. In particular, if there are multiple indices corresponding to the same unique value, there is no guarantee about which one will appear in the result. - If the original Column contains multiple `'NaN'` values, then + If the original column contains multiple `'NaN'` values, then only a single index corresponding to those values will be returned. Likewise for null values (if ``skip_nulls=False``). To get the unique values, you can do ``col.get_rows(col.unique_indices())``. """ ... - def fill_nan(self: Column, value: float | NullType, /) -> Column: + def fill_nan(self, value: float | NullType, /) -> Column: """ Fill floating point ``nan`` values with the given fill value. @@ -700,7 +739,7 @@ def fill_nan(self: Column, value: float | NullType, /) -> Column: """ ... - def fill_null(self: Column, value: Scalar, /) -> Column: + def fill_null(self, value: Scalar, /) -> Column: """ Fill null values with the given fill value. @@ -713,41 +752,6 @@ def fill_null(self: Column, value: Scalar, /) -> Column: """ ... - def to_array_object(self, dtype: DType) -> Any: - """ - Convert to array-API-compliant object. - - Parameters - ---------- - dtype : DType - The dtype of the array-API-compliant object to return. - Must be one of: - - - Bool() - - Int8() - - Int16() - - Int32() - - Int64() - - UInt8() - - UInt16() - - UInt32() - - UInt64() - - Float32() - - Float64() - - Returns - ------- - Any - An array-API-compliant object. 
- - Notes - ----- - While numpy arrays are not yet array-API-compliant, implementations - may choose to return a numpy array (for numpy prior to 2.0), with the - understanding that consuming libraries would then use the - ``array-api-compat`` package to convert it to a Standard-compliant array. - """ - def rename(self, name: str) -> Column: """ Rename column. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index e8a9a21e..0824a15b 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -5,6 +5,8 @@ if TYPE_CHECKING: from .column_object import Column + from .permissiveframe_object import PermissiveFrame + from .permissivecolumn_object import PermissiveColumn from .groupby_object import GroupBy from ._types import NullType, Scalar, Namespace, DType @@ -12,13 +14,21 @@ __all__ = ["DataFrame"] + class DataFrame: """ DataFrame object - Note that this dataframe object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via + Instantiate via any of the following: + + - ``df_non_standard.__dataframe_consortium_standard__()`` + - :func:`dataframe_api.dataframe_from_2d_array` + - :func:`dataframe_api.dataframe_from_dict` + + No assumptions should be made about the underlying execution engine. In particular, + any method which leads to materialisation (such as converting to ndarray) + is not supported. See :class:`dataframe_api.PermissiveFrame` for a counterpart + which supports materialisation. **Python operator support** @@ -32,9 +42,6 @@ class DataFrame: In-place operators must not be supported. All operations on the dataframe object are out-of-place. 
- - **Methods and Attributes** - """ def __dataframe_namespace__(self) -> Namespace: """ @@ -59,18 +66,24 @@ def dataframe(self) -> object: """ ... - def shape(self) -> tuple[int, int]: - """ - Return number of rows and number of columns. + @property + def schema(self) -> dict[str, DType]: """ + Get dataframe's schema. - def group_by(self, keys: str | list[str], /) -> GroupBy: + Returns + ------- + dict[str, DType] + Mapping from column name to data type. + """ + + def group_by(self, *keys: Column | str) -> GroupBy: """ Group the DataFrame by the given columns. Parameters ---------- - keys : str | list[str] + keys : str Returns ------- @@ -89,36 +102,46 @@ def group_by(self, keys: str | list[str], /) -> GroupBy: """ ... - def get_column_by_name(self, name: str, /) -> Column: + def select(self, *columns: str | Column) -> DataFrame: """ - Select a column by name. + Select multiple columns, either by name or by columns. Parameters ---------- - name : str + columns + Column names (or columns) to select. Returns ------- - Column + DataFrame - Raises - ------ - KeyError - If the key is not present. - """ - ... + Examples + -------- + >>> df: DataFrame + >>> col = df.__dataframe_namespace__().col + >>> df = df.select('a', col('b'), (col('c')+col('d')+1).rename('e')) - def select(self, names: Sequence[str], /) -> DataFrame: - """ - Select multiple columns by name. + Notes + ----- + Columns should all be the same length, apart from possibly some + length-1 columns (which should be broadcast to be the same length as the others). + For example, if ``df`` is a dataframe of length 150, then we have: - Parameters - ---------- - names : Sequence[str] + .. 
code-block::python - Returns - ------- - DataFrame + df: DataFrame + + # This returns a length-1 dataframe + df.select(col('a').mean(), col('b').mean()) + + # This returns a length-150 dataframe + df.select(col('a').mean(), col('b')) + + # and so does this + df.select(col('a'), col('b').mean()) + + # and so does this + df.select(col('a'), col('b')) Raises ------ @@ -127,20 +150,6 @@ def select(self, names: Sequence[str], /) -> DataFrame: """ ... - def get_rows(self, indices: Column) -> DataFrame: - """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column - Positions of rows to select. - - Returns - ------- - DataFrame - """ - ... def slice_rows( self, start: int | None, stop: int | None, step: int | None @@ -160,26 +169,32 @@ def slice_rows( """ ... - def filter(self, mask: Column) -> DataFrame: + def filter(self, mask: Column | PermissiveColumn) -> DataFrame: """ Select a subset of rows corresponding to a mask. Parameters ---------- - mask : Column + mask : Column or PermissiveColumn Returns ------- DataFrame - Notes - ----- - Some participants preferred a weaker type Arraylike[bool] for mask, - where 'Arraylike' denotes an object adhering to the Array API standard. + Examples + -------- + + Here is how you could keep rows in a dataframe where the values in + column 'a' are greater than 3: + + >>> df: DataFrame + >>> namespace = df.__dataframe_namespace__() + >>> mask = namespace.col('a') > 3 + >>> df = df.filter(mask) """ ... - def assign(self, columns: Column | Sequence[Column], /) -> DataFrame: + def assign(self, *columns: Column | PermissiveColumn) -> DataFrame: """ Insert new column(s), or update values in existing ones. @@ -188,7 +203,7 @@ def assign(self, columns: Column | Sequence[Column], /) -> DataFrame: If updating existing columns, their names will be used to tell which columns to update. 
To update a column with a different name, combine with - :meth:`Column.rename`, e.g.: + :meth:`PermissiveColumn.rename` or :meth:`Column.rename`, e.g.: .. code-block:: python @@ -197,7 +212,7 @@ def assign(self, columns: Column | Sequence[Column], /) -> DataFrame: Parameters ---------- - columns : Column | Sequence[Column] + columns : Column | PermissiveColumn Column(s) to update/insert. If updating/inserting multiple columns, they must all have different names. @@ -207,13 +222,13 @@ def assign(self, columns: Column | Sequence[Column], /) -> DataFrame: """ ... - def drop_columns(self, label: str | list[str]) -> DataFrame: + def drop_columns(self, *labels: str) -> DataFrame: """ Drop the specified column(s). Parameters ---------- - label : str | list[str] + labels : str Column name(s) to drop. Returns @@ -253,21 +268,9 @@ def column_names(self) -> list[str]: """ ... - @property - def schema(self) -> dict[str, Any]: - """ - Get dataframe's schema. - - Returns - ------- - dict[str, Any] - Mapping from column name to data type. - """ - def sort( self, - keys: str | list[str] | None = None, - *, + *keys: str | Column | PermissiveColumn, ascending: Sequence[bool] | bool = True, nulls_position: Literal['first', 'last'] = 'last', ) -> DataFrame: @@ -275,13 +278,13 @@ def sort( Sort dataframe according to given columns. If you only need the indices which would sort the dataframe, use - :meth:`sorted_indices`. + :func:`dataframe_api.sorted_indices`. Parameters ---------- - keys : str | list[str], optional + keys : str | Column Names of columns to sort by. - If `None`, sort by all columns. + If not passed, will sort by all columns. ascending : Sequence[bool] or bool If `True`, sort by all keys in ascending order. If `False`, sort by all keys in descending order. @@ -305,46 +308,6 @@ def sort( """ ... 
- def sorted_indices( - self, - keys: str | list[str] | None = None, - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column: - """ - Return row numbers which would sort according to given columns. - - If you need to sort the DataFrame, use :meth:`sort`. - - Parameters - ---------- - keys : str | list[str], optional - Names of columns to sort by. - If `None`, sort by all columns. - ascending : Sequence[bool] or bool - If `True`, sort by all keys in ascending order. - If `False`, sort by all keys in descending order. - If a sequence, it must be the same length as `keys`, - and determines the direction with which to use each - key to sort by. - nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column - - Raises - ------ - ValueError - If `keys` and `ascending` are sequences of different lengths. - """ - ... - def __eq__(self, other: Scalar) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -662,34 +625,6 @@ def all(self, *, skip_nulls: bool = True) -> DataFrame: """ ... - def any_rowwise(self, *, skip_nulls: bool = True) -> Column: - """ - Reduction returns a Column. - - Differs from ``DataFrame.any`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def all_rowwise(self, *, skip_nulls: bool = True) -> Column: - """ - Reduction returns a Column. - - Differs from ``DataFrame.all`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - def min(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. 
@@ -735,7 +670,7 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFr correction Correction to apply to the result. For example, ``0`` for sample standard deviation and ``1`` for population standard deviation. - See `Column.std` for a more detailed description. + See :meth:`Column.std` for a more detailed description. skip_nulls Whether to skip null values. """ @@ -750,7 +685,7 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFr correction Correction to apply to the result. For example, ``0`` for sample standard deviation and ``1`` for population standard deviation. - See `Column.std` for a more detailed description. + See :meth:`Column.std` for a more detailed description. skip_nulls Whether to skip null values. """ @@ -796,33 +731,6 @@ def is_nan(self) -> DataFrame: """ ... - def unique_indices(self, keys: str | list[str] | None = None, *, skip_nulls: bool = True) -> Column: - """ - Return indices corresponding to unique values across selected columns. - - Parameters - ---------- - keys : str | list[str], optional - Column names to consider when finding unique values. - If `None`, all columns are considered. - - Returns - ------- - Column - Indices corresponding to unique values. - - Notes - ----- - There are no ordering guarantees. In particular, if there are multiple - indices corresponding to the same unique value(s), there is no guarantee - about which one will appear in the result. - If the original column(s) contain multiple `'NaN'` values, then - only a single index corresponding to those values will be returned. - Likewise for null values (if ``skip_nulls=False``). - To get the unique values, you can do ``df.get_rows(df.unique_indices(keys))``. - """ - ... - def fill_nan(self, value: float | NullType, /) -> DataFrame: """ Fill ``nan`` values with the given fill value. @@ -868,45 +776,9 @@ def fill_null( KeyError If ``column_names`` contains a column name that is not present in the dataframe. 
- """ ... - - def to_array_object(self, dtype: DType) -> Any: - """ - Convert to array-API-compliant object. - Parameters - ---------- - dtype : DType - The dtype of the array-API-compliant object to return. - Must be one of: - - - Bool() - - Int8() - - Int16() - - Int32() - - Int64() - - UInt8() - - UInt16() - - UInt32() - - UInt64() - - Float32() - - Float64() - - Returns - ------- - Any - An array-API-compliant object. - - Notes - ----- - While numpy arrays are not yet array-API-compliant, implementations - may choose to return a numpy array (for numpy prior to 2.0), with the - understanding that consuming libraries would then use the - ``array-api-compat`` package to convert it to a Standard-compliant array. - """ - def join( self, other: DataFrame, @@ -948,3 +820,31 @@ def join( If, apart from `left_on` and `right_on`, there are any column names present in both `self` and `other`. """ + + def collect(self) -> PermissiveFrame: + """ + Transform dataframe into object which supports eager evaluation. + + .. warning:: + + This method may trigger compute and so can be extremely expensive. + If possible, don't use it at all. If you really need to materialise + data (for example, to convert it to an array object), then you should + call `collect` as late as possible - ideally, no more than once. + + The exact point in the code where to place the `collect` call may vary + depending on implementations and on use-cases. However, if you follow + the maxim of using it as late as possible and ideally only once, then + we expect this to be close to optimal performance in most cases. + + A pattern we recommend is + + .. code-block::python + + df: DataFrame + df_permissive = df.collect() + del def + + to avoid accidentally calling `collect` twice on the same dataframe. + """ + ... 
diff --git a/spec/API_specification/dataframe_api/permissivecolumn_object.py b/spec/API_specification/dataframe_api/permissivecolumn_object.py new file mode 100644 index 00000000..38e118d3 --- /dev/null +++ b/spec/API_specification/dataframe_api/permissivecolumn_object.py @@ -0,0 +1,764 @@ +from __future__ import annotations + +from typing import Any,NoReturn, TYPE_CHECKING, Literal, Generic + +if TYPE_CHECKING: + from . import Bool + from ._types import NullType, Scalar, DType, Namespace + from .column_object import Column + + +__all__ = ['PermissiveColumn'] + + +class PermissiveColumn: + """ + PermissiveColumn object + + Instantiate via :meth:`PermissiveFrame.get_column_by_name`. + """ + + def __column_namespace__(self) -> Namespace: + """ + Returns an object that has all the Dataframe Standard API functions on it. + + Returns + ------- + namespace: Namespace + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + + """ + + @property + def column(self) -> Any: + """ + Return underlying (not-necessarily-Standard-compliant) column. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + @property + def name(self) -> str: + """Return name of column.""" + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + @property + def dtype(self) -> DType: + """ + Return data type of column. + """ + + def len(self) -> int: + """ + Return the number of rows. + """ + + + def get_rows(self: PermissiveColumn, indices: PermissiveColumn) -> PermissiveColumn: + """ + Select a subset of rows, similar to `ndarray.take`. 
+ + Parameters + ---------- + indices : PermissiveColumn + Positions of rows to select. + """ + ... + + + def slice_rows( + self: PermissiveColumn, start: int | None, stop: int | None, step: int | None + ) -> PermissiveColumn: + """ + Select a subset of rows corresponding to a slice. + + Parameters + ---------- + start : int or None + stop : int or None + step : int or None + + Returns + ------- + PermissiveColumn + """ + ... + + + def filter(self: PermissiveColumn, mask: Column | PermissiveColumn) -> PermissiveColumn: + """ + Select a subset of rows corresponding to a mask. + + Parameters + ---------- + mask : PermissiveColumn + + Returns + ------- + PermissiveColumn + + Notes + ----- + Some participants preferred a weaker type Arraylike[bool] for mask, + where 'Arraylike' denotes an object adhering to the Array API standard. + """ + ... + + + def get_value(self, row_number: int) -> Scalar: + """ + Select the value at a row number, similar to `ndarray.__getitem__()`. + + Parameters + ---------- + row_number : int + Row number of value to return. + + Returns + ------- + Scalar + Depends on the dtype of the PermissiveColumn, and may vary + across implementations. + """ + ... + + def sort( + self, + *, + ascending: bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> PermissiveColumn: + """ + Sort column. + + If you need the indices which would sort the column, + use :meth:`sorted_indices`. + + Parameters + ---------- + ascending : bool + If `True`, sort in ascending order. + If `False`, sort in descending order. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + PermissiveColumn + """ + ... 
+ + def sorted_indices( + self, + *, + ascending: bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> PermissiveColumn: + """ + Return row numbers which would sort column. + + If you need to sort the PermissiveColumn, use :meth:`sort`. + + Parameters + ---------- + ascending : bool + If `True`, sort in ascending order. + If `False`, sort in descending order. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + PermissiveColumn + """ + ... + + def __eq__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __ne__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __ge__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + PermissiveColumn + """ + + def __gt__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __le__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __lt__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __and__(self: PermissiveColumn, other: PermissiveColumn | bool) -> PermissiveColumn: + """ + Apply logical 'and' to `other` PermissiveColumn (or scalar) and this PermissiveColumn. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : PermissiveColumn or bool + If PermissiveColumn, must have same length. + + Returns + ------- + PermissiveColumn + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self: PermissiveColumn, other: PermissiveColumn | bool) -> PermissiveColumn: + """ + Apply logical 'or' to `other` PermissiveColumn (or scalar) and this column. + + Nulls should follow Kleene Logic. 
+ + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + + Returns + ------- + PermissiveColumn + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Add `other` column or scalar to this column. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __sub__(self: PermissiveColumn, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Subtract `other` column or scalar from this column. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __mul__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Multiply `other` column or scalar with this column. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __truediv__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Divide this column by `other` column or scalar. True division, returns floats. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + PermissiveColumn + """ + + def __floordiv__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Floor-divide `other` column or scalar to this column. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __pow__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Raise this column to the power of `other`. + + Integer dtype to the power of non-negative integer dtype is integer dtype. + Integer dtype to the power of float dtype is float dtype. + Float dtype to the power of integer dtype or float dtype is float dtype. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __mod__(self, other: PermissiveColumn | Scalar) -> PermissiveColumn: + """ + Returns modulus of this column by `other` (`%` operator). + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveColumn + """ + + def __divmod__(self, other: PermissiveColumn | Scalar) -> tuple[PermissiveColumn, PermissiveColumn]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : PermissiveColumn or Scalar + If PermissiveColumn, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + PermissiveColumn + """ + + def __invert__(self: PermissiveColumn) -> PermissiveColumn: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the PermissiveColumn's columns is not boolean. + """ + + def any(self: PermissiveColumn, *, skip_nulls: bool = True) -> bool | NullType: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If Column is not boolean. + """ + + def all(self: PermissiveColumn, *, skip_nulls: bool = True) -> bool | NullType: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If Column is not boolean. + """ + + def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. The returned value has the same dtype as the + column. + """ + + def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical data types. + The returned value has the same dtype as the column. + """ + + def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. 
Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + + Parameters + ---------- + correction + Degrees of freedom adjustment. Setting this parameter to a value other + than ``0`` has the effect of adjusting the divisor during the + calculation of the standard deviation according to ``N-correction``, + where ``N`` corresponds to the total number of elements over which + the standard deviation is computed. When computing the standard + deviation of a population, setting this parameter to ``0`` is the + standard choice (i.e., the provided column contains data + constituting an entire population). When computing the corrected + sample standard deviation, setting this parameter to ``1`` is the + standard choice (i.e., the provided column contains data sampled + from a larger population; this is commonly referred to as Bessel's + correction). Fractional (float) values are allowed. Default: ``1``. + skip_nulls + Whether to skip null values. + """ + + def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + + Parameters + ---------- + correction + Correction to apply to the result. For example, ``0`` for sample + standard deviation and ``1`` for population standard deviation. + See :meth:`Column.std` for a more detailed description. + skip_nulls + Whether to skip null values. 
+ """ + + def cumulative_max(self: PermissiveColumn) -> PermissiveColumn: + """ + Reduction returns a PermissiveColumn. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def cumulative_min(self: PermissiveColumn) -> PermissiveColumn: + """ + Reduction returns a PermissiveColumn. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def cumulative_sum(self: PermissiveColumn) -> PermissiveColumn: + """ + Reduction returns a PermissiveColumn. Must be supported for numerical and + datetime data types. The returned value has the same dtype as the + column. + """ + + def cumulative_prod(self: PermissiveColumn) -> PermissiveColumn: + """ + Reduction returns a PermissiveColumn. Must be supported for numerical and + datetime data types. The returned value has the same dtype as the + column. + """ + + def is_null(self) -> PermissiveColumn: + """ + Check for 'missing' or 'null' entries. + + Returns + ------- + PermissiveColumn + + See also + -------- + is_nan + + Notes + ----- + Does *not* include NaN-like entries. + May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + + def is_nan(self) -> PermissiveColumn: + """ + Check for nan entries. + + Returns + ------- + PermissiveColumn + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. + Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + + def is_in(self: PermissiveColumn, values: PermissiveColumn) -> PermissiveColumn: + """ + Indicate whether the value at each row matches any value in `values`. + + Parameters + ---------- + values : PermissiveColumn + Contains values to compare against. 
May include ``float('nan')`` and + ``null``, in which case ``'nan'`` and ``null`` will + respectively return ``True`` even though ``float('nan') == float('nan')`` + isn't ``True``. + The dtype of ``values`` must match the current column's dtype. + + Returns + ------- + PermissiveColumn + """ + + def unique_indices(self, *, skip_nulls: bool = True) -> PermissiveColumn: + """ + Return indices corresponding to unique values in PermissiveColumn. + + Returns + ------- + PermissiveColumn + Indices corresponding to unique values. + + Notes + ----- + There are no ordering guarantees. In particular, if there are multiple + indices corresponding to the same unique value, there is no guarantee + about which one will appear in the result. + If the original PermissiveColumn contains multiple `'NaN'` values, then + only a single index corresponding to those values will be returned. + Likewise for null values (if ``skip_nulls=False``). + To get the unique values, you can do ``col.get_rows(col.unique_indices())``. + """ + ... + + def fill_nan(self: PermissiveColumn, value: float | NullType, /) -> PermissiveColumn: + """ + Fill floating point ``nan`` values with the given fill value. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... + + def fill_null(self: PermissiveColumn, value: Scalar, /) -> PermissiveColumn: + """ + Fill null values with the given fill value. + + Parameters + ---------- + value : Scalar + Value used to replace any ``null`` values in the column with. + Must be of the Python scalar type matching the dtype of the column. + """ + ... + + def to_array_object(self, dtype: DType) -> Any: + """ + Convert to array-API-compliant object. + + Parameters + ---------- + dtype : DType + The dtype of the array-API-compliant object to return. 
+
+            Must be one of:
+
+            - Bool()
+            - Int8()
+            - Int16()
+            - Int32()
+            - Int64()
+            - UInt8()
+            - UInt16()
+            - UInt32()
+            - UInt64()
+            - Float32()
+            - Float64()
+
+        Returns
+        -------
+        Any
+            An array-API-compliant object.
+
+        Notes
+        -----
+        While numpy arrays are not yet array-API-compliant, implementations
+        may choose to return a numpy array (for numpy prior to 2.0), with the
+        understanding that consuming libraries would then use the
+        ``array-api-compat`` package to convert it to a Standard-compliant array.
+        """
+
+    def rename(self, name: str) -> PermissiveColumn:
+        """
+        Rename column.
+
+        Parameters
+        ----------
+        name : str
+            New name for column.
+
+        Returns
+        -------
+        PermissiveColumn
+            New column - this does not operate in-place.
+        """
+        ...
diff --git a/spec/API_specification/dataframe_api/permissiveframe_object.py b/spec/API_specification/dataframe_api/permissiveframe_object.py
new file mode 100644
index 00000000..0d763234
--- /dev/null
+++ b/spec/API_specification/dataframe_api/permissiveframe_object.py
@@ -0,0 +1,838 @@
+from __future__ import annotations
+
+from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn
+
+
+if TYPE_CHECKING:
+    from .permissivecolumn_object import PermissiveColumn
+    from .column_object import Column
+    from .dataframe_object import DataFrame
+    from .groupby_object import GroupBy
+    from ._types import NullType, Scalar, Namespace, DType
+
+
+__all__ = ["PermissiveFrame"]
+
+
+class PermissiveFrame:
+    """
+    PermissiveFrame object. Like :class:`DataFrame`, but must support some extra methods.
+
+    In particular, it must support methods which require eager evaluation, such as
+    :meth:`to_array_object`.
+
+    Instantiate from a :class:`DataFrame` using :meth:`DataFrame.collect`.
+
+    Convert back to :class:`DataFrame` using :meth:`relax`.
+    """
+    def __dataframe_namespace__(self) -> Namespace:
+        """
+        Returns an object that has all the top-level dataframe API functions on it.
+ + Returns + ------- + namespace: Any + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + """ + + @property + def dataframe(self) -> object: + """ + Return underlying (not-necessarily-Standard-compliant) DataFrame. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + @property + def schema(self) -> dict[str, DType]: + """ + Get dataframe's schema. + + Returns + ------- + dict[str, DType] + Mapping from column name to data type. + """ + + def group_by(self, *keys: str) -> GroupBy: + """ + Group the DataFrame by the given columns. + + Parameters + ---------- + keys : str + + Returns + ------- + GroupBy + + Raises + ------ + KeyError + If any of the requested keys are not present. + + Notes + ----- + Downstream operations from this function, like aggregations, return + results for which row order is not guaranteed and is implementation + defined. + """ + ... + + def get_column_by_name(self, name: str, /) -> PermissiveColumn: + """ + Select a column by name. + + Parameters + ---------- + name : str + + Returns + ------- + PermissiveColumn + + Raises + ------ + KeyError + If the key is not present. + """ + ... + + def select(self, *columns: str | Column) -> PermissiveFrame: + """ + Select multiple columns by name. + + Parameters + ---------- + columns + Column names (or columns) to select. + + Returns + ------- + PermissiveFrame + + Examples + -------- + >>> df: PermissiveFrame + >>> col = df.__dataframe_namespace__().col + >>> df = df.select('a', col('b'), (col('c')+col('d')+1).rename('e')) + + Raises + ------ + KeyError + If any requested key is not present. + """ + ... + + def get_rows(self, indices: Column | PermissiveColumn) -> PermissiveFrame: + """ + Select a subset of rows, similar to `ndarray.take`. 
+ + Parameters + ---------- + indices : Column + Positions of rows to select. + + Returns + ------- + PermissiveFrame + """ + ... + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> PermissiveFrame: + """ + Select a subset of rows corresponding to a slice. + + Parameters + ---------- + start : int or None + stop : int or None + step : int or None + + Returns + ------- + PermissiveFrame + """ + ... + + def filter(self, mask: Column | PermissiveColumn) -> PermissiveFrame: + """ + Select a subset of rows corresponding to a mask. + + Parameters + ---------- + mask : Column or PermissiveColumn + Keep rows where `mask` is `True`. + + Returns + ------- + PermissiveFrame + """ + ... + + def assign(self, *columns: Column | PermissiveColumn) -> PermissiveFrame: + """ + Insert new column(s), or update values in existing ones. + + If inserting new columns, the column's names will be used as the labels, + and the columns will be inserted at the rightmost location. + + If updating existing columns, their names will be used to tell which columns + to update. To update a column with a different name, combine with + :meth:`PermissiveColumn.rename` or :meth:`Column.rename`, e.g.: + + .. code-block:: python + + new_column = df.get_column_by_name('a') + 1 + df = df.assign(new_column.rename('b')) + + Parameters + ---------- + columns : Column | PermissiveColumn + Column(s) to update/insert. If updating/inserting multiple columns, + they must all have different names. + + Returns + ------- + DataFrame + """ + ... + + def drop_columns(self, *labels: str) -> PermissiveFrame: + """ + Drop the specified column(s). + + Parameters + ---------- + labels : str + + Returns + ------- + PermissiveFrame + + Raises + ------ + KeyError + If a label is not present. + """ + ... + + def rename_columns(self, mapping: Mapping[str, str]) -> PermissiveFrame: + """ + Rename columns. 
+ + Parameters + ---------- + mapping : Mapping[str, str] + Keys are old column names, values are new column names. + + Returns + ------- + PermissiveFrame + """ + ... + + @property + def column_names(self) -> list[str]: + """ + Get column names. + + Returns + ------- + list[str] + """ + ... + + def sort( + self, + *keys: str | Column | PermissiveColumn, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> PermissiveFrame: + """ + Sort dataframe according to given columns. + + If you only need the indices which would sort the dataframe, use + :meth:`sorted_indices`. + + Parameters + ---------- + keys : str | Column + Names of columns to sort by. + If not passed, will sort by all columns. + ascending : Sequence[bool] or bool + If `True`, sort by all keys in ascending order. + If `False`, sort by all keys in descending order. + If a sequence, it must be the same length as `keys`, + and determines the direction with which to use each + key to sort by. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + PermissiveFrame + + Raises + ------ + ValueError + If `keys` and `ascending` are sequences of different lengths. + """ + ... + + + def __eq__(self, other: Scalar) -> PermissiveFrame: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __ne__(self, other: Scalar) -> PermissiveFrame: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. 
+ + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __ge__(self, other: Scalar) -> PermissiveFrame: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __gt__(self, other: Scalar) -> PermissiveFrame: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __le__(self, other: Scalar) -> PermissiveFrame: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __lt__(self, other: Scalar) -> PermissiveFrame: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __and__(self, other: bool) -> PermissiveFrame: + """ + Apply logical 'and' to `other` scalar and this dataframe. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : bool + + Returns + ------- + PermissiveFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self, other: PermissiveFrame | bool) -> PermissiveFrame: + """ + Apply logical 'or' to `other` scalar and this DataFrame. + + Nulls should follow Kleene Logic. 
+ + Parameters + ---------- + other : bool + + Returns + ------- + PermissiveFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self, other: Scalar) -> PermissiveFrame: + """ + Add `other` scalar to this dataframe. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __sub__(self, other: Scalar) -> PermissiveFrame: + """ + Subtract `other` scalar from this dataframe. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __mul__(self, other: Scalar) -> PermissiveFrame: + """ + Multiply `other` scalar with this dataframe. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __truediv__(self, other: Scalar) -> PermissiveFrame: + """ + Divide this dataframe by `other` scalar. True division, returns floats. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __floordiv__(self, other: Scalar) -> PermissiveFrame: + """ + Floor-divide (returns integers) this dataframe by `other` scalar. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __pow__(self, other: Scalar) -> PermissiveFrame: + """ + Raise this dataframe to the power of `other`. 
+ + Integer dtype to the power of non-negative integer dtype is integer dtype. + Integer dtype to the power of float dtype is float dtype. + Float dtype to the power of integer dtype or float dtype is float dtype. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __mod__(self, other: Scalar) -> PermissiveFrame: + """ + Return modulus of this dataframe by `other` (`%` operator). + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + PermissiveFrame + """ + ... + + def __divmod__(self, other: Scalar) -> tuple[PermissiveFrame, PermissiveFrame]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : Scalar + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + A tuple of two `DataFrame`s + """ + ... + + def __invert__(self) -> PermissiveFrame: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + def any(self, *, skip_nulls: bool = True) -> PermissiveFrame: + """ + Reduction returns a 1-row DataFrame. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def all(self, *, skip_nulls: bool = True) -> PermissiveFrame: + """ + Reduction returns a 1-row DataFrame. 
+
+        Raises
+        ------
+        ValueError
+            If any of the DataFrame's columns is not boolean.
+        """
+        ...
+
+    def min(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def max(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def sum(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def prod(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def median(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def mean(self, *, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+        """
+        ...
+
+    def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+
+        Parameters
+        ----------
+        correction
+            Correction to apply to the result. For example, ``1`` for the
+            corrected sample standard deviation and ``0`` for the population
+            standard deviation.
+            See :meth:`Column.std` for a more detailed description.
+        skip_nulls
+            Whether to skip null values.
+        """
+        ...
+
+    def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> PermissiveFrame:
+        """
+        Reduction returns a 1-row DataFrame.
+
+        Parameters
+        ----------
+        correction
+            Correction to apply to the result. For example, ``1`` for the
+            corrected sample variance and ``0`` for the population variance.
+            See :meth:`Column.std` for a more detailed description.
+        skip_nulls
+            Whether to skip null values.
+        """
+        ...
+
+    def is_null(self) -> PermissiveFrame:
+        """
+        Check for 'missing' or 'null' entries.
+
+        Returns
+        -------
+        PermissiveFrame
+
+        See also
+        --------
+        is_nan
+
+        Notes
+        -----
+        Does *not* include NaN-like entries.
+ May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + ... + + def is_nan(self) -> PermissiveFrame: + """ + Check for nan entries. + + Returns + ------- + PermissiveFrame + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. + Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + ... + + def fill_nan(self, value: float | NullType, /) -> PermissiveFrame: + """ + Fill ``nan`` values with the given fill value. + + The fill operation will apply to all columns with a floating-point + dtype. Other columns remain unchanged. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... + + def fill_null( + self, value: Scalar, /, *, column_names : list[str] | None = None + ) -> PermissiveFrame: + """ + Fill null values with the given fill value. + + This method can only be used if all columns that are to be filled are + of the same dtype (e.g., all of ``Float64`` or all of string dtype). + If that is not the case, it is not possible to use a single Python + scalar type that matches the dtype of all columns to which + ``fill_null`` is being applied, and hence an exception will be raised. + + Parameters + ---------- + value : Scalar + Value used to replace any ``null`` values in the dataframe with. + Must be of the Python scalar type matching the dtype(s) of the dataframe. + column_names : list[str] | None + A list of column names for which to replace nulls with the given + scalar value. If ``None``, nulls will be replaced in all columns. + + Raises + ------ + TypeError + If the columns of the dataframe are not all of the same kind. + KeyError + If ``column_names`` contains a column name that is not present in + the dataframe. + """ + ... 
+ + def to_array_object(self, dtype: DType) -> Any: + """ + Convert to array-API-compliant object. + + Parameters + ---------- + dtype : DType + The dtype of the array-API-compliant object to return. + Must be one of: + + - Bool() + - Int8() + - Int16() + - Int32() + - Int64() + - UInt8() + - UInt16() + - UInt32() + - UInt64() + - Float32() + - Float64() + + Returns + ------- + Any + An array-API-compliant object. + + Notes + ----- + While numpy arrays are not yet array-API-compliant, implementations + may choose to return a numpy array (for numpy prior to 2.0), with the + understanding that consuming libraries would then use the + ``array-api-compat`` package to convert it to a Standard-compliant array. + """ + + def join( + self, + other: PermissiveFrame, + *, + how: Literal['left', 'inner', 'outer'], + left_on: str | list[str], + right_on: str | list[str], + ) -> PermissiveFrame: + """ + Join with other dataframe. + + Parameters + ---------- + other : DataFrame + Dataframe to join with. + how : str + Kind of join to perform. + Must be one of {'left', 'inner', 'outer'}. + left_on : str | list[str] + Key(s) from `self` to perform `join` on. + If more than one key is given, it must be + the same length as `right_on`. + right_on : str | list[str] + Key(s) from `other` to perform `join` on. + If more than one key is given, it must be + the same length as `left_on`. + + Returns + ------- + PermissiveFrame + """ + + def relax(self) -> DataFrame: + """ + Return DataFrame. + + As :class:`DataFrame` is agnostic to execution details, some implementations + may choose to make the backing object lazy. 
+ """ diff --git a/spec/API_specification/examples/01_standardise_columns.py b/spec/API_specification/examples/01_standardise_columns.py index cb6b49b1..001de38f 100644 --- a/spec/API_specification/examples/01_standardise_columns.py +++ b/spec/API_specification/examples/01_standardise_columns.py @@ -4,11 +4,12 @@ def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any: df = df_non_standard.__dataframe_consortium_standard__(api_version='2023.09-beta') + xp = df.__dataframe_namespace__() for column_name in df.column_names: if column_name == 'species': continue - new_column = df.get_column_by_name(column_name) + new_column = xp.col(column_name) new_column = (new_column - new_column.mean()) / new_column.std() df = df.assign(new_column.rename(f'{column_name}_scaled')) diff --git a/spec/API_specification/examples/02_plotting.py b/spec/API_specification/examples/02_plotting.py index 82068835..ac86d63f 100644 --- a/spec/API_specification/examples/02_plotting.py +++ b/spec/API_specification/examples/02_plotting.py @@ -17,7 +17,7 @@ def group_by_and_plot( df = namespace.dataframe_from_dict({"x": x, "y": y, "color": color}) - agg = df.group_by("color").mean() + agg = df.group_by("color").mean().collect() x = agg.get_column_by_name("x").to_array_object(namespace.Float64()) y = agg.get_column_by_name("y").to_array_object(namespace.Float64()) diff --git a/spec/API_specification/examples/03_split_and_train.py b/spec/API_specification/examples/03_split_and_train.py new file mode 100644 index 00000000..82855bcf --- /dev/null +++ b/spec/API_specification/examples/03_split_and_train.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from dataframe_api import DataFrame + +from typing import Callable, Protocol, Any, cast +from typing_extensions import Self + +from dataframe_api._types import SupportsDataFrameAPI + +expensive_feature_engineering: Callable[[DataFrame], DataFrame] + +class Model(Protocol): + def __init__(self) -> None: + ... 
+ def __call__(self) -> Self: + ... + def fit(self, x_train: Any, y_train: Any) -> Self: + ... + def predict(self, x_test: Any) -> Self: + ... + +MyFancyModel: Model + +def split_train_and_predict(df_non_standard: SupportsDataFrameAPI) -> DataFrame: + df = df_non_standard.__dataframe_consortium_standard__() + namespace = df.__dataframe_namespace__() + col = namespace.col + + df = expensive_feature_engineering(df) + + df_permissive = df.collect() + train = df_permissive.filter(col("id") == 0) + val = df_permissive.filter(col("id") == 1) + + x_train = train.drop_columns("y").to_array_object(namespace.Float64()) + y_train = train.get_column_by_name("y").to_array_object(namespace.Float64()) + x_val = val.drop_columns("y").to_array_object(namespace.Float64()) + y_val = val.get_column_by_name("y").to_array_object(namespace.Float64()) + xp = x_train.__array_namespace__() + + model = MyFancyModel() + model.fit(x_train, y_train) + preds = model.predict(x_val) + + results = xp.concat( + [ + xp.expand_dims(preds, axis=1), + xp.expand_dims(y_val, axis=1), + ] + ) + results_df: DataFrame = namespace.dataframe_from_2d_array( + results, + names=["preds", "true"], + dtypes={"preds": namespace.Float64(), "true": namespace.Float64()}, + ) + return results_df diff --git a/spec/API_specification/examples/04_select_and_filter_demo.py b/spec/API_specification/examples/04_select_and_filter_demo.py new file mode 100644 index 00000000..c3b57a03 --- /dev/null +++ b/spec/API_specification/examples/04_select_and_filter_demo.py @@ -0,0 +1,23 @@ +from dataframe_api import DataFrame + +df: DataFrame +namespace = df.__dataframe_namespace__() +col = namespace.col + +# You can select columns using column names or columns +# the following are all valid +df.select("a") +df.select("a", "b") +df.select(col("a")) +df.select((col("a") + 1).rename("b")) + +# You can filter using columns +df = df.filter(col("width") > col("height")) + +# PermissiveColumn can be thought of as a trivial column. 
+# So, filtering using PermissiveColumn works too, though is more verbose +df_permissive = df.collect() +df_permissive = df_permissive.filter( + df_permissive.get_column_by_name("width") + > df_permissive.get_column_by_name("height") +) diff --git a/spec/API_specification/index.rst b/spec/API_specification/index.rst index 04c290b1..9384ae3b 100644 --- a/spec/API_specification/index.rst +++ b/spec/API_specification/index.rst @@ -16,6 +16,11 @@ of objects and functions in the top-level namespace. The latter are: __dataframe_api_version__ is_null null + col + sorted_indices + unique_indices + any_rowwise + all_rowwise Int64 Int32 Int16 @@ -37,12 +42,14 @@ of objects and functions in the top-level namespace. The latter are: dataframe_from_dict dataframe_from_2d_array -The ``DataFrame``, ``Column`` and ``GroupBy`` objects have the following +The ``DataFrame``, ``PermissiveFrame``, ``PermissiveColumn``, ``Column`` and ``GroupBy`` objects have the following methods and attributes: .. toctree:: :maxdepth: 3 dataframe_object + permissiveframe_object + permissivecolumn_object column_object groupby_object diff --git a/spec/API_specification/permissivecolumn_object.rst b/spec/API_specification/permissivecolumn_object.rst new file mode 100644 index 00000000..dc59843a --- /dev/null +++ b/spec/API_specification/permissivecolumn_object.rst @@ -0,0 +1,12 @@ +.. _permissivecolumn-object: + +PermissiveColumn object +======================= + +A conforming implementation of the dataframe API standard must provide and +support a column object having the following methods, attributes, and +behavior. + +.. currentmodule:: dataframe_api + +.. autoclass:: PermissiveColumn diff --git a/spec/API_specification/permissiveframe_object.rst b/spec/API_specification/permissiveframe_object.rst new file mode 100644 index 00000000..ba51a6bd --- /dev/null +++ b/spec/API_specification/permissiveframe_object.rst @@ -0,0 +1,12 @@ +.. 
_permissiveframe-object: + +PermissiveFrame object +====================== + +A conforming implementation of the dataframe API standard must provide and +support a dataframe object having the following methods, attributes, and +behavior. + +.. currentmodule:: dataframe_api + +.. autoclass:: PermissiveFrame diff --git a/spec/conf.py b/spec/conf.py index 2af862cc..4356f4d5 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -73,7 +73,6 @@ # them don't actually refer to anything that we have a document for. nitpick_ignore = [ ('py:class', 'array'), - ('py:class', 'DataFrame'), ('py:class', 'device'), ('py:class', 'DType'), ('py:class', 'NestedSequence'), @@ -85,6 +84,7 @@ ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'NullType'), + ('py:class', 'GroupBy'), ('py:class', 'Namespace'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 93de5c53..b97115bd 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -11,13 +11,13 @@ the `float` it is documented to return, in combination with the `__gt__` method (i.e., the `>` operator) on the dataframe: ```python -class DataFrame: - def __gt__(self, other: DataFrame | Scalar) -> DataFrame: +class PermissiveFrame: + def __gt__(self, other: Scalar) -> PermissiveFrame: ... - def get_column_by_name(self, name: str, /) -> Column: + def get_column_by_name(self, name: str, /) -> PermissiveColumn: ... -class Column: +class PermissiveColumn: def mean(self, skip_nulls: bool = True) -> float | NullType: ... diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index 96a0d5c9..320db544 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -125,7 +125,8 @@ See the [use cases](use_cases.md) section for details on the exact use cases con Implementation details of the dataframes and execution of operations. 
This includes: - How data is represented and stored (whether the data is in memory, disk, distributed) -- Expectations on when the execution is happening (in an eager or lazy way) +- Expectations on when the execution is happening (in an eager or lazy way). Exceptions + are clearly demarcated, such as `PermissiveFrame` and `PermissiveColumn`. - Other execution details **Rationale:** The API defined in this document needs to be used by libraries as diverse as Ibis,