Separate eager and lazy APIs #249

Closed
155 changes: 143 additions & 12 deletions spec/API_specification/dataframe_api/__init__.py
@@ -5,7 +5,9 @@

from typing import Mapping, Sequence, Any, Literal, TYPE_CHECKING

from .column_object import *
from .permissivecolumn_object import PermissiveColumn
from .permissiveframe_object import PermissiveFrame
from .column_object import Column
from .dataframe_object import DataFrame
from .groupby_object import *
from .dtypes import *
@@ -16,11 +18,17 @@
__all__ = [
"__dataframe_api_version__",
"DataFrame",
"PermissiveFrame",
"PermissiveColumn",
"Column",
"GroupBy",
"column_from_sequence",
"column_from_1d_array",
"col",
"concat",
"dataframe_from_dict",
"sorted_indices",
"unique_indices",
"dataframe_from_2d_array",
"is_null",
"null",
@@ -40,6 +48,8 @@
"Duration",
"String",
"is_dtype",
"any_rowwise",
"all_rowwise",
]


@@ -50,6 +60,21 @@
implementation of the dataframe API standard.
"""

def col(name: str) -> Column:
Collaborator:

Since this isn't bound to a DataFrame, for libraries other than polars and ibis this will be a new concept that will require implementation and maintenance. Do you have a sense for what this would look like for pandas, for example?

Contributor Author:

Yup, take a look here: data-apis/dataframe-api-compat#13

It's surprisingly simple to just add the syntax.

Collaborator:

This doesn't really look simple to me: https://github.com/data-apis/dataframe-api-compat/blob/76284fa158ffe0f21ab1758f46caf523427077a3/dataframe_api_compat/pandas_standard/pandas_standard.py#L81-L417

My concern is that we went from pushing for changes in polars to now pushing for changes in most other dataframe libraries. I would love some thoughts from other dataframe library maintainers here.

Contributor Author (@MarcoGorelli, Oct 5, 2023):

It's just a matter of recording some lambda calls and then unpacking them - e.g.

        df: DataFrame
        col = df.__dataframe_namespace__().col
        df = df.filter(col('a') > col('b')*2)

becomes

        df: pd.DataFrame
        col_a = lambda df: df.loc[:, 'a']
        col_b = lambda df: df.loc[:, 'b']
        col_b_doubled = lambda df: col_b(df) * 2
        mask = lambda df: col_a(df) > col_b_doubled(df)
        df = df.loc[mask(df)]

If this lives in a separate namespace, then there's not really any extra maintenance that needs doing in the main implementation.
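
To make "recording some lambda calls" concrete, here is a minimal, hypothetical sketch (not the actual dataframe-api-compat code; all names are made up) of an expression object that composes those lambdas automatically:

        from __future__ import annotations

        from typing import Any, Callable

        import pandas as pd

        class Expr:
            """Records a computation to evaluate against a DataFrame later."""

            def __init__(self, call: Callable[[pd.DataFrame], pd.Series]) -> None:
                self._call = call

            def _binary_op(self, other: Any, op: Callable[[Any, Any], Any]) -> Expr:
                # Compose a new lambda; nothing is evaluated yet.
                if isinstance(other, Expr):
                    return Expr(lambda df: op(self._call(df), other._call(df)))
                return Expr(lambda df: op(self._call(df), other))

            def __gt__(self, other: Any) -> Expr:
                return self._binary_op(other, lambda lhs, rhs: lhs > rhs)

            def __mul__(self, other: Any) -> Expr:
                return self._binary_op(other, lambda lhs, rhs: lhs * rhs)

            def __call__(self, df: pd.DataFrame) -> pd.Series:
                # "Unpacking": evaluate the recorded calls against a concrete DataFrame.
                return self._call(df)

        def col(name: str) -> Expr:
            return Expr(lambda df: df.loc[:, name])

        df = pd.DataFrame({"a": [1, 5, 3], "b": [1, 2, 3]})
        mask = col("a") > col("b") * 2  # recorded, not evaluated
        df = df.loc[mask(df)]           # evaluated here

The eager implementation stays untouched; only the small recording layer is new.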

> I would love some thoughts from other dataframe library maintainers here.

For a start, it might be added to pandas (regardless of what the consortium does); check Joris' lightning talk from EuroSciPy: https://youtu.be/g2JsyNQgcoU?si=ax0ZINFQINf9a5jv&t=512

Contributor Author:

> My concern is that we went from pushing for changes in polars to now pushing for changes in most other dataframe libraries

Also, this isn't "apples to apples": asking Polars to add complexity to the query optimiser isn't comparable to keeping track of lazy column calls in a separate namespace.

Anyway, thanks for your input, and I hope you're keeping well on parental leave!

"""
Instantiate a Column which selects the given column by name.

For example, to select column 'species' and then use it to filter
a DataFrame, you could do:

.. code-block:: python

df: DataFrame
namespace = df.__dataframe_namespace__()
df.filter(namespace.col('species') == 'setosa')
"""
...

def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
"""
Concatenate DataFrames vertically.
@@ -70,9 +95,9 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
"""
...

def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> Column:
def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> PermissiveColumn:
"""
Construct Column from sequence of elements.
Construct PermissiveColumn from sequence of elements.

Parameters
----------
Expand All @@ -87,18 +112,18 @@ def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '

Returns
-------
Column
PermissiveColumn
"""
...

def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame:
def dataframe_from_dict(data: Mapping[str, PermissiveColumn]) -> DataFrame:
"""
Construct DataFrame from map of column names to Columns.
Construct DataFrame from map of column names to PermissiveColumns.

Parameters
----------
data : Mapping[str, Column]
Column must be of the corresponding type of the DataFrame.
data : Mapping[str, PermissiveColumn]
PermissiveColumn must be of the corresponding type of the DataFrame.
For example, it is only supported to build a ``LibraryXDataFrame`` using
``LibraryXColumn`` instances.

Expand All @@ -116,9 +141,9 @@ def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame:
...


def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column:
def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> PermissiveColumn:
"""
Construct Column from 1D array.
Construct PermissiveColumn from 1D array.

See `dataframe_from_2d_array` for related 2D function.

Expand All @@ -137,7 +162,7 @@ def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column:

Returns
-------
Column
PermissiveColumn
"""
...

@@ -166,11 +191,117 @@ def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping
"""
...

def any_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column:
Collaborator:

Having to type things as Column | PermissiveColumn feels like a design flaw to me and will make downstream usage potentially annoying, i.e. for code that agnostically handles columns and dataframes, there are now four objects that people will need to type-check to understand what they're working with.

Given that PermissiveColumn is a superset of Column, maybe we could use Column as a base class and inherit from it in PermissiveColumn? Similarly for DataFrame and PermissiveFrame?

Contributor Author:

Is type checking the only concern here? If so, we could define (and export) a type alias, like we do for DType?

> code that agnostically handles columns and dataframes

If you have completely agnostic code, then I'd suggest just accepting DataFrame and leaving it up to the caller to convert to DataFrame (see the sketch after this list).

After all:

  • if an end user passes in df_non_standard to your function, then df_non_standard.__dataframe_consortium_standard__() returns a DataFrame
  • if the function is only used internally, then you can control what you pass it. If you have a PermissiveFrame, you can call .relax to convert it to a DataFrame
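
A hypothetical sketch of that calling pattern (the function name is made up; __dataframe_namespace__, __dataframe_consortium_standard__, col, filter, and relax are the ones from this proposal):

        from typing import Any

        def standard_only(df: Any) -> Any:
            # Library code accepts only the standard DataFrame API.
            namespace = df.__dataframe_namespace__()
            col = namespace.col
            return df.filter(col("a") > col("b"))

        # Caller holding a non-standard dataframe (e.g. pandas):
        #     result = standard_only(df_non_standard.__dataframe_consortium_standard__())
        # Caller holding a PermissiveFrame:
        #     result = standard_only(permissive_frame.relax())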

"""
Reduction returns a Column.

Differs from ``DataFrame.any`` in that the reduction happens
for each row, rather than for each column.

Parameters
----------
columns : str | Column | PermissiveColumn
Columns to consider.

Raises
------
ValueError
If any of the given columns is not boolean.
"""
...

def all_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column:
"""
Reduction returns a Column.

Differs from ``DataFrame.all`` in that the reduction happens
for each row, rather than for each column.

Parameters
----------
columns : str | Column | PermissiveColumn
Columns to consider.

Raises
------
ValueError
If any of the given columns is not boolean.
"""
...

def sorted_indices(
*columns: str | Column | PermissiveColumn,
ascending: Sequence[bool] | bool = True,
nulls_position: Literal['first', 'last'] = 'last',
) -> Column:
"""
Return row numbers which would sort according to given columns.

If you need to sort the DataFrame, use :meth:`DataFrame.sort`.

Parameters
----------
columns : str | Column | PermissiveColumn
Column(s) to sort by.
ascending : Sequence[bool] or bool
If `True`, sort by all keys in ascending order.
If `False`, sort by all keys in descending order.
If a sequence, it must be the same length as `columns`,
and determines the direction with which to use each
column to sort by.
nulls_position : ``{'first', 'last'}``
Whether null values should be placed at the beginning
or at the end of the result.
Note that the position of NaNs is unspecified and may
vary based on the implementation.

Returns
-------
Column

Raises
------
ValueError
If `columns` and `ascending` are sequences of different lengths.
"""
...


def unique_indices(
*columns: str | Column | PermissiveColumn,
skip_nulls: bool = True,
) -> Column:
"""
Return indices corresponding to unique values across selected columns.

Parameters
----------
columns : str | Column | PermissiveColumn
Columns to consider when finding unique values.

Returns
-------
Column
Indices corresponding to unique values.

Notes
-----
There are no ordering guarantees. In particular, if there are multiple
indices corresponding to the same unique value(s), there is no guarantee
about which one will appear in the result.
If the original column(s) contain multiple `'NaN'` values, then
only a single index corresponding to those values will be returned.
Likewise for null values (if ``skip_nulls=False``).
"""
...



class null:
"""
A `null` object to represent missing data.

``null`` is a scalar, and may be used when constructing a `Column` from a
``null`` is a scalar, and may be used when constructing a `PermissiveColumn` from a
Python sequence with `column_from_sequence`. It does not support ``is``,
``==`` or ``bool``.

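Putting the new namespace-level functions above together, a hypothetical usage sketch (the column names are made up; every function shown is defined in the diff above):

    df: DataFrame
    namespace = df.__dataframe_namespace__()
    col = namespace.col

    # Keep rows where at least one of the two boolean conditions holds.
    df = df.filter(namespace.any_rowwise(col("a") > 0, col("b") > 0))

    # Row numbers that would sort by 'a' (descending), nulls first.
    indices = namespace.sorted_indices(col("a"), ascending=False, nulls_position="first")

    # Indices of unique (a, b) combinations, in no guaranteed order.
    unique = namespace.unique_indices("a", "b")
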
31 changes: 21 additions & 10 deletions spec/API_specification/dataframe_api/_types.py
@@ -18,7 +18,9 @@

if TYPE_CHECKING:
from .dataframe_object import DataFrame as DataFrameType
from .permissivecolumn_object import PermissiveColumn as PermissiveColumnType
from .column_object import Column as ColumnType
from .permissiveframe_object import PermissiveFrame as PermissiveFrameType

if TYPE_CHECKING:
from .dtypes import (
@@ -51,6 +53,9 @@
class Namespace(Protocol):
__dataframe_api_version__: str

@staticmethod
def col(name: str) -> ColumnType: ...

@staticmethod
def DataFrame() -> DataFrameType:
...
@@ -59,6 +64,14 @@ def DataFrame() -> DataFrameType:
def Column() -> ColumnType:
...

@staticmethod
def PermissiveFrame() -> PermissiveFrameType:
...

@staticmethod
def PermissiveColumn() -> PermissiveColumnType:
...

@staticmethod
def Int64() -> Int64:
...
@@ -123,31 +136,29 @@ def concat(dataframes: Sequence[DataFrameType]) -> DataFrameType:
def column_from_sequence(
sequence: Sequence[Any],
*,
dtype: Any,
dtype: DType,
name: str = "",
api_version: str | None = None,
) -> ColumnType:
) -> PermissiveColumnType:
...

@staticmethod
def dataframe_from_dict(
data: Mapping[str, ColumnType], *, api_version: str | None = None
data: Mapping[str, PermissiveColumnType]
) -> DataFrameType:
...

@staticmethod
def column_from_1d_array(
array: Any, *, dtype: Any, name: str = "", api_version: str | None = None
) -> ColumnType:
array: Any, *, dtype: DType, name: str = ""
) -> PermissiveColumnType:
...

@staticmethod
def dataframe_from_2d_array(
array: Any,
*,
names: Sequence[str],
dtypes: Mapping[str, Any],
api_version: str | None = None,
dtypes: Mapping[str, DType],
) -> DataFrameType:
...

@@ -156,7 +167,7 @@ def is_null(value: object, /) -> bool:
...

@staticmethod
def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool:
def is_dtype(dtype: DType, kind: str | tuple[str, ...]) -> bool:
...


@@ -169,7 +180,7 @@ def __dataframe_consortium_standard__(
class SupportsColumnAPI(Protocol):
def __column_consortium_standard__(
self, *, api_version: str | None = None
) -> ColumnType:
) -> PermissiveColumnType:
...


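Because ``Namespace`` is a ``typing.Protocol`` of static methods, implementations conform structurally rather than by inheritance. A hypothetical sketch of downstream code using it as an annotation (the helper name is made up):

    from typing import Any

    def make_mask(df: Any) -> Any:
        # Annotating with the Protocol lets a static type checker verify
        # every attribute access below, without depending on any concrete
        # dataframe library.
        ns: Namespace = df.__dataframe_namespace__()
        assert ns.__dataframe_api_version__  # the version string is part of the Protocol
        return df.filter(ns.col("a") > ns.col("b"))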