-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Separate eager and lazy APIs #249
Changes from all commits
3f9aae3
20ad6d0
86ec0ae
a43224e
858fdf3
db7c19f
ca294c8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,9 @@ | |
|
||
from typing import Mapping, Sequence, Any, Literal, TYPE_CHECKING | ||
|
||
from .column_object import * | ||
from .permissivecolumn_object import PermissiveColumn | ||
from .permissiveframe_object import PermissiveFrame | ||
from .column_object import Column | ||
from .dataframe_object import DataFrame | ||
from .groupby_object import * | ||
from .dtypes import * | ||
|
@@ -16,11 +18,17 @@ | |
__all__ = [ | ||
"__dataframe_api_version__", | ||
"DataFrame", | ||
"PermissiveFrame", | ||
"PermissiveColumn", | ||
"Column", | ||
"GroupBy", | ||
"column_from_sequence", | ||
"column_from_1d_array", | ||
"col", | ||
"concat", | ||
"dataframe_from_dict", | ||
"sorted_indices", | ||
"unique_indices", | ||
"dataframe_from_2d_array", | ||
"is_null", | ||
"null", | ||
|
@@ -40,6 +48,8 @@ | |
"Duration", | ||
"String", | ||
"is_dtype", | ||
"any_rowwise", | ||
"all_rowwise", | ||
] | ||
|
||
|
||
|
@@ -50,6 +60,21 @@ | |
implementation of the dataframe API standard. | ||
""" | ||
|
||
def col(name: str) -> Column: | ||
""" | ||
Instantiate a Column which selects the given column by name. | ||
|
||
For example, to select column 'species' and then use it to filter | ||
a DataFrame, you could do: | ||
|
||
.. code-block:: python | ||
|
||
df: DataFrame | ||
namespace = df.__dataframe_namespace__() | ||
df.filter(namespace.col('species') == 'setosa') | ||
""" | ||
... | ||
|
||
def concat(dataframes: Sequence[DataFrame]) -> DataFrame: | ||
""" | ||
Concatenate DataFrames vertically. | ||
|
@@ -70,9 +95,9 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: | |
""" | ||
... | ||
|
||
def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> Column: | ||
def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = '') -> PermissiveColumn: | ||
""" | ||
Construct Column from sequence of elements. | ||
Construct PermissiveColumn from sequence of elements. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -87,18 +112,18 @@ def column_from_sequence(sequence: Sequence[Any], *, dtype: DType, name: str = ' | |
|
||
Returns | ||
------- | ||
Column | ||
PermissiveColumn | ||
""" | ||
... | ||
|
||
def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame: | ||
def dataframe_from_dict(data: Mapping[str, PermissiveColumn]) -> DataFrame: | ||
""" | ||
Construct DataFrame from map of column names to Columns. | ||
Construct DataFrame from map of column names to PermissiveColumns. | ||
|
||
Parameters | ||
---------- | ||
data : Mapping[str, Column] | ||
Column must be of the corresponding type of the DataFrame. | ||
data : Mapping[str, PermissiveColumn] | ||
PermissiveColumn must be of the corresponding type of the DataFrame. | ||
For example, it is only supported to build a ``LibraryXDataFrame`` using | ||
``LibraryXColumn`` instances. | ||
|
||
|
@@ -116,9 +141,9 @@ def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame: | |
... | ||
|
||
|
||
def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column: | ||
def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> PermissiveColumn: | ||
""" | ||
Construct Column from 1D array. | ||
Construct PermissiveColumn from 1D array. | ||
|
||
See `dataframe_from_2d_array` for related 2D function. | ||
|
||
|
@@ -137,7 +162,7 @@ def column_from_1d_array(array: Any, *, dtype: DType, name: str = '') -> Column: | |
|
||
Returns | ||
------- | ||
Column | ||
PermissiveColumn | ||
""" | ||
... | ||
|
||
|
@@ -166,11 +191,117 @@ def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping | |
""" | ||
... | ||
|
||
def any_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having to type things as Given that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is type checking the only concern here? If so, we could define (and export) a type alias, like we do for
if you have completely agnostic code, then I'd suggest just accepting After all:
|
||
""" | ||
Reduction returns a Column. | ||
|
||
Differs from ``DataFrame.any`` in that the reduction happens | ||
for each row, rather than for each column. | ||
|
||
Parameters | ||
---------- | ||
columns : str | Column | PermissiveColumn | ||
Columns to consider. | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If any of the given columns is not boolean. | ||
""" | ||
... | ||
|
||
def all_rowwise(*columns: str | Column | PermissiveColumn, skip_nulls: bool = True) -> Column: | ||
""" | ||
Reduction returns a Column. | ||
|
||
Differs from ``DataFrame.all`` in that the reduction happens | ||
for each row, rather than for each column. | ||
|
||
Parameters | ||
---------- | ||
columns : str | Column | PermissiveColumn | ||
Columns to consider. | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If any of the given columns is not boolean. | ||
""" | ||
... | ||
|
||
def sorted_indices( | ||
*columns: str | Column | PermissiveColumn, | ||
ascending: Sequence[bool] | bool = True, | ||
nulls_position: Literal['first', 'last'] = 'last', | ||
) -> Column: | ||
""" | ||
Return row numbers which would sort according to given columns. | ||
|
||
If you need to sort the DataFrame, use :meth:`DataFrame.sort`. | ||
|
||
Parameters | ||
---------- | ||
columns : str | Column | PermissiveColumn | ||
Column(s) to sort by. | ||
ascending : Sequence[bool] or bool | ||
If `True`, sort by all keys in ascending order. | ||
If `False`, sort by all keys in descending order. | ||
If a sequence, it must be the same length as `columns`, | ||
and determines the direction with which to use each | ||
key to sort by. | ||
nulls_position : ``{'first', 'last'}`` | ||
Whether null values should be placed at the beginning | ||
or at the end of the result. | ||
Note that the position of NaNs is unspecified and may | ||
vary based on the implementation. | ||
|
||
Returns | ||
------- | ||
Column | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If `columns` and `ascending` are sequences of different lengths. | ||
""" | ||
... | ||
|
||
|
||
def unique_indices( | ||
*columns: str | Column | PermissiveColumn, | ||
skip_nulls: bool = True, | ||
) -> Column: | ||
""" | ||
Return indices corresponding to unique values across selected columns. | ||
|
||
Parameters | ||
---------- | ||
columns : str | Column | PermissiveColumn | ||
Columns to consider when finding unique values. | ||
|
||
Returns | ||
------- | ||
Column | ||
Indices corresponding to unique values. | ||
|
||
Notes | ||
----- | ||
There are no ordering guarantees. In particular, if there are multiple | ||
indices corresponding to the same unique value(s), there is no guarantee | ||
about which one will appear in the result. | ||
If the original column(s) contain multiple `'NaN'` values, then | ||
only a single index corresponding to those values will be returned. | ||
Likewise for null values (if ``skip_nulls=False``). | ||
""" | ||
... | ||
|
||
|
||
|
||
class null: | ||
""" | ||
A `null` object to represent missing data. | ||
|
||
``null`` is a scalar, and may be used when constructing a `Column` from a | ||
``null`` is a scalar, and may be used when constructing a `PermissiveColumn` from a | ||
Python sequence with `column_from_sequence`. It does not support ``is``, | ||
``==`` or ``bool``. | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this isn't bound to a DataFrame, for libraries other than polars and ibis this will be a new concept that will require implementation and maintenance. Do you have a sense for what this would look like for Pandas for example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yup take a look here data-apis/dataframe-api-compat#13
It's surprisingly simple to just add the syntax
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't really look simple to me: https://github.com/data-apis/dataframe-api-compat/blob/76284fa158ffe0f21ab1758f46caf523427077a3/dataframe_api_compat/pandas_standard/pandas_standard.py#L81-L417
My concern is that we went from pushing for changes in polars to now pushing for changes in most other dataframe libraries. I would love some thoughts from other dataframe library maintainers here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's just a matter of recording some lambda calls and then unpacking them - e.g.
becomes
If this lives in a separate namespace, then there's not really any extra maintenance that needs doing in the main implementation
For a start, it might be added to pandas (regardless of what the consortium does), check Joris' lightning talk from euroscipy: https://youtu.be/g2JsyNQgcoU?si=ax0ZINFQINf9a5jv&t=512
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, this isn't "apples to apples" : asking Polars to add complexity to the query optimiser isn't comparable to keeping track of lazy column calls in a separate namespace
Anyway, thanks for your input, and I hope you're keeping well on parental leave!