Skip to content

Commit

Permalink
SNOW-1805836: Implement the dataframe interchange protocol. (#2683)
Browse files Browse the repository at this point in the history
Fixes SNOW-1805836

---------

Signed-off-by: sfc-gh-mvashishtha <[email protected]>
Co-authored-by: Rehan Durrani <[email protected]>
  • Loading branch information
sfc-gh-mvashishtha and sfc-gh-rdurrani authored Dec 4, 2024
1 parent 91f4357 commit b76912f
Show file tree
Hide file tree
Showing 9 changed files with 674 additions and 80 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
- Added support for `GroupBy.pct_change` with `axis=0`, `freq=None`, and `limit=None`.
- Added support for `DataFrameGroupBy.__iter__` and `SeriesGroupBy.__iter__`.
- Added support for `np.sqrt`, `np.trunc`, `np.floor`, numpy trig functions, `np.exp`, `np.abs`, `np.positive` and `np.negative`.
- Added partial support for the dataframe interchange protocol method
`DataFrame.__dataframe__()`.

#### Dependency Updates

Expand Down
3 changes: 3 additions & 0 deletions docs/source/modin/supported/general_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ Data manipulations
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``cut`` | P | ``retbins``, ``labels`` | ``N`` if ``retbins=True`` or ``labels!=False`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``__dataframe__`` | P | | ``N`` for columns of type ``Timedelta`` and columns|
| | | | containing list objects |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``factorize`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``from_dummies`` | N | | |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import pandas as native_pd
import pandas.core.resample
import pandas.io.parsers
from pandas.core.interchange.dataframe_protocol import DataFrame as InterchangeDataframe
import pandas.io.parsers.readers
import pytz # type: ignore
from modin.core.storage_formats import BaseQueryCompiler # type: ignore
Expand Down Expand Up @@ -817,8 +818,12 @@ def from_pandas(
def from_arrow(cls, at: Any, *args: Any, **kwargs: Any) -> "SnowflakeQueryCompiler":
return cls(at.to_pandas())

def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> None:
pass
def to_dataframe(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> InterchangeDataframe:
return self.to_pandas().__dataframe__(
nan_as_null=nan_as_null, allow_copy=allow_copy
)

@classmethod
def from_dataframe(cls, df: native_pd.DataFrame, data_cls: Any) -> None:
Expand Down
12 changes: 11 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5099,10 +5099,20 @@ def __delitem__():

def __dataframe__():
"""
Get a Modin DataFrame that implements the dataframe exchange protocol.
Get an object that implements the dataframe interchange protocol for this dataframe.
See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.
Like `DataFrame.to_pandas`, this method triggers a query
evaluation and pulls data to the local machine.
If this dataframe has columns of `Timedelta` type or columns containing
list objects, the interchange dataframe that this method returns will
raise `NotImplementedError` if you try to check those columns'
datatypes, e.g. to convert the interchange dataframe to pandas with
`pandas.api.interchange.from_dataframe`. This limitation comes from
pandas itself.
Parameters
----------
nan_as_null : bool, default: False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import numpy as np
import pandas as native_pd
from modin.pandas import DataFrame, Series
from pandas.core.interchange.dataframe_protocol import DataFrame as InterchangeDataframe
from modin.pandas.api.extensions import register_dataframe_accessor
from modin.pandas.base import BasePandasDataset
from modin.pandas.io import from_pandas
Expand Down Expand Up @@ -729,40 +730,9 @@ def _df_init_list_data_with_snowpark_pandas_values(


@register_dataframe_accessor("__dataframe__")
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
"""
Get a Modin DataFrame that implements the dataframe exchange protocol.
See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.
Parameters
----------
nan_as_null : bool, default: False
A keyword intended for the consumer to tell the producer
to overwrite null values in the data with ``NaN`` (or ``NaT``).
This currently has no effect; once support for nullable extension
dtypes is added, this value should be propagated to columns.
allow_copy : bool, default: True
A keyword that defines whether or not the library is allowed
to make a copy of the data. For example, copying data would be necessary
if a library supports strided buffers, given that this protocol
specifies contiguous buffers. Currently, if the flag is set to ``False``
and a copy is needed, a ``RuntimeError`` will be raised.
Returns
-------
ProtocolDataframe
A dataframe object following the dataframe protocol specification.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
ErrorMessage.not_implemented(
"Snowpark pandas does not support the DataFrame interchange "
+ "protocol method `__dataframe__`. To use Snowpark pandas "
+ "DataFrames with third-party libraries that try to call the "
+ "`__dataframe__` method, please convert this Snowpark pandas "
+ "DataFrame to pandas with `to_pandas()`."
)

def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> InterchangeDataframe:
return self._query_compiler.to_dataframe(
nan_as_null=nan_as_null, allow_copy=allow_copy
)
Expand Down
Loading

0 comments on commit b76912f

Please sign in to comment.