SNOW-1805836: Implement the dataframe interchange protocol. (#2683)

Fixes SNOW-1805836 --------- Signed-off-by: sfc-gh-mvashishtha <[email protected]> Co-authored-by: Rehan Durrani <[email protected]>
snowflakedb · Dec 4, 2024 · b76912f · b76912f
1 parent 91f4357
commit b76912f
Show file tree

Hide file tree

Showing 9 changed files with 674 additions and 80 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -68,6 +68,8 @@
 - Added support for `GroupBy.pct_change` with `axis=0`, `freq=None`, and `limit=None`.
 - Added support for `DataFrameGroupBy.__iter__` and `SeriesGroupBy.__iter__`.
 - Added support for `np.sqrt`, `np.trunc`, `np.floor`, numpy trig functions, `np.exp`, `np.abs`, `np.positive` and `np.negative`.
+- Added partial support for the dataframe interchange protocol method
+  `DataFrame.__dataframe__()`.
 
 #### Dependency Updates
 

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
@@ -25,6 +25,9 @@ Data manipulations
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``cut``                     | P                               | ``retbins``, ``labels``          | ``N`` if ``retbins=True``or ``labels!=False``      |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
+| ``__dataframe__``           | P                               |                                  | ``N`` for columns of type ``Timedelta`` and columns|
+|                             |                                 |                                  | containing list objects                            |
++-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``factorize``               | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``from_dummies``            | N                               |                                  |                                                    |

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -23,6 +23,7 @@
 import pandas as native_pd
 import pandas.core.resample
 import pandas.io.parsers
+from pandas.core.interchange.dataframe_protocol import DataFrame as InterchangeDataframe
 import pandas.io.parsers.readers
 import pytz  # type: ignore
 from modin.core.storage_formats import BaseQueryCompiler  # type: ignore
@@ -817,8 +818,12 @@ def from_pandas(
     def from_arrow(cls, at: Any, *args: Any, **kwargs: Any) -> "SnowflakeQueryCompiler":
         return cls(at.to_pandas())
 
-    def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> None:
-        pass
+    def to_dataframe(
+        self, nan_as_null: bool = False, allow_copy: bool = True
+    ) -> InterchangeDataframe:
+        return self.to_pandas().__dataframe__(
+            nan_as_null=nan_as_null, allow_copy=allow_copy
+        )
 
     @classmethod
     def from_dataframe(cls, df: native_pd.DataFrame, data_cls: Any) -> None:

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -5099,10 +5099,20 @@ def __delitem__():
 
     def __dataframe__():
         """
-        Get a Modin DataFrame that implements the dataframe exchange protocol.
+        Get an object that implements the dataframe interchange protocol for this dataframe.
 
         See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.
 
+        Like `DataFrame.to_pandas`, this method triggers a query
+        evaluation and pulls data to the local machine.
+
+        If this dataframe has columns of `Timedelta` type or columns containing
+        list objects, the interchange dataframe that this method returns will
+        raise `NotImplementedError` if you try to check those columns'
+        datatypes, e.g. to convert the interchange dataframe to pandas with
+        `pandas.api.interchange.from_dataframe`. This limitation comes from
+        pandas itself.
+
         Parameters
         ----------
         nan_as_null : bool, default: False

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py
@@ -31,6 +31,7 @@
 import numpy as np
 import pandas as native_pd
 from modin.pandas import DataFrame, Series
+from pandas.core.interchange.dataframe_protocol import DataFrame as InterchangeDataframe
 from modin.pandas.api.extensions import register_dataframe_accessor
 from modin.pandas.base import BasePandasDataset
 from modin.pandas.io import from_pandas
@@ -729,40 +730,9 @@ def _df_init_list_data_with_snowpark_pandas_values(
 
 
 @register_dataframe_accessor("__dataframe__")
-def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
-    """
-    Get a Modin DataFrame that implements the dataframe exchange protocol.
-
-    See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.
-
-    Parameters
-    ----------
-    nan_as_null : bool, default: False
-        A keyword intended for the consumer to tell the producer
-        to overwrite null values in the data with ``NaN`` (or ``NaT``).
-        This currently has no effect; once support for nullable extension
-        dtypes is added, this value should be propagated to columns.
-    allow_copy : bool, default: True
-        A keyword that defines whether or not the library is allowed
-        to make a copy of the data. For example, copying data would be necessary
-        if a library supports strided buffers, given that this protocol
-        specifies contiguous buffers. Currently, if the flag is set to ``False``
-        and a copy is needed, a ``RuntimeError`` will be raised.
-
-    Returns
-    -------
-    ProtocolDataframe
-        A dataframe object following the dataframe protocol specification.
-    """
-    # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
-    ErrorMessage.not_implemented(
-        "Snowpark pandas does not support the DataFrame interchange "
-        + "protocol method `__dataframe__`. To use Snowpark pandas "
-        + "DataFrames with third-party libraries that try to call the "
-        + "`__dataframe__` method, please convert this Snowpark pandas "
-        + "DataFrame to pandas with `to_pandas()`."
-    )
-
+def __dataframe__(
+    self, nan_as_null: bool = False, allow_copy: bool = True
+) -> InterchangeDataframe:
     return self._query_compiler.to_dataframe(
         nan_as_null=nan_as_null, allow_copy=allow_copy
     )