diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 9a1ebc8d670dc..ea27d1efbb235 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -391,3 +391,4 @@ Serialization / IO / conversion DataFrame.to_clipboard DataFrame.to_markdown DataFrame.style + DataFrame.__dataframe__ diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst index 4b2fadcb367a1..a42d54b7e50ef 100644 --- a/doc/source/reference/general_functions.rst +++ b/doc/source/reference/general_functions.rst @@ -78,3 +78,10 @@ Hashing util.hash_array util.hash_pandas_object + +Importing from other DataFrame libraries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + api.exchange.from_dataframe diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 08500019143ed..01bf97f35702a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -14,6 +14,24 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_150.enhancements.dataframe_exchange: + +DataFrame exchange protocol implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas now implements the DataFrame exchange API spec. +See the full details on the API at https://data-apis.org/dataframe-protocol/latest/index.html + +The protocol consists of two parts: + + - New method :meth:`DataFrame.__dataframe__` which produces the exchange object. + It effectively "exports" the pandas dataframe as an exchange object so + any other library which has the protocol implemented can "import" that dataframe + without knowing anything about the producer except that it makes an exchange object. + - New function :func:`pandas.api.exchange.from_dataframe` which can take + an arbitrary exchange object from any conformant library and construct a + pandas DataFrame out of it. + .. 
_whatsnew_150.enhancements.styler: Styler diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 80202b3569862..67fd722c9198b 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,5 +1,6 @@ """ public toolkit API """ from pandas.api import ( # noqa:F401 + exchange, extensions, indexers, types, diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py new file mode 100644 index 0000000000000..6760d81f60ac7 --- /dev/null +++ b/pandas/api/exchange/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for DataFrame exchange protocol. +""" + +from pandas.core.exchange.dataframe_protocol import DataFrame +from pandas.core.exchange.from_dataframe import from_dataframe + +__all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/core/exchange/__init__.py b/pandas/core/exchange/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py new file mode 100644 index 0000000000000..098c596bff4cd --- /dev/null +++ b/pandas/core/exchange/buffer.py @@ -0,0 +1,80 @@ +from typing import ( + Optional, + Tuple, +) + +import numpy as np +from packaging import version + +from pandas.core.exchange.dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) + +_NUMPY_HAS_DLPACK = version.parse(np.__version__) >= version.parse("1.22.0") + + +class PandasBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + if not x.strides == (x.dtype.itemsize,): + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. 
+ if allow_copy: + x = x.copy() + else: + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._x.__array_interface__["data"][0] + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + if _NUMPY_HAS_DLPACK: + return self._x.__dlpack__() + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py new file mode 100644 index 0000000000000..ae24c5d295cc9 --- /dev/null +++ b/pandas/core/exchange/column.py @@ -0,0 +1,375 @@ +from typing import ( + Any, + Tuple, +) + +import numpy as np + +from pandas._libs.lib import infer_dtype +from pandas.util._decorators import cache_readonly + +import pandas as pd +from pandas.api.types import ( + is_categorical_dtype, + is_string_dtype, +) +from pandas.core.exchange.buffer import PandasBuffer +from pandas.core.exchange.dataframe_protocol import ( + Column, + ColumnBuffers, + ColumnNullType, + DtypeKind, +) +from pandas.core.exchange.utils import ( + ArrowCTypes, + Endianness, + NoBufferPresent, + dtype_to_arrow_c_fmt, +) + +_NP_KINDS = { + "i": DtypeKind.INT, + "u": DtypeKind.UINT, + "f": DtypeKind.FLOAT, + "b": DtypeKind.BOOL, + "U": 
DtypeKind.STRING, + "M": DtypeKind.DATETIME, + "m": DtypeKind.DATETIME, +} + +_NULL_DESCRIPTION = { + DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None), + DtypeKind.DATETIME: (ColumnNullType.USE_NAN, None), + DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None), + # Null values for categoricals are stored as `-1` sentinel values + # in the category data (e.g., `col.values.codes` is int8 np.ndarray) + DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1), + # follow Arrow in using 1 as valid value and 0 for missing/null value + DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0), +} + +_NO_VALIDITY_BUFFER = { + ColumnNullType.NON_NULLABLE: "This column is non-nullable", + ColumnNullType.USE_NAN: "This column uses NaN as null", + ColumnNullType.USE_SENTINEL: "This column uses a sentinel value", +} + + +class PandasColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, pd.Series): + raise NotImplementedError(f"Columns of type {type(column)} not handled yet") + + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. 
+ """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + # TODO: chunks are implemented now, probably this should return something + return 0 + + @cache_readonly + def dtype(self): + dtype = self._col.dtype + + if is_categorical_dtype(dtype): + codes = self._col.values.codes + ( + _, + bitwidth, + c_arrow_dtype_f_str, + _, + ) = self._dtype_from_pandasdtype(codes.dtype) + return ( + DtypeKind.CATEGORICAL, + bitwidth, + c_arrow_dtype_f_str, + Endianness.NATIVE, + ) + elif is_string_dtype(dtype): + if infer_dtype(self._col) == "string": + return ( + DtypeKind.STRING, + 8, + dtype_to_arrow_c_fmt(dtype), + Endianness.NATIVE, + ) + raise NotImplementedError("Non-string object dtypes are not supported yet") + else: + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + + kind = _NP_KINDS.get(dtype.kind, None) + if kind is None: + # Not a NumPy dtype. Check if it's a categorical maybe + raise ValueError(f"Data type {dtype} not supported by exchange protocol") + + return (kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder) + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + Raises RuntimeError if the dtype is not categorical + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. 
+ - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == DtypeKind.CATEGORICAL: + raise TypeError( + "describe_categorical only works on a column with categorical dtype!" + ) + + return { + "is_ordered": self._col.cat.ordered, + "is_dictionary": True, + "mapping": dict(enumerate(self._col.cat.categories)), + } + + @property + def describe_null(self): + kind = self.dtype[0] + try: + null, value = _NULL_DESCRIPTION[kind] + except KeyError: + raise NotImplementedError(f"Data type {kind} not yet supported") + + return null, value + + @cache_readonly + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return int(self._col.isna().sum()) + + @property + def metadata(self): + """ + Store specific metadata of the column. + """ + return {"pandas.index": self._col.index} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + size = len(self._col) + # Ceil-divide, clamped to 1: an empty column would otherwise give + # a zero step, which ``range()`` rejects with ValueError. + step = max(1, -(-size // n_chunks)) + for start in range(0, step * n_chunks, step): + yield PandasColumn( + self._col.iloc[start : start + step], self._allow_copy + ) + else: + yield self + + def get_buffers(self): + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. 
+ - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers + + def _get_data_buffer( + self, + ) -> Tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's associated dtype. 
+ """ + if self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + DtypeKind.DATETIME, + ): + buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) + dtype = self.dtype + elif self.dtype[0] == DtypeKind.CATEGORICAL: + codes = self._col.values.codes + buffer = PandasBuffer(codes, allow_copy=self._allow_copy) + dtype = self._dtype_from_pandasdtype(codes.dtype) + elif self.dtype[0] == DtypeKind.STRING: + # Marshal the strings from a NumPy object array into a byte array + buf = self._col.to_numpy() + b = bytearray() + + # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later + for obj in buf: + if isinstance(obj, str): + b.extend(obj.encode(encoding="utf-8")) + + # Convert the byte array to a Pandas "buffer" using + # a NumPy array as the backing store + buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) + + # Define the dtype for the returned buffer + dtype = ( + DtypeKind.STRING, + 8, + ArrowCTypes.STRING, + Endianness.NATIVE, + ) # note: currently only support native endianness + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer, dtype + + def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte mask. + """ + null, invalid = self.describe_null + + if self.dtype[0] == DtypeKind.STRING: + # For now, use byte array as the mask. + # TODO: maybe store as bit array to save space?.. 
+ buf = self._col.to_numpy() + + # Determine the encoding for valid values + valid = invalid == 0 + invalid = not valid + + mask = np.zeros(shape=(len(buf),), dtype=np.bool_) + for i, obj in enumerate(buf): + mask[i] = valid if isinstance(obj, str) else invalid + + # Convert the mask array to a Pandas "buffer" using + # a NumPy array as the backing store + buffer = PandasBuffer(mask) + + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + + return buffer, dtype + + try: + msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask" + except KeyError: + # TODO: implement for other bit/byte masks? + raise NotImplementedError("See self.describe_null") + + raise NoBufferPresent(msg) + + def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. 
+ """ + if self.dtype[0] == DtypeKind.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64) + for i, v in enumerate(values): + # For missing values (in this case, `np.nan` values) + # we don't increment the pointer + if isinstance(v, str): + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets[i + 1] = ptr + + # Convert the offsets to a Pandas "buffer" using + # the NumPy array as the backing store + buffer = PandasBuffer(offsets) + + # Assemble the buffer dtype info + dtype = ( + DtypeKind.INT, + 64, + ArrowCTypes.INT64, + Endianness.NATIVE, + ) # note: currently only support native endianness + else: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + + return buffer, dtype diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/exchange/dataframe.py new file mode 100644 index 0000000000000..c8a89184b34c6 --- /dev/null +++ b/pandas/core/exchange/dataframe.py @@ -0,0 +1,101 @@ +from collections import abc + +import pandas as pd +from pandas.core.exchange.column import PandasColumn +from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg + + +class PandasDataFrameXchg(DataFrameXchg): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, df: pd.DataFrame, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). 
+ # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + return PandasDataFrameXchg(self._df, nan_as_null, allow_copy) + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + return {"pandas.index": self._df.index} + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self): + return self._df.columns + + def get_column(self, i: int) -> PandasColumn: + return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) + + def get_column_by_name(self, name: str) -> PandasColumn: + return PandasColumn(self._df[name], allow_copy=self._allow_copy) + + def get_columns(self): + return [ + PandasColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns + ] + + def select_columns(self, indices): + if not isinstance(indices, abc.Sequence): + raise ValueError("`indices` is not a sequence") + if not isinstance(indices, list): + indices = list(indices) + + return PandasDataFrameXchg( + self._df.iloc[:, indices], self._nan_as_null, self._allow_copy + ) + + def select_columns_by_name(self, names): + if not isinstance(names, abc.Sequence): + raise ValueError("`names` is not a sequence") + if not isinstance(names, list): + names = list(names) + + return PandasDataFrameXchg( + self._df.loc[:, names], self._nan_as_null, self._allow_copy + ) + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. 
+ """ + if n_chunks and n_chunks > 1: + size = len(self._df) + # Ceil-divide, clamped to 1: an empty frame would otherwise give + # a zero step, which ``range()`` rejects with ValueError. + step = max(1, -(-size // n_chunks)) + for start in range(0, step * n_chunks, step): + yield PandasDataFrameXchg( + self._df.iloc[start : start + step, :], + self._nan_as_null, + self._allow_copy, + ) + else: + yield self diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py new file mode 100644 index 0000000000000..ee2ae609e73f9 --- /dev/null +++ b/pandas/core/exchange/dataframe_protocol.py @@ -0,0 +1,486 @@ +""" +A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api +""" + +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN/NaT value. + USE_SENTINEL : int + Sentinel value besides NaN/NaT. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. 
+ USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Any] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Any]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Any]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + mapping: Optional[dict] + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. 
+ """ + pass + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. 
+ + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + @property + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + pass + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + @abstractmethod + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. 
+ Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? 
+ """ + pass + + @property + @abstractmethod + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + @abstractmethod + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. 
+ """ + pass + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """Construct a new exchange object, potentially changing the parameters.""" + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + pass + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + pass + + @abstractmethod + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. 
+ """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. 
+ """ + pass diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py new file mode 100644 index 0000000000000..18d8c9c257af4 --- /dev/null +++ b/pandas/core/exchange/from_dataframe.py @@ -0,0 +1,527 @@ +import ctypes +import re +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, + Union, +) + +import numpy as np + +import pandas as pd +from pandas.core.exchange.dataframe_protocol import ( + Buffer, + Column, + ColumnNullType, + DataFrame as DataFrameXchg, + DtypeKind, +) +from pandas.core.exchange.utils import ( + ArrowCTypes, + Endianness, +) + +_NP_DTYPES: Dict[DtypeKind, Dict[int, Any]] = { + DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, + DtypeKind.BOOL: {8: bool}, +} + + +def from_dataframe(df, allow_copy=True): + """ + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the exchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pd.DataFrame + """ + if isinstance(df, pd.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameXchg, allow_copy=True): + """ + Build a ``pd.DataFrame`` from the DataFrame exchange object. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the exchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pd.DataFrame + """ + pandas_dfs = [] + for chunk in df.get_chunks(): + pandas_df = protocol_df_chunk_to_pandas(chunk) + pandas_dfs.append(pandas_df) + + if not allow_copy and len(pandas_dfs) > 1: + raise RuntimeError( + "To join chunks a copy is required which is forbidden by allow_copy=False" + ) + if len(pandas_dfs) == 1: + pandas_df = pandas_dfs[0] + else: + pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False) + + index_obj = df.metadata.get("pandas.index", None) + if index_obj is not None: + pandas_df.index = index_obj + + return pandas_df + + +def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: + """ + Convert exchange protocol chunk to ``pd.DataFrame``. + + Parameters + ---------- + df : DataFrameXchg + + Returns + ------- + pd.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). + columns: Dict[str, Any] = {} + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DtypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DtypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DtypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pandas_df = pd.DataFrame(columns) + pandas_df._buffers = buffers + return pandas_df + + +def primitive_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column 
holding one of the primitive dtypes to a NumPy array. + + A primitive type is one of: int, uint, float, bool. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + buffers = col.get_buffers() + + data_buff, data_dtype = buffers["data"] + data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: + """ + Convert a column holding categorical data to a pandas Series. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of pd.Series holding the data and the memory owner object + that keeps the memory alive. + """ + categorical = col.describe_categorical + + if not categorical["is_dictionary"]: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + mapping = categorical["mapping"] + assert isinstance(mapping, dict), "Categorical mapping must be a dict" + categories = np.array(tuple(mapping[k] for k in sorted(mapping))) + buffers = col.get_buffers() + + codes_buff, codes_dtype = buffers["data"] + codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size) + + # Doing modulo in order to not get ``IndexError`` for + # out-of-bounds sentinel values in `codes` + values = categories[codes % len(categories)] + + cat = pd.Categorical( + values, categories=categories, ordered=categorical["is_ordered"] + ) + data = pd.Series(cat) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column holding string data to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. 
+ """ + null_kind, sentinel_val = col.describe_null + + if null_kind not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_BITMASK, + ColumnNullType.USE_BYTEMASK, + ): + raise NotImplementedError( + f"{null_kind} null kind is not yet supported for string columns." + ) + + buffers = col.get_buffers() + + assert buffers["offsets"], "String buffers must contain offsets" + # Retrieve the data buffer containing the UTF-8 code units + data_buff, protocol_data_dtype = buffers["data"] + # We're going to reinterpret the buffer as uint8, so make sure we can do it safely + assert protocol_data_dtype[1] == 8 # bitwidth == 8 + assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8 + # Convert the buffers to NumPy arrays. In order to go from STRING to + # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) + # Specify zero offset as we don't want to chunk the string data + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) + + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and the ending of each string + offset_buff, offset_dtype = buffers["offsets"] + # Offsets buffer contains start-stop positions of strings in the data buffer, + # meaning that it has more elements than in the data buffer, do `col.size + 1` here + # to pass a proper offsets buffer size + offsets = buffer_to_ndarray( + offset_buff, offset_dtype, col.offset, length=col.size + 1 + ) + + null_pos = None + if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + assert buffers["validity"], "Validity buffers cannot be empty for masks" + valid_buff, valid_dtype = buffers["validity"] + null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) + if sentinel_val == 0: + null_pos = ~null_pos + + # Assemble the strings from the code units + str_list: List[Union[None, float, str]] = 
[None] * col.size + for i in range(col.size): + # Check for missing values + if null_pos is not None and null_pos[i]: + str_list[i] = np.nan + continue + + # Extract a range of code units + units = data[offsets[i] : offsets[i + 1]] + + # Convert the list of code units to bytes + str_bytes = bytes(units) + + # Create the string + string = str_bytes.decode(encoding="utf-8") + + # Add to our list of strings + str_list[i] = string + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers + + +def parse_datetime_format_str(format_str, data): + """Parse datetime `format_str` to interpret the `data`.""" + # timestamp 'ts{unit}:tz' + timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) + if timestamp_meta: + unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) + if tz != "": + raise NotImplementedError("Timezones are not supported yet") + if unit != "s": + # the format string describes only a first letter of the unit, so + # add one extra letter to convert the unit to numpy-style: + # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + unit += "s" + data = data.astype(f"datetime64[{unit}]") + return data + + # date 'td{Days/Ms}' + date_meta = re.match(r"td([Dm])", format_str) + if date_meta: + unit = date_meta.group(1) + if unit == "D": + # NumPy doesn't support DAY unit, so converting days to seconds + # (converting to uint64 to avoid overflow) + data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") + elif unit == "m": + data = data.astype("datetime64[ms]") + else: + raise NotImplementedError(f"Date unit is not supported: {unit}") + return data + + raise NotImplementedError(f"DateTime kind is not supported: {format_str}") + + +def datetime_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column holding DateTime data to a NumPy array. 
+ + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + buffers = col.get_buffers() + + _, _, format_str, _ = col.dtype + dbuf, dtype = buffers["data"] + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( + dbuf, + ( + DtypeKind.UINT, + dtype[1], + getattr(ArrowCTypes, f"UINT{dtype[1]}"), + Endianness.NATIVE, + ), + col.offset, + col.size, + ) + + data = parse_datetime_format_str(format_str, data) + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def buffer_to_ndarray( + buffer: Buffer, + dtype: Tuple[DtypeKind, int, str, str], + offset: int = 0, + length: Optional[int] = None, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + + Parameters + ---------- + buffer : Buffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + + Returns + ------- + np.ndarray + + Notes + ----- + The returned array doesn't own the memory. The caller of this function is + responsible for keeping the memory owner object alive as long as + the returned NumPy array is being used. + """ + kind, bit_width, _, _ = dtype + + column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None) + if column_dtype is None: + raise NotImplementedError(f"Conversion for {dtype} is not yet supported.") + + # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column. 
Use DLPack as NumPy supports + # it since https://github.com/numpy/numpy/pull/19083 + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) + + if bit_width == 1: + assert length is not None, "`length` must be specified for a bit-mask buffer." + arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) + return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) + else: + return np.ctypeslib.as_array( + data_pointer, shape=(buffer.bufsize // (bit_width // 8),) + ) + + +def bitmask_to_bool_ndarray( + bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 +) -> np.ndarray: + """ + Convert bit-mask to a boolean NumPy array. + + Parameters + ---------- + bitmask : np.ndarray[uint8] + NumPy array of uint8 dtype representing the bitmask. + mask_length : int + Number of elements in the mask to interpret. + first_byte_offset : int, default: 0 + Number of elements to offset from the start of the first byte. 
+ + Returns + ------- + np.ndarray[bool] + """ + bytes_to_skip = first_byte_offset // 8 + bitmask = bitmask[bytes_to_skip:] + first_byte_offset %= 8 + + bool_mask = np.zeros(mask_length, dtype=bool) + + # Processing the first byte separately as it has its own offset + val = bitmask[0] + mask_idx = 0 + bits_in_first_byte = min(8 - first_byte_offset, mask_length) + for j in range(bits_in_first_byte): + if val & (1 << (j + first_byte_offset)): + bool_mask[mask_idx] = True + mask_idx += 1 + + # `mask_length // 8` describes how many full bytes to process + for i in range((mask_length - bits_in_first_byte) // 8): + # doing `+ 1` as we already processed the first byte + val = bitmask[i + 1] + for j in range(8): + if val & (1 << j): + bool_mask[mask_idx] = True + mask_idx += 1 + + if len(bitmask) > 1: + # Processing remainder of last byte + val = bitmask[-1] + for j in range(len(bool_mask) - mask_idx): + if val & (1 << j): + bool_mask[mask_idx] = True + mask_idx += 1 + + return bool_mask + + +def set_nulls( + data: Union[np.ndarray, pd.Series], + col: Column, + validity: Optional[Tuple[Buffer, Tuple[DtypeKind, int, str, str]]], + allow_modify_inplace: bool = True, +): + """ + Set null values for the data according to the column null kind. + + Parameters + ---------- + data : np.ndarray or pd.Series + Data to set nulls in. + col : Column + Column object that describes the `data`. + validity : tuple(Buffer, dtype) or None + The ``"validity"`` entry of ``col.get_buffers()``. We do not call ``col.get_buffers()`` + here to not take the ownership of the memory of buffer objects. + allow_modify_inplace : bool, default: True + Whether to modify the `data` inplace when zero-copy is possible (True) or always + modify a copy of the `data` (False). + + Returns + ------- + np.ndarray or pd.Series + Data with the nulls being set. 
+ """ + null_kind, sentinel_val = col.describe_null + null_pos = None + + if null_kind == ColumnNullType.USE_SENTINEL: + null_pos = data == sentinel_val + elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + assert validity, "Expected to have a validity buffer for the mask" + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) + if sentinel_val == 0: + null_pos = ~null_pos + elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): + pass + else: + raise NotImplementedError(f"Null kind {null_kind} is not yet supported.") + + if null_pos is not None and np.any(null_pos): + if not allow_modify_inplace: + data = data.copy() + try: + data[null_pos] = None + except TypeError: + # TypeError happens if the `data` dtype appears to be non-nullable + # in numpy notation (bool, int, uint). If this happens, + # cast the `data` to nullable float dtype. + data = data.astype(float) + data[null_pos] = None + + return data diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py new file mode 100644 index 0000000000000..0c746113babee --- /dev/null +++ b/pandas/core/exchange/utils.py @@ -0,0 +1,93 @@ +""" +Utility functions and objects for implementing the exchange API. +""" + +import re +import typing + +import numpy as np + +from pandas._typing import DtypeObj + +import pandas as pd +from pandas.api.types import is_datetime64_dtype + + +class ArrowCTypes: + """ + Enum for Apache Arrow C type format strings. 
+ + The Arrow C data interface: + https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings + """ + + NULL = "n" + BOOL = "b" + INT8 = "c" + UINT8 = "C" + INT16 = "s" + UINT16 = "S" + INT32 = "i" + UINT32 = "I" + INT64 = "l" + UINT64 = "L" + FLOAT16 = "e" + FLOAT32 = "f" + FLOAT64 = "g" + STRING = "u" # utf-8 + DATE32 = "tdD" + DATE64 = "tdm" + # Resolution: + # - seconds -> 's' + # - milliseconds -> 'm' + # - microseconds -> 'u' + # - nanoseconds -> 'n' + TIMESTAMP = "ts{resolution}:{tz}" + TIME = "tt{resolution}" + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: + """ + Represent pandas `dtype` as a format string in Apache Arrow C notation. + + Parameters + ---------- + dtype : np.dtype + Datatype of pandas DataFrame to represent. + + Returns + ------- + str + Format string in Apache Arrow C notation of the given `dtype`. + """ + if isinstance(dtype, pd.CategoricalDtype): + return ArrowCTypes.INT64 + elif dtype == np.dtype("O"): + return ArrowCTypes.STRING + + format_str = getattr(ArrowCTypes, dtype.name.upper(), None) + if format_str is not None: + return format_str + + if is_datetime64_dtype(dtype): + # Selecting the first char of resolution string: + # dtype.str -> ' DataFrameXchg: + """ + Return the dataframe exchange object implementing the exchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame exchange object + The object which consuming library can use to ingress the dataframe. 
+ + Notes + ----- + Details on the exchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + + from pandas.core.exchange.dataframe import PandasDataFrameXchg + + return PandasDataFrameXchg(self, nan_as_null, allow_copy) + # ---------------------------------------------------------------------- @property diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 2e306c76d246c..1bc2cf5085f1a 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -274,7 +274,7 @@ def test_np(): class TestApi(Base): - allowed = ["types", "extensions", "indexers"] + allowed = ["types", "extensions", "indexers", "exchange"] def test_api(self): self.check(api, self.allowed) diff --git a/pandas/tests/exchange/conftest.py b/pandas/tests/exchange/conftest.py new file mode 100644 index 0000000000000..033f44984b551 --- /dev/null +++ b/pandas/tests/exchange/conftest.py @@ -0,0 +1,12 @@ +import pytest + +import pandas as pd + + +@pytest.fixture(scope="package") +def df_from_dict(): + def maker(dct, is_categorical=False): + df = pd.DataFrame(dct) + return df.astype("category") if is_categorical else df + + return maker diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py new file mode 100644 index 0000000000000..a40e5fef789ad --- /dev/null +++ b/pandas/tests/exchange/test_impl.py @@ -0,0 +1,176 @@ +from datetime import datetime +import random + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.exchange.dataframe_protocol import ( + ColumnNullType, + DtypeKind, +) +from pandas.core.exchange.from_dataframe import from_dataframe + +test_data_categorical = { + "ordered": pd.Categorical(list("testdata") * 30, ordered=True), + "unordered": pd.Categorical(list("testdata") * 30, ordered=False), +} + 
+NCOLS, NROWS = 100, 200 + + +def _make_data(make_one): + return { + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [make_one() for _ in range(NROWS)] + for i in range(NCOLS) + } + + +int_data = _make_data(lambda: random.randint(-100, 100)) +uint_data = _make_data(lambda: random.randint(1, 100)) +bool_data = _make_data(lambda: random.choice([True, False])) +float_data = _make_data(lambda: random.random()) +datetime_data = _make_data( + lambda: datetime( + year=random.randint(1900, 2100), + month=random.randint(1, 12), + day=random.randint(1, 20), + ) +) + +string_data = { + "separator data": [ + "abC|DeF,Hik", + "234,3245.67", + "gSaf,qWer|Gre", + "asd3,4sad|", + np.NaN, + ] +} + + +@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)]) +def test_categorical_dtype(data): + df = pd.DataFrame({"A": (test_data_categorical[data[0]])}) + + col = df.__dataframe__().get_column_by_name("A") + assert col.dtype[0] == DtypeKind.CATEGORICAL + assert col.null_count == 0 + assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1) + assert col.num_chunks() == 1 + assert col.describe_categorical == { + "is_ordered": data[1], + "is_dictionary": True, + "mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"}, + } + + tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) + + +@pytest.mark.parametrize( + "data", [int_data, uint_data, float_data, bool_data, datetime_data] +) +def test_dataframe(data): + df = pd.DataFrame(data) + + df2 = df.__dataframe__() + + assert df2.num_columns() == NCOLS + assert df2.num_rows() == NROWS + + assert list(df2.column_names()) == list(data.keys()) + + indices = (0, 2) + names = tuple(list(data.keys())[idx] for idx in indices) + + tm.assert_frame_equal( + from_dataframe(df2.select_columns(indices)), + from_dataframe(df2.select_columns_by_name(names)), + ) + + +def test_missing_from_masked(): + df = pd.DataFrame( + { + "x": np.array([1, 2, 3, 4, 0]), + "y": np.array([1.5, 2.5, 3.5, 4.5, 0]), + "z": np.array([True, False, True, 
True, True]), + } + ) + + df2 = df.__dataframe__() + + rng = np.random.RandomState(42) + dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns} + for col, num_nulls in dict_null.items(): + null_idx = df.index[ + rng.choice(np.arange(len(df)), size=num_nulls, replace=False) + ] + df.loc[null_idx, col] = None + + df2 = df.__dataframe__() + + assert df2.get_column_by_name("x").null_count == dict_null["x"] + assert df2.get_column_by_name("y").null_count == dict_null["y"] + assert df2.get_column_by_name("z").null_count == dict_null["z"] + + +@pytest.mark.parametrize( + "data", + [ + {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]}, + {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]}, + { + "x": np.array([True, True, False]), + "y": np.array([1, 2, 0]), + "z": np.array([9.2, 10.5, 11.8]), + }, + ], +) +def test_mixed_data(data): + df = pd.DataFrame(data) + df2 = df.__dataframe__() + + for col_name in df.columns: + assert df2.get_column_by_name(col_name).null_count == 0 + + +def test_mixed_missing(): + df = pd.DataFrame( + { + "x": np.array([True, None, False, None, True]), + "y": np.array([None, 2, None, 1, 2]), + "z": np.array([9.2, 10.5, None, 11.8, None]), + } + ) + + df2 = df.__dataframe__() + + for col_name in df.columns: + assert df2.get_column_by_name(col_name).null_count == 2 + + +def test_string(): + test_str_data = string_data["separator data"] + [""] + df = pd.DataFrame({"A": test_str_data}) + col = df.__dataframe__().get_column_by_name("A") + + assert col.size == 6 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.STRING + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + + df_sliced = df[1:] + col = df_sliced.__dataframe__().get_column_by_name("A") + assert col.size == 5 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.STRING + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + + +def test_nonstring_object(): + df = pd.DataFrame({"A": ["a", 10, 1.0, ()]}) + col = 
df.__dataframe__().get_column_by_name("A") + with pytest.raises(NotImplementedError, match="not supported yet"): + col.dtype diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py new file mode 100644 index 0000000000000..f5b8bb569f35e --- /dev/null +++ b/pandas/tests/exchange/test_spec_conformance.py @@ -0,0 +1,160 @@ +""" +A verbatim copy (vendored) of the spec tests. +Taken from https://github.com/data-apis/dataframe-api +""" +import ctypes +import math + +import pytest + + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) +def test_only_one_dtype(test_data, df_from_dict): + columns = list(test_data.keys()) + df = df_from_dict(test_data) + dfX = df.__dataframe__() + + column_size = len(test_data[columns[0]]) + for column in columns: + assert dfX.get_column_by_name(column).null_count == 0 + assert dfX.get_column_by_name(column).size == column_size + assert dfX.get_column_by_name(column).offset == 0 + + +def test_mixed_dtypes(df_from_dict): + df = df_from_dict( + { + "a": [1, 2, 3], # dtype kind INT = 0 + "b": [3, 4, 5], # dtype kind INT = 0 + "c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2 + "d": [9, 10, 11], # dtype kind INT = 0 + "e": [True, False, True], # dtype kind BOOLEAN = 20 + "f": ["a", "", "c"], # dtype kind STRING = 21 + } + ) + dfX = df.__dataframe__() + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere*; + # values for dtype[0] are explained above + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + + for column, kind in columns.items(): + colX = dfX.get_column_by_name(column) + assert colX.null_count == 0 + assert colX.size == 3 + assert colX.offset == 0 + + assert colX.dtype[0] == kind + + assert 
dfX.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(df_from_dict): + df = df_from_dict({"a": [1.0, math.nan, 2.0]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + assert colX.null_count == 1 + + +def test_noncategorical(df_from_dict): + df = df_from_dict({"a": [1, 2, 3]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + with pytest.raises(TypeError, match=".*categorical.*"): + colX.describe_categorical + + +def test_categorical(df_from_dict): + df = df_from_dict( + {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, + is_categorical=True, + ) + + colX = df.__dataframe__().get_column_by_name("weekday") + categorical = colX.describe_categorical + assert isinstance(categorical["is_ordered"], bool) + assert isinstance(categorical["is_dictionary"], bool) + + +def test_dataframe(df_from_dict): + df = df_from_dict( + {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]} + ) + dfX = df.__dataframe__() + + assert dfX.num_columns() == 3 + assert dfX.num_rows() == 3 + assert dfX.num_chunks() == 1 + assert list(dfX.column_names()) == ["x", "y", "z"] + assert list(dfX.select_columns((0, 2)).column_names()) == list( + dfX.select_columns_by_name(("x", "z")).column_names() + ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size for chunk in chunks) == size + + +def 
test_get_columns(df_from_dict): + df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) + dfX = df.__dataframe__() + for colX in dfX.get_columns(): + assert colX.size == 2 + assert colX.num_chunks() == 1 + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dfX.get_column(0).dtype[0] == 0 # INT + assert dfX.get_column(1).dtype[0] == 2 # FLOAT + + +def test_buffer(df_from_dict): + arr = [0, 1, -1] + df = df_from_dict({"a": arr}) + dfX = df.__dataframe__() + colX = dfX.get_column(0) + bufX = colX.get_buffers() + + dataBuf, dataDtype = bufX["data"] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__() + + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dataDtype[0] == 0 # INT + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/exchange/test_utils.py new file mode 100644 index 0000000000000..4c80ecf0d23a0 --- /dev/null +++ b/pandas/tests/exchange/test_utils.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.exchange.utils import dtype_to_arrow_c_fmt + +# TODO: use ArrowSchema to get reference C-string. +# At the time, there is no way to access ArrowSchema holding a type format string +# from python. 
The only way to access it is to export the structure to a C-pointer, +# see DataType._export_to_c() method defined in +# https://github.com/apache/arrow/blob/master/python/pyarrow/types.pxi + + +@pytest.mark.parametrize( + "pandas_dtype, c_string", + [ + (np.dtype("bool"), "b"), + (np.dtype("int8"), "c"), + (np.dtype("uint8"), "C"), + (np.dtype("int16"), "s"), + (np.dtype("uint16"), "S"), + (np.dtype("int32"), "i"), + (np.dtype("uint32"), "I"), + (np.dtype("int64"), "l"), + (np.dtype("uint64"), "L"), + (np.dtype("float16"), "e"), + (np.dtype("float32"), "f"), + (np.dtype("float64"), "g"), + (pd.Series(["a"]).dtype, "u"), + ( + pd.Series([0]).astype("datetime64[ns]").dtype, + "tsn:", + ), + (pd.CategoricalDtype(["a"]), "l"), + (np.dtype("O"), "u"), + ], +) +def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # PR01 + """Test ``dtype_to_arrow_c_fmt`` utility function.""" + assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string