From 8eab8a2e44b04bca89983bd138aa5ecdf0eaaf59 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 10 Feb 2022 20:43:32 +0300 Subject: [PATCH 1/6] Declare enums explicitly, fix hints Signed-off-by: Vasily Litvinov --- protocol/dataframe_protocol.py | 74 +++++++++++++++++----------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 14854133..9bd356c4 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -1,3 +1,32 @@ +from typing import Tuple, Optional, Dict, Any, Iterable, Sequence +import enum + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +class ColumnNullType: + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + class Buffer: """ Data in the buffer is guaranteed to be contiguous in memory. @@ -41,20 +70,11 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: """ Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. Enum members are:: - - - CPU = 1 - - CUDA = 2 - - CPU_PINNED = 3 - - OPENCL = 4 - - VULKAN = 7 - - METAL = 8 - - VPI = 9 - - ROCM = 10 + Uses device type codes matching DLPack. Note: must be implemented even if ``__dlpack__`` is not. """ @@ -128,20 +148,10 @@ def offset(self) -> int: pass @property - def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: + def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. 
- Kind : - - - INT = 0 - - UINT = 1 - - FLOAT = 2 - - BOOL = 20 - - STRING = 21 # UTF-8 - - DATETIME = 22 - - CATEGORICAL = 23 - Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. @@ -194,19 +204,11 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: pass @property - def describe_null(self) -> Tuple[int, Any]: + def describe_null(self) -> Tuple[ColumnNullType, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. - Kind: - - - 0 : non-nullable - - 1 : NaN/NaT - - 2 : sentinel value - - 3 : bit mask - - 4 : byte mask - Value : if kind is "sentinel value", the actual value. If kind is a bit mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. @@ -235,7 +237,7 @@ def num_chunks(self) -> int: """ pass - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. @@ -243,7 +245,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass - def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: + def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: """ Return a dictionary containing the underlying buffers. @@ -368,19 +370,19 @@ def get_columns(self) -> Iterable[Column]: """ pass - def select_columns(self, indices: Sequence[int]) -> DataFrame: + def select_columns(self, indices: Sequence[int]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by index. """ pass - def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by name. 
""" pass - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]: + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. From 2b35e5d71c6d3cdaa09bc11bf3d4561998f0083a Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 20:54:18 +0300 Subject: [PATCH 2/6] Align spec with existing implementations Signed-off-by: Vasily Litvinov --- protocol/dataframe_protocol.py | 130 ++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 35 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 9bd356c4..27313e74 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -1,7 +1,11 @@ -from typing import Tuple, Optional, Dict, Any, Iterable, Sequence +from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict import enum +from abc import ABC, abstractmethod + class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + CPU = 1 CUDA = 2 CPU_PINNED = 3 @@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum): VPI = 9 ROCM = 10 + class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + INT = 0 UINT = 1 FLOAT = 2 @@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 -class ColumnNullType: + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN/NaT value. 
+ USE_SENTINEL : int + Sentinel value besides NaN/NaT. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + NON_NULLABLE = 0 USE_NAN = 1 USE_SENTINEL = 2 USE_BITMASK = 3 USE_BYTEMASK = 4 -class Buffer: + +class ColumnBuffers(TypedDict): + data: Tuple["Buffer", Any] # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values + # indicating missing data and second element is + # the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the + # offset values for variable-size binary data + # (e.g., variable-length strings) and + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have + # an associated offsets buffer + + +class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -43,6 +103,7 @@ class Buffer: """ @property + @abstractmethod def bufsize(self) -> int: """ Buffer size in bytes. @@ -50,12 +111,14 @@ def bufsize(self) -> int: pass @property + @abstractmethod def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ pass + @abstractmethod def __dlpack__(self): """ Produce DLPack capsule (see array API standard). @@ -70,18 +133,17 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. 
""" pass -class Column: +class Column(ABC): """ A column object, with only the methods and properties required by the interchange protocol defined. @@ -123,10 +185,10 @@ class Column: Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. - """ @property + @abstractmethod def size(self) -> Optional[int]: """ Size of the column, in elements. @@ -137,6 +199,7 @@ def size(self) -> Optional[int]: pass @property + @abstractmethod def offset(self) -> int: """ Offset of first element. @@ -148,6 +211,7 @@ def offset(self) -> int: pass @property + @abstractmethod def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. @@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) - Masks must be specified as boolean with either bit width 1 (for bit @@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: pass @property + @abstractmethod def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: """ If the dtype is categorical, there are two options: - - There are only values in the data buffer. - There is a separate non-categorical Column encoding categorical values. - Raises RuntimeError if the dtype is not categorical - - Content of returned dict: + Raises TypeError if the dtype is not categorical + Returns the description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. 
- "is_dictionary" : bool, whether a mapping of @@ -204,6 +266,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: pass @property + @abstractmethod def describe_null(self) -> Tuple[ColumnNullType, Any]: """ Return the missing value (or "null") representation the column dtype @@ -216,6 +279,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: pass @property + @abstractmethod def null_count(self) -> Optional[int]: """ Number of null elements, if known. @@ -225,18 +289,21 @@ def null_count(self) -> Optional[int]: pass @property + @abstractmethod def metadata(self) -> Dict[str, Any]: """ The metadata for the column. See `DataFrame.metadata` for more details. """ pass + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ pass + @abstractmethod def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. @@ -245,7 +312,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ pass - def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: + @abstractmethod + def get_buffers(self) -> ColumnBuffers: """ Return a dictionary containing the underlying buffers. @@ -276,7 +344,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], # pass -class DataFrame: +class DataFrame(ABC): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -290,29 +358,11 @@ class DataFrame: ``__dataframe__`` method of a public data frame class in a library adhering to the dataframe interchange protocol specification. """ - def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> dict: - """ - Produces a dictionary object following the dataframe protocol specification. 
- ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. - """ - self._nan_as_null = nan_as_null - self._allow_zero_zopy = allow_copy - return { - "dataframe": self, # DataFrame object adhering to the protocol - "version": 0 # Version number of the protocol - } + version = 0 # version of the protocol @property + @abstractmethod def metadata(self) -> Dict[str, Any]: """ The metadata for the data frame, as a dictionary with string keys. The @@ -325,12 +375,14 @@ def metadata(self) -> Dict[str, Any]: """ pass + @abstractmethod def num_columns(self) -> int: """ Return the number of columns in the DataFrame. """ pass + @abstractmethod def num_rows(self) -> Optional[int]: # TODO: not happy with Optional, but need to flag it may be expensive # why include it if it may be None - what do we expect consumers @@ -340,48 +392,56 @@ def num_rows(self) -> Optional[int]: """ pass + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the DataFrame consists of. """ pass + @abstractmethod def column_names(self) -> Iterable[str]: """ Return an iterator yielding the column names. """ pass + @abstractmethod def get_column(self, i: int) -> Column: """ Return the column at the indicated position. """ pass + @abstractmethod def get_column_by_name(self, name: str) -> Column: """ Return the column whose name is the indicated name. """ pass + @abstractmethod def get_columns(self) -> Iterable[Column]: """ Return an iterator yielding the columns. 
""" pass + @abstractmethod def select_columns(self, indices: Sequence[int]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by index. """ pass + @abstractmethod def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by name. """ pass + @abstractmethod def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. From 6b49f22d91e959f42a319f276492125ac4e42be8 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 21:25:23 +0300 Subject: [PATCH 3/6] Format the spec with black Signed-off-by: Vasily Litvinov --- protocol/dataframe_protocol.py | 35 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 27313e74..a403ef37 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -42,7 +42,7 @@ class DtypeKind(enum.IntEnum): UINT = 1 FLOAT = 2 BOOL = 20 - STRING = 21 # UTF-8 + STRING = 21 # UTF-8 DATETIME = 22 CATEGORICAL = 23 @@ -73,18 +73,20 @@ class ColumnNullType(enum.IntEnum): class ColumnBuffers(TypedDict): - data: Tuple["Buffer", Any] # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values - # indicating missing data and second element is - # the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the - # offset values for variable-size binary data - # (e.g., variable-length strings) and - # second element is the offsets buffer's associated dtype. 
- # None if the data buffer does not have - # an associated offsets buffer + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Any] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Any]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Any]] class Buffer(ABC): @@ -304,7 +306,7 @@ def num_chunks(self) -> int: pass @abstractmethod - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. @@ -336,6 +338,7 @@ def get_buffers(self) -> ColumnBuffers: """ pass + # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator @@ -359,7 +362,7 @@ class DataFrame(ABC): to the dataframe interchange protocol specification. """ - version = 0 # version of the protocol + version = 0 # version of the protocol @property @abstractmethod @@ -442,7 +445,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": pass @abstractmethod - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. 
From d772b47781a54960360e4e57a5e348c36605debe Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 1 Apr 2022 16:09:29 +0300 Subject: [PATCH 4/6] Change API a bit, align formatting with pandas Signed-off-by: Vasily Litvinov --- protocol/dataframe_protocol.py | 49 ++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index a403ef37..6eaae7b4 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -1,6 +1,17 @@ -from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict +from abc import ( + ABC, + abstractmethod, +) import enum -from abc import ABC, abstractmethod +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) class DlpackDeviceType(enum.IntEnum): @@ -89,6 +100,16 @@ class ColumnBuffers(TypedDict): offsets: Optional[Tuple["Buffer", Any]] +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Optional[Column] + + class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -191,7 +212,7 @@ class Column(ABC): @property @abstractmethod - def size(self) -> Optional[int]: + def size(self) -> int: """ Size of the column, in elements. @@ -246,7 +267,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: @property @abstractmethod - def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. 
@@ -254,7 +275,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: Raises TypeError if the dtype is not categorical - Returns the description on how to interpret the data buffer: + Returns the dictionary with description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - "is_dictionary" : bool, whether a mapping of @@ -364,6 +385,24 @@ class DataFrame(ABC): version = 0 # version of the protocol + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "DataFrame": + """ + Construct a new exchange object, potentially changing the parameters. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + pass + @property @abstractmethod def metadata(self) -> Dict[str, Any]: From 0e9e17332c6d61c12bce40fd89efb2d6bf790d07 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 28 Jul 2022 14:21:11 +0200 Subject: [PATCH 5/6] Remove NaT (not-a-datetime) from the `USE_NAN` description. This addresses the review comment that NaT is not a thing outside of NumPy. Hence for not-a-datetime, all implementers should be using sentinel values, because those are explicit. 
--- protocol/dataframe_protocol.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 6eaae7b4..b2705e31 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -67,9 +67,9 @@ class ColumnNullType(enum.IntEnum): NON_NULLABLE : int Non-nullable column. USE_NAN : int - Use explicit float NaN/NaT value. + Use explicit float NaN value. USE_SENTINEL : int - Sentinel value besides NaN/NaT. + Sentinel value besides NaN. USE_BITMASK : int The bit is set/unset representing a null on a certain position. USE_BYTEMASK : int @@ -393,7 +393,7 @@ def __dataframe__( Construct a new exchange object, potentially changing the parameters. ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + producer to overwrite null values in the data with ``NaN``. It is intended for cases where the consumer does not support the bit mask or byte mask that is the producer's native representation. ``allow_copy`` is a keyword that defines whether or not the library is From f1f1eac5c9660097ce2838bf9cfe8b8916f35af2 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 29 Jul 2022 15:24:03 +0200 Subject: [PATCH 6/6] Change `Column.size` from a property to a method --- protocol/dataframe_protocol.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index b2705e31..adde1a48 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -210,7 +210,6 @@ class Column(ABC): doesn't need its own version or ``__column__`` protocol. """ - @property @abstractmethod def size(self) -> int: """ @@ -218,6 +217,9 @@ def size(self) -> int: Corresponds to DataFrame.num_rows() if column is a single chunk; equal to size of this current chunk otherwise. 
+ + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. """ pass