diff --git a/altair/utils/_dfi_types.py b/altair/utils/_dfi_types.py index 16b83fb4d..a76435e7f 100644 --- a/altair/utils/_dfi_types.py +++ b/altair/utils/_dfi_types.py @@ -1,34 +1,11 @@ # DataFrame Interchange Protocol Types -# Copied from https://data-apis.org/dataframe-protocol/latest/API.html +# Copied from https://data-apis.org/dataframe-protocol/latest/API.html, +# changed ABCs to Protocols, and subset the type hints to only those that are +# relevant for Altair. # # These classes are only for use in type signatures -from abc import ( - ABC, - abstractmethod, -) import enum -from typing import ( - Any, - Dict, - Iterable, - Optional, - Sequence, - Tuple, - TypedDict, -) - - -class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 +from typing import Any, Iterable, Optional, Tuple, Protocol class DtypeKind(enum.IntEnum): @@ -62,188 +39,15 @@ class DtypeKind(enum.IntEnum): CATEGORICAL = 23 -Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype - - -class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. - """ - - NON_NULLABLE = 0 - USE_NAN = 1 - USE_SENTINEL = 2 - USE_BITMASK = 3 - USE_BYTEMASK = 4 - - -class ColumnBuffers(TypedDict): - # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - data: Tuple["Buffer", Dtype] - - # first element is a buffer containing mask values indicating missing data; - # second element is the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - validity: Optional[Tuple["Buffer", Dtype]] - - # first element is a buffer containing the offset values for - # variable-size binary data (e.g., variable-length strings); - # second element is the offsets buffer's associated dtype. - # None if the data buffer does not have an associated offsets buffer - offsets: Optional[Tuple["Buffer", Dtype]] - - -class CategoricalDescription(TypedDict): - # whether the ordering of dictionary indices is semantically meaningful - is_ordered: bool - # whether a dictionary-style mapping of categorical values to other objects exists - is_dictionary: bool - # Python-level only (e.g. ``{int: str}``). - # None if not a dictionary-style categorical. - categories: "Optional[Column]" +# Type hint of first element would actually be DtypeKind but can't use that +# as other libraries won't use an instance of our own Enum in this module but have +# their own. Type checkers will raise an error on that even though the enums +# are identical. +Dtype = Tuple[Any, int, str, str] # see Column.dtype -class Buffer(ABC): - """ - Data in the buffer is guaranteed to be contiguous in memory. - - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ - +class Column(Protocol): @property - @abstractmethod - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - pass - - @property - @abstractmethod - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - pass - - @abstractmethod - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - - Raises: - - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - raise NotImplementedError("__dlpack__") - - @abstractmethod - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ - pass - - -class Column(ABC): - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - - @abstractmethod - def size(self) -> int: - """ - Size of the column, in elements. - - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ - pass - - @property - @abstractmethod - def offset(self) -> int: - """ - Offset of first element. - - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ - pass - - @property - @abstractmethod def dtype(self) -> Dtype: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. @@ -274,9 +78,13 @@ def dtype(self) -> Dtype: """ pass + # Have to use a generic Any return type as not all libraries who implement + # the dataframe interchange protocol implement the TypedDict that is usually + # returned here in the same way. As TypedDicts are invariant, even a slight change + # will lead to an error by a type checker. See PR in which this code was added + # for details. @property - @abstractmethod - def describe_categorical(self) -> CategoricalDescription: + def describe_categorical(self) -> Any: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. @@ -297,87 +105,8 @@ def describe_categorical(self) -> CategoricalDescription: """ pass - @property - @abstractmethod - def describe_null(self) -> Tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. None - otherwise. - """ - pass - - @property - @abstractmethod - def null_count(self) -> Optional[int]: - """ - Number of null elements, if known. - - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ - pass - - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. - """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - pass - - @abstractmethod - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: - """ - Return an iterator yielding the chunks. - - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - pass - - @abstractmethod - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - The returned dictionary has the following contents: - - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ - pass - - -# def get_children(self) -> Iterable[Column]: -# """ -# Children columns underneath the column, each object in this iterator -# must adhere to the column specification. -# """ -# pass - - -class DataFrame(ABC): +class DataFrame(Protocol): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -392,9 +121,6 @@ class DataFrame(ABC): to the dataframe interchange protocol specification. """ - version = 0 # version of the protocol - - @abstractmethod def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ) -> "DataFrame": @@ -412,87 +138,18 @@ def __dataframe__( """ pass - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - pass - - @abstractmethod - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. - """ - pass - - @abstractmethod - def num_rows(self) -> Optional[int]: - # TODO: not happy with Optional, but need to flag it may be expensive - # why include it if it may be None - what do we expect consumers - # to do here? - """ - Return the number of rows in the DataFrame, if available. - """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - pass - - @abstractmethod def column_names(self) -> Iterable[str]: """ Return an iterator yielding the column names. """ pass - @abstractmethod - def get_column(self, i: int) -> Column: - """ - Return the column at the indicated position. - """ - pass - - @abstractmethod def get_column_by_name(self, name: str) -> Column: """ Return the column whose name is the indicated name. """ pass - @abstractmethod - def get_columns(self) -> Iterable[Column]: - """ - Return an iterator yielding the columns. - """ - pass - - @abstractmethod - def select_columns(self, indices: Sequence[int]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - pass - - @abstractmethod - def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - pass - - @abstractmethod def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. diff --git a/altair/utils/core.py b/altair/utils/core.py index ea8abf1f1..f6fec3801 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -46,7 +46,9 @@ class DataFrameLike(Protocol): - def __dataframe__(self, *args, **kwargs) -> DfiDataFrame: + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DfiDataFrame: ... diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst index 998fbe3e9..c8de5018c 100644 --- a/doc/releases/changes.rst +++ b/doc/releases/changes.rst @@ -12,6 +12,7 @@ Enhancements Bug Fixes ~~~~~~~~~ +- Fix type hints for libraries such as Polars where Altair uses the dataframe interchange protocol (#3297) Backward-Incompatible Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~