From ac5896714572550f36300aaf4ca0423c6444b953 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 21 Feb 2022 20:13:50 +0300 Subject: [PATCH 01/49] Vendor smoke tests from consortium Signed-off-by: Vasily Litvinov --- pandas/tests/api/conftest.py | 8 ++ pandas/tests/api/test_protocol.py | 140 ++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 pandas/tests/api/conftest.py create mode 100644 pandas/tests/api/test_protocol.py diff --git a/pandas/tests/api/conftest.py b/pandas/tests/api/conftest.py new file mode 100644 index 0000000000000..5d3c42870cbb8 --- /dev/null +++ b/pandas/tests/api/conftest.py @@ -0,0 +1,8 @@ +import pytest +import pandas as pd + +@pytest.fixture(scope='package') +def create_df_from_dict(): + def maker(dct): + return pd.DataFrame(dct) + return maker diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py new file mode 100644 index 0000000000000..891233661d498 --- /dev/null +++ b/pandas/tests/api/test_protocol.py @@ -0,0 +1,140 @@ +import pytest +import numpy as np + +@pytest.mark.parametrize("test_data", + [ + ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])], + 'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]}, + np.object_, None), + ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None), + ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64) + ], + ids=["array_data", "float_data", "int_data"]) +def test_only_one_data(test_data, create_df_from_dict): + data, dtype, new_dtype = test_data + columns = list(data.keys()) + df = create_df_from_dict(data) + df2 = df.__dataframe__() + new_dtype = dtype if new_dtype is None else new_dtype + assert df.columns.values.tolist() == columns + val = len(df[columns[0]])-1 + column_size = df.size + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df[column].dtype.type is dtype + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == column_size + assert df2.get_column_by_name(column).offset == 0 + assert not df2["x"].is_masked + n = np.random.randint(0, val) + (df[column])[n] = None + assert df[column].dtype.type is new_dtype + assert df2.get_column_by_name(column).null_count == 1 + + +def test_float_int(create_df_from_dict): + df = create_df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5], + 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]}) + df2 = df.__dataframe__() + columns = ['a', 'b', 'c', 'd'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + if column is 'c': + assert df[column].dtype.type is np.float64 + else: + assert df[column].dtype.type is np.int64 + + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == 3 + assert df2.get_column_by_name(column).offset == 0 + + n = np.random.randint(0, 2) + (df[column])[n] = None + assert df[column].dtype.type is np.float64 + assert df2.get_column_by_name(column).null_count == 1 + + +def test_mixed_intfloatbool(create_df_from_dict): + df = create_df_from_dict({"x": np.array([True, True, False]), + "y": np.array([1, 2, 0]), + "z": np.array([9.2, 10.5, 11.8])}) + df2 = df.__dataframe__() + columns = ['x', 'y', 'z'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df2.get_column_by_name(column).null_count == 0 + assert df2.get_column_by_name(column).size == 3 + assert df2.get_column_by_name(column).offset == 0 + + assert 
df["x"].dtype.type is np.bool_ + assert df["y"].dtype.type is np.int32 + assert df["z"].dtype.type is np.float64 + + assert df2.get_column_by_name("x")._allow_copy == True + + for column in columns: + n = np.random.randint(0, 2) + (df[column])[n] = None + if column is "x": + assert df[column].dtype.type is np.object_ + else: + assert df[column].dtype.type is np.float64 + assert df2.get_column_by_name(column).null_count == 1 + + +def test_string_dtype(create_df_from_dict): + df = create_df_from_dict({"A": ["a", "b", "cdef", "", "g"]}) + df2 = df.__dataframe__() + columns = ['A'] + assert df.columns.values.tolist() == columns + for column in columns: + assert df[column].tolist() == df[column].tolist() + assert df[column].dtype.type is np.object_ + assert df2.get_column_by_name(column).null_count == 0 + + +def test_categorical(create_df_from_dict): + df = create_df_from_dict({"year": [2012, 2013, 2015, 2019], "weekday": [0, 1, 4, 6]}) + df = df.categorize("year", min_value=2012, max_value=2019) + df = df.categorize("weekday", labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]) + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name("year") + assert col.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019}) + assert col.describe_null == (0, None) + col2 = df.__dataframe__().get_column_by_name("weekday") + assert col2.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}) + assert col2.describe_null == (0, None) + + +def test_dataframe(create_df_from_dict): + df = create_df_from_dict({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) + df2 = df.__dataframe__() + assert df2._allow_copy == True + assert df2.num_columns() == 3 + assert df2.num_rows() == 3 + assert df2.num_chunks() == 1 + assert df2.column_names() == ["x", "y", "z"] + assert df2.select_columns((0, 2))._df[:, 0].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 0].tolist() + assert df2.select_columns((0, 2))._df[:, 1].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 1].tolist() + + +def test_chunks(create_df_from_dict): + df = create_df_from_dict({"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}) + df2 = df.__dataframe__() + chunk_iter = iter(df2.get_chunks(3)) + chunk = next(chunk_iter) + assert chunk.num_rows() == 4 + chunk = next(chunk_iter) + assert chunk.num_rows() == 4 + chunk = next(chunk_iter) + assert chunk.num_rows() == 2 + with pytest.raises(StopIteration): + chunk = next(chunk_iter) + + +def test_get_chunks(create_df_from_dict): + df = create_df_from_dict({"x": [1]}) + df2 = df.__dataframe__() + assert df2.get_chunks() == 1 From fce881e983557097593e89e1f8352de4617baa29 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 21 Feb 2022 20:47:08 +0300 Subject: [PATCH 02/49] Vendor dataframe_protocol spec Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 358 ++++++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 pandas/api/exchange/dataframe_protocol.py diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py new file mode 100644 index 0000000000000..0b388878e2891 --- /dev/null +++ b/pandas/api/exchange/dataframe_protocol.py @@ -0,0 +1,358 @@ +from typing import Tuple, Optional, Dict, Any, Iterable, Sequence +import enum + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 
8 + VPI = 9 + ROCM = 10 + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +class ColumnNullType: + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class Buffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. 
+ Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + @property + def size(self) -> Optional[int]: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + pass + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + def describe_categorical(self) -> Dict[bool, bool, Optional[dict]]: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + Raises RuntimeError if the dtype is not categorical + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? + """ + pass + + @property + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. 
See `DataFrame.metadata` for more details. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True) -> dict: + """ + Produces a dictionary object following the dataframe protocol specification. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + self._nan_as_null = nan_as_null + self._allow_zero_zopy = allow_copy + return { + "dataframe": self, # DataFrame object adhering to the protocol + "version": 0 # Version number of the protocol + } + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. 
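
The namespacing convention described here is followed by the pandas implementation added later in this series, which exposes the frame's index as ``"pandas.index"``. A consumer could pick out a given producer's entries by prefix; a small usage sketch (assuming the patches in this series are applied):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    meta = df.__dataframe__().metadata          # {"pandas.index": RangeIndex(...)}
    pandas_meta = {k: v for k, v in meta.items() if k.startswith("pandas.")}
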
+ """ + pass + + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + pass + + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + """ + pass From 02946f898a0c75f927e9d24da1b7a04ba9dc5c96 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 21 Feb 2022 20:47:27 +0300 Subject: [PATCH 03/49] Copy over the prototype and polish it a bit Signed-off-by: Vasily Litvinov --- pandas/api/exchange/__init__.py | 0 pandas/api/exchange/implementation.py | 625 ++++++++++++++++++++++++++ 2 files changed, 625 insertions(+) create mode 100644 pandas/api/exchange/__init__.py create mode 100644 pandas/api/exchange/implementation.py diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py new file mode 100644 index 0000000000000..94000ce576acc --- /dev/null +++ b/pandas/api/exchange/implementation.py @@ -0,0 +1,625 @@ +import collections +import ctypes + +from typing import Tuple, Any + +from .dataframe_protocol import Buffer, Column, DataFrame as DataFrameXchg, DtypeKind, DlpackDeviceType + +import pandas as pd +import numpy as np + + +def from_dataframe(df : DataFrameXchg, + allow_copy : bool = True) -> pd.DataFrame: + """ + Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` + """ + # NOTE: commented out for roundtrip testing + # if isinstance(df, pd.DataFrame): + # return df + + if not hasattr(df, '__dataframe__'): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df : DataFrameXchg) -> pd.DataFrame: + """ + Note: not all cases are handled yet, only ones that can be implemented with + only Pandas. Later, we need to implement/test support for categoricals, + bit/byte masks, chunk handling, etc. 
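
As a round-trip sketch of the two entry points above (assuming the patches in this series are applied; calling the private ``_from_dataframe`` directly forces the buffer-based path instead of the ``isinstance`` shortcut in ``from_dataframe``):

    import pandas as pd
    from pandas.api.exchange.implementation import _from_dataframe

    df = pd.DataFrame({"a": [1, 2, 3], "b": [9.2, 10.5, 11.8]})
    df2 = _from_dataframe(df.__dataframe__())
    assert df2["a"].tolist() == [1, 2, 3]
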
+ """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError + + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + columns = dict() + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + if col.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): + # Simple numerical or bool dtype, turn into numpy array + columns[name], _buf = convert_column_to_ndarray(col) + elif col.dtype[0] == DtypeKind.CATEGORICAL: + columns[name], _buf = convert_categorical_column(col) + elif col.dtype[0] == DtypeKind.STRING: + columns[name], _buf = convert_string_column(col) + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + + _buffers.append(_buf) + + df_new = pd.DataFrame(columns) + df_new._buffers = _buffers + return df_new + + +def convert_column_to_ndarray(col : Column) -> Tuple[np.ndarray, Buffer]: + """ + Convert an int, uint, float or bool column to a numpy array. + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + if col.describe_null[0] not in (0, 1): + raise NotImplementedError("Null values represented as masks or " + "sentinel values not handled yet") + + _buffer, _dtype = col.get_buffers()["data"] + return buffer_to_ndarray(_buffer, _dtype), _buffer + + +def buffer_to_ndarray(_buffer : Buffer, _dtype) -> np.ndarray: + # Handle the dtype + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} + _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} + _floats = {32: np.float32, 64: np.float64} + _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + column_dtype = _np_dtypes[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, + shape=(_buffer.bufsize // (bitwidth//8),)) + + return x + + +def convert_categorical_column(col : Column) -> Tuple[pd.Series, Buffer]: + """ + Convert a categorical column to a Series instance. 
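
The pointer-based reconstruction in ``buffer_to_ndarray`` above can be seen in isolation: the rebuilt array is a zero-copy view over the source memory, which is why the caller must keep the owning object alive. A standalone sketch (not part of the patch itself):

    import ctypes
    import numpy as np

    src = np.array([1.5, 2.5, 3.5])                   # owns the memory
    ptr = src.__array_interface__["data"][0]          # what Buffer.ptr exposes
    ctype = np.ctypeslib.as_ctypes_type(np.float64)
    view = np.ctypeslib.as_array(
        ctypes.cast(ptr, ctypes.POINTER(ctype)), shape=(src.size,))
    assert view.tolist() == src.tolist()              # shares memory with src
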
+    """
+    ordered, is_dict, mapping = col.describe_categorical
+    if not is_dict:
+        raise NotImplementedError('Non-dictionary categoricals not supported yet')
+
+    # If you want to cheat for testing (can't use `_col` in real-world code):
+    #    categories = col._col.values.categories.values
+    #    codes = col._col.values.codes
+    categories = np.asarray(list(mapping.values()))
+    codes_buffer, codes_dtype = col.get_buffers()["data"]
+    codes = buffer_to_ndarray(codes_buffer, codes_dtype)
+    values = categories[codes]
+
+    # Seems like Pandas can only construct with non-null values, so need to
+    # null out the nulls later
+    cat = pd.Categorical(values, categories=categories, ordered=ordered)
+    series = pd.Series(cat)
+    null_kind = col.describe_null[0]
+    if null_kind == 2:  # sentinel value
+        sentinel = col.describe_null[1]
+        series[codes == sentinel] = np.nan
+    else:
+        raise NotImplementedError("Only categorical columns with sentinel "
+                                  "value supported at the moment")
+
+    return series, codes_buffer
+
+
+def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]:
+    """
+    Convert a string column to a NumPy array.
+    """
+    # Retrieve the data buffers
+    buffers = col.get_buffers()
+
+    # Retrieve the data buffer containing the UTF-8 code units
+    dbuffer, bdtype = buffers["data"]
+
+    # Retrieve the offsets buffer containing the index offsets demarcating
+    # the beginning and end of each string
+    obuffer, odtype = buffers["offsets"]
+
+    # Retrieve the mask buffer indicating the presence of missing values
+    mbuffer, mdtype = buffers["validity"]
+
+    # Retrieve the missing value encoding
+    null_kind, null_value = col.describe_null
+
+    # Convert the buffers to NumPy arrays. Note: in order to go from STRING to
+    # an equivalent ndarray, we claim that the data buffer is uint8 (i.e., a byte array)
+    dt = (DtypeKind.UINT, 8, None, None)
+    dbuf = buffer_to_ndarray(dbuffer, dt)
+
+    obuf = buffer_to_ndarray(obuffer, odtype)
+    mbuf = buffer_to_ndarray(mbuffer, mdtype)
+
+    # Assemble the strings from the code units
+    str_list = []
+    for i in range(obuf.size - 1):
+        # Check for missing values
+        if null_kind == 3:  # bit mask
+            # Use integer division to locate the byte holding bit ``i``
+            v = mbuf[i // 8]
+            # Normalize the byte so that a set bit means "missing": if 0 is the
+            # value indicating a missing entry, flip the bits first
+            if null_value == 0:
+                v = ~v
+
+            if v & (1 << (i % 8)):
+                str_list.append(np.nan)
+                continue
+
+        elif null_kind == 4 and mbuf[i] == null_value:  # byte mask
+            str_list.append(np.nan)
+            continue
+
+        # Extract a range of code units
+        units = dbuf[obuf[i]:obuf[i + 1]]
+
+        # Convert the list of code units to bytes
+        b = bytes(units)
+
+        # Create the string
+        s = b.decode(encoding="utf-8")
+
+        # Add to our list of strings
+        str_list.append(s)
+
+    # Convert the string list to a NumPy array
+    return np.asarray(str_list, dtype="object"), buffers
+
+
+# Implementation of interchange protocol
+# --------------------------------------
+
+class _PandasBuffer(Buffer):
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
+        """
+        Handle only regular columns (= numpy arrays) for now.
+        """
+        if not x.strides == (x.dtype.itemsize,):
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that's not allowed, we need to raise an exception.
+            if allow_copy:
+                x = x.copy()
+            else:
+                raise RuntimeError("Exports cannot be zero-copy in the case "
+                                   "of a non-contiguous buffer")
+
+        # Store the numpy array in which the data resides as a private
+        # attribute, so we can use it to retrieve the public attributes
+        self._x = x
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes.
+ """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._x.__array_interface__['data'][0] + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return 'PandasBuffer(' + str({'bufsize': self.bufsize, + 'ptr': self.ptr, + 'device': self.__dlpack_device__()[0].name} + ) + ')' + +class _PandasColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column : pd.Series, + allow_copy : bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, pd.Series): + raise NotImplementedError("Columns of type {} not handled " + "yet".format(type(column))) + + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self): + dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == 'O': + return (DtypeKind.STRING, 8, 'u', '=') + + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + _np_kinds = {"i": DtypeKind.INT, "u": DtypeKind.UINT, "f": DtypeKind.FLOAT, "b": DtypeKind.BOOL, + "U": DtypeKind.STRING, + "M": DtypeKind.DATETIME, "m": DtypeKind.DATETIME} + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy dtype. Check if it's a categorical maybe + if isinstance(dtype, pd.CategoricalDtype): + kind = DtypeKind.CATEGORICAL + else: + raise ValueError(f"Data type {dtype} not supported by exchange" + "protocol") + + if kind not in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL, DtypeKind.CATEGORICAL, DtypeKind.STRING): + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else '=' + return (kind, bitwidth, format_str, endianness) + + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. 
+        - There is a separate dictionary-style encoding for categorical values.
+        Raises TypeError if the dtype is not categorical
+        Returns a tuple ``(is_ordered, is_dictionary, mapping)``:
+            - "is_ordered" : bool, whether the ordering of dictionary indices is
+                             semantically meaningful.
+            - "is_dictionary" : bool, whether a dictionary-style mapping of
+                                categorical values to other objects exists
+            - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
+                          None if not a dictionary-style categorical.
+        """
+        if not self.dtype[0] == DtypeKind.CATEGORICAL:
+            raise TypeError("``describe_categorical`` only works on a column with "
+                            "categorical dtype!")
+
+        ordered = self._col.dtype.ordered
+        is_dictionary = True
+        # NOTE: this shows the children approach is better, transforming
+        # `categories` to a "mapping" dict is inefficient
+        codes = self._col.values.codes  # ndarray, length `self.size`
+        # categories.values is ndarray of length n_categories
+        categories = self._col.values.categories.values
+        mapping = {ix: val for ix, val in enumerate(categories)}
+        return ordered, is_dictionary, mapping
+
+    @property
+    def describe_null(self):
+        kind = self.dtype[0]
+        value = None
+        if kind == DtypeKind.FLOAT:
+            null = 1  # np.nan
+        elif kind == DtypeKind.DATETIME:
+            null = 1  # np.datetime64('NaT')
+        elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.BOOL):
+            # TODO: check if extension dtypes are used once support for them is
+            # implemented in this protocol code
+            null = 0  # integer and boolean dtypes are non-nullable
+        elif kind == DtypeKind.CATEGORICAL:
+            # Null values for categoricals are stored as `-1` sentinel values
+            # in the category data (e.g., `col.values.codes` is int8 np.ndarray)
+            null = 2
+            value = -1
+        elif kind == DtypeKind.STRING:
+            null = 4
+            # Follow Arrow in using 1 as valid value and 0 for missing/null value
+            value = 0
+        else:
+            raise NotImplementedError(f"Data type {self.dtype} not yet supported")
+
+        return null, value
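
The tuples produced by ``describe_null`` above can be exercised end to end once ``pd.DataFrame.__dataframe__`` is wired up in patch 05; a small sketch of the expected values for a float, an integer, and a categorical column (assuming the patches in this series are applied):

    import pandas as pd

    df = pd.DataFrame({
        "f": [1.5, None],
        "i": [1, 2],
        "c": pd.Series(["a", None], dtype="category"),
    })
    dfX = df.__dataframe__()
    assert dfX.get_column_by_name("f").describe_null == (1, None)  # NaN is the null
    assert dfX.get_column_by_name("i").describe_null == (0, None)  # non-nullable
    assert dfX.get_column_by_name("c").describe_null == (2, -1)    # sentinel code -1
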
+    @property
+    def null_count(self) -> int:
+        """
+        Number of null elements. Should always be known.
+        """
+        return self._col.isna().sum()
+
+    @property
+    def metadata(self):
+        """
+        Store specific metadata of the column.
+        """
+        return {}
+
+    def num_chunks(self) -> int:
+        """
+        Return the number of chunks the column consists of.
+        """
+        return 1
+
+    def get_chunks(self, n_chunks=None):
+        """
+        Return an iterator yielding the chunks.
+        See `DataFrame.get_chunks` for details on ``n_chunks``.
+        """
+        # TODO: implement proper chunking for n_chunks > 1
+        return (self,)
+
+    def get_buffers(self):
+        """
+        Return a dictionary containing the underlying buffers.
+        The returned dictionary has the following contents:
+        - "data": a two-element tuple whose first element is a buffer
+                  containing the data and whose second element is the data
+                  buffer's associated dtype.
+        - "validity": a two-element tuple whose first element is a buffer
+                      containing mask values indicating missing data and
+                      whose second element is the mask value buffer's
+                      associated dtype. None if the null representation is
+                      not a bit or byte mask.
+        - "offsets": a two-element tuple whose first element is a buffer
+                     containing the offset values for variable-size binary
+                     data (e.g., variable-length strings) and whose second
+                     element is the offsets buffer's associated dtype. None
+                     if the data buffer does not have an associated offsets
+                     buffer.
+        """
+        buffers = {}
+        buffers["data"] = self._get_data_buffer()
+        try:
+            buffers["validity"] = self._get_validity_buffer()
+        except Exception:
+            # Raised when the null representation has no separate mask
+            # (see _get_validity_buffer below)
+            buffers["validity"] = None
+
+        try:
+            buffers["offsets"] = self._get_offsets_buffer()
+        except Exception:
+            # Raised for fixed-length dtypes, which have no offsets buffer
+            buffers["offsets"] = None
+
+        return buffers
+
+    def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtype tuple
+        """
+        Return the buffer containing the data and the buffer's associated dtype.
+        """
+        if self.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL):
+            buffer = _PandasBuffer(
+                self._col.to_numpy(), allow_copy=self._allow_copy)
+            dtype = self.dtype
+        elif self.dtype[0] == DtypeKind.CATEGORICAL:
+            codes = self._col.values.codes
+            buffer = _PandasBuffer(
+                codes, allow_copy=self._allow_copy)
+            dtype = self._dtype_from_pandasdtype(codes.dtype)
+        elif self.dtype[0] == DtypeKind.STRING:
+            # Marshal the strings from a NumPy object array into a byte array
+            buf = self._col.to_numpy()
+            b = bytearray()
+
+            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
+            for i in range(buf.size):
+                if type(buf[i]) == str:
+                    b.extend(buf[i].encode(encoding="utf-8"))
+
+            # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store
+            buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8"))
+
+            # Define the dtype for the returned buffer
+            dtype = (DtypeKind.STRING, 8, "u", "=")  # note: currently only support native endianness
+        else:
+            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
+
+        return buffer, dtype
+
+    def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
+        """
+        Return the buffer containing the mask values indicating missing data and
+        the buffer's associated dtype.
+        Raises RuntimeError if null representation is not a bit or byte mask.
+        """
+        null, invalid = self.describe_null
+
+        if self.dtype[0] == DtypeKind.STRING:
+            # For now, have the mask array be comprised of bytes, rather than a bit array
+            buf = self._col.to_numpy()
+            mask = []
+
+            # Determine the encoding for valid values
+            if invalid == 0:
+                valid = 1
+            else:
+                valid = 0
+
+            for i in range(buf.size):
+                if type(buf[i]) == str:
+                    v = valid
+                else:
+                    v = invalid
+
+                mask.append(v)
+
+            # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store
+            buffer = _PandasBuffer(np.asarray(mask, dtype="uint8"))
+
+            # Define the dtype of the returned buffer
+            dtype = (DtypeKind.UINT, 8, "C", "=")
+
+            return buffer, dtype
+
+        if null == 0:
+            msg = "This column is non-nullable so does not have a mask"
+        elif null == 1:
+            msg = "This column uses NaN as null so does not have a separate mask"
+        else:
+            raise NotImplementedError("See self.describe_null")
+
+        raise RuntimeError(msg)
+
+    def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
+        """
+        Return the buffer containing the offset values for variable-size binary
+        data (e.g., variable-length strings) and the buffer's associated dtype.
+        Raises RuntimeError if the data buffer does not have an associated
+        offsets buffer.
+ """ + if self.dtype[0] == DtypeKind.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = [ptr] + for v in values: + # For missing values (in this case, `np.nan` values), we don't increment the pointer) + if type(v) == str: + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets.append(ptr) + + # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + buf = np.asarray(offsets, dtype="int64") + + # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + buffer = _PandasBuffer(buf) + + # Assemble the buffer dtype info + dtype = (DtypeKind.INT, 64, 'l', "=") # note: currently only support native endianness + else: + raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + return buffer, dtype + + +class _PandasDataFrameXchg(DataFrameXchg): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, + allow_copy : bool = True) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + return {"pandas.index": self._df.index} + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self): + return self._df.columns.tolist() + + def get_column(self, i: int) -> _PandasColumn: + return _PandasColumn( + self._df.iloc[:, i], allow_copy=self._allow_copy) + + def get_column_by_name(self, name: str) -> _PandasColumn: + return _PandasColumn( + self._df[name], allow_copy=self._allow_copy) + + def get_columns(self): + return [_PandasColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns] + + def select_columns(self, indices): + if not isinstance(indices, collections.Sequence): + raise ValueError("`indices` is not a sequence") + + return _PandasDataFrameXchg(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names): + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _PandasDataFrameXchg(self._df.xs(names, axis='columns')) + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. 
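
The offsets construction in ``_get_offsets_buffer`` above is easiest to see with a worked example; mirroring that loop for three strings (a standalone sketch, not part of the patch):

    values = ["foo", "", "hi"]
    ptr, offsets = 0, [0]
    for v in values:
        if type(v) == str:                 # missing values add no bytes
            ptr += len(v.encode(encoding="utf-8"))
        offsets.append(ptr)
    assert offsets == [0, 3, 3, 5]         # string i is data[offsets[i]:offsets[i+1]]
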
+ """ + #TODO: implement chunking when n_chunks > 1 + return (self,) From 14fd4782e2c239ba259a9951637d3c5cdb42583b Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 22 Feb 2022 10:48:18 +0300 Subject: [PATCH 04/49] Fix the protocol spec Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 25 ++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index 0b388878e2891..b767334ff2ba4 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional, Dict, Any, Iterable, Sequence +from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict import enum class DlpackDeviceType(enum.IntEnum): @@ -27,6 +27,11 @@ class ColumnNullType: USE_BITMASK = 3 USE_BYTEMASK = 4 +class CategoricalDescription(TypedDict): + is_ordered: bool # whether the ordering of dictionary indices is semantically meaningful + is_dictionary: bool # whether a dictionary-style mapping of categorical values to other objects exists + mapping: Optional[dict] # Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. + class Buffer: """ Data in the buffer is guaranteed to be contiguous in memory. @@ -73,6 +78,20 @@ def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: """ pass +class ColumnBuffers(TypedDict): + data: Tuple[Buffer, Any] # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + validity: Optional[Tuple[Buffer, Any]] # first element is a buffer containing mask values + # indicating missing data and second element is + # the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + offsets: Optional[Tuple[Buffer, Any]] # first element is a buffer containing the + # offset values for variable-size binary data + # (e.g., variable-length strings) and + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have + # an associated offsets buffer + class Column: """ @@ -160,7 +179,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: pass @property - def describe_categorical(self) -> Dict[bool, bool, Optional[dict]]: + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. @@ -216,7 +235,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ pass - def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: + def get_buffers(self) -> ColumnBuffers: """ Return a dictionary containing the underlying buffers. 
The returned dictionary has the following contents: From 45150118b842c1f0ebf414e8fa0f8bf904cf998f Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 22 Feb 2022 10:48:40 +0300 Subject: [PATCH 05/49] Enable pd.DataFrame.__dataframe__ Signed-off-by: Vasily Litvinov --- pandas/core/frame.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ad7f961f30170..8948bcceb7f0b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -204,6 +204,8 @@ nargsort, ) +from pandas.api.exchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.io.common import get_handle from pandas.io.formats import ( console, @@ -811,6 +813,21 @@ def __init__( NDFrame.__init__(self, mgr) + # ---------------------------------------------------------------------- + def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True) -> DataFrameXchg: + """ + Return the dataframe exchange object implementing the exchange protocol. + + See Also + -------- + Details on the exchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + """ + + from pandas.api.exchange.implementation import _PandasDataFrameXchg + return _PandasDataFrameXchg(self, nan_as_null, allow_copy) + # ---------------------------------------------------------------------- @property From 7d6fd5bafca57be855793563ae934fcbaa951693 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 22 Feb 2022 18:17:31 +0300 Subject: [PATCH 06/49] Align spec with existing implementations Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 59 ++++++++--------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index b767334ff2ba4..a558ef64a38ad 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -27,10 +27,20 @@ class ColumnNullType: USE_BITMASK = 3 USE_BYTEMASK = 4 -class CategoricalDescription(TypedDict): - is_ordered: bool # whether the ordering of dictionary indices is semantically meaningful - is_dictionary: bool # whether a dictionary-style mapping of categorical values to other objects exists - mapping: Optional[dict] # Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. +class ColumnBuffers(TypedDict): + data: Tuple["Buffer", Any] # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values + # indicating missing data and second element is + # the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the + # offset values for variable-size binary data + # (e.g., variable-length strings) and + # second element is the offsets buffer's associated dtype. 
+ # None if the data buffer does not have + # an associated offsets buffer + class Buffer: """ @@ -78,20 +88,6 @@ def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: """ pass -class ColumnBuffers(TypedDict): - data: Tuple[Buffer, Any] # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - validity: Optional[Tuple[Buffer, Any]] # first element is a buffer containing mask values - # indicating missing data and second element is - # the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - offsets: Optional[Tuple[Buffer, Any]] # first element is a buffer containing the - # offset values for variable-size binary data - # (e.g., variable-length strings) and - # second element is the offsets buffer's associated dtype. - # None if the data buffer does not have - # an associated offsets buffer - class Column: """ @@ -179,13 +175,14 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: pass @property - def describe_categorical(self) -> CategoricalDescription: + def describe_categorical(self) -> Tuple[bool, bool, dict]: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - There is a separate dictionary-style encoding for categorical values. - Raises RuntimeError if the dtype is not categorical - Content of returned dict: + Raises TypeError if the dtype is not categorical + + Returns the description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - "is_dictionary" : bool, whether a dictionary-style mapping of @@ -276,25 +273,7 @@ class DataFrame: ``__dataframe__`` method of a public data frame class in a library adhering to the dataframe interchange protocol specification. """ - def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> dict: - """ - Produces a dictionary object following the dataframe protocol specification. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. 
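
The ``allow_copy`` contract described in the docstring removed here still governs the implementation: ``_PandasBuffer`` (patch 03) checks contiguity and either copies or raises. The test it applies, shown standalone:

    import numpy as np

    x = np.arange(10, dtype="int64")[::2]      # strided view: strides == (16,)
    assert x.strides != (x.dtype.itemsize,)    # would be copied (allow_copy=True)
                                               # or rejected with RuntimeError
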
- """ - self._nan_as_null = nan_as_null - self._allow_zero_zopy = allow_copy - return { - "dataframe": self, # DataFrame object adhering to the protocol - "version": 0 # Version number of the protocol - } + version = 0 # version of the protocol @property def metadata(self) -> Dict[str, Any]: From 5d64c4abe0852c8a7dac0411209ca401f1c7a178 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 22 Feb 2022 18:19:48 +0300 Subject: [PATCH 07/49] Fix protocol tests Signed-off-by: Vasily Litvinov --- pandas/tests/api/test_protocol.py | 215 ++++++++++++------------------ 1 file changed, 82 insertions(+), 133 deletions(-) diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py index 891233661d498..b4335e83fcf2d 100644 --- a/pandas/tests/api/test_protocol.py +++ b/pandas/tests/api/test_protocol.py @@ -1,140 +1,89 @@ import pytest -import numpy as np +import math @pytest.mark.parametrize("test_data", [ - ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])], - 'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]}, - np.object_, None), - ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None), - ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64) + {'a': ["foo", "bar"], + 'b': ["baz", "qux"]}, + {'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, + {'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]} ], - ids=["array_data", "float_data", "int_data"]) -def test_only_one_data(test_data, create_df_from_dict): - data, dtype, new_dtype = test_data - columns = list(data.keys()) - df = create_df_from_dict(data) - df2 = df.__dataframe__() - new_dtype = dtype if new_dtype is None else new_dtype - assert df.columns.values.tolist() == columns - val = len(df[columns[0]])-1 - column_size = df.size - for column in columns: - assert df[column].tolist() == df[column].tolist() - assert df[column].dtype.type is dtype - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == column_size - assert df2.get_column_by_name(column).offset == 0 - assert not df2["x"].is_masked - n = np.random.randint(0, val) - (df[column])[n] = None - assert df[column].dtype.type is new_dtype - assert df2.get_column_by_name(column).null_count == 1 - - -def test_float_int(create_df_from_dict): - df = create_df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5], - 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]}) - df2 = df.__dataframe__() - columns = ['a', 'b', 'c', 'd'] - assert df.columns.values.tolist() == columns - for column in columns: - assert df[column].tolist() == df[column].tolist() - if column is 'c': - assert df[column].dtype.type is np.float64 - else: - assert df[column].dtype.type is np.int64 - - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == 3 - assert df2.get_column_by_name(column).offset == 0 - - n = np.random.randint(0, 2) - (df[column])[n] = None - assert df[column].dtype.type is np.float64 - assert df2.get_column_by_name(column).null_count == 1 - - -def test_mixed_intfloatbool(create_df_from_dict): - df = create_df_from_dict({"x": np.array([True, True, False]), - "y": np.array([1, 2, 0]), - "z": np.array([9.2, 10.5, 11.8])}) - df2 = df.__dataframe__() - columns = ['x', 'y', 'z'] - assert df.columns.values.tolist() == columns - for column in columns: - assert df[column].tolist() == df[column].tolist() - assert df2.get_column_by_name(column).null_count == 0 - assert df2.get_column_by_name(column).size == 3 - assert df2.get_column_by_name(column).offset == 0 + ids=["str_data", "float_data", "int_data"]) +def 
test_only_one_dtype(test_data, df_from_dict):
+    columns = list(test_data.keys())
+    df = df_from_dict(test_data)
+    dfX = df.__dataframe__()
+
+    column_size = len(test_data[columns[0]])
+    for column in columns:
+        assert dfX.get_column_by_name(column).null_count == 0
+        assert dfX.get_column_by_name(column).size == column_size
+        assert dfX.get_column_by_name(column).offset == 0
+
+
+def test_float_int(df_from_dict):
+    df = df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5],
+                       'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11],
+                       'e': [True, False, True],
+                       'f': ["a", "", "c"]})
+    dfX = df.__dataframe__()
+    columns = {'a': 0, 'b': 0, 'c': 2, 'd': 0, 'e': 20, 'f': 21}
+
+    for column, kind in columns.items():
+        colX = dfX.get_column_by_name(column)
+        assert colX.null_count == 0
+        assert colX.size == 3
+        assert colX.offset == 0
+
+        assert
colX.dtype[0] == kind + + assert dfX.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(df_from_dict): + df = df_from_dict({'a': [1.0, math.nan, 2.0]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name('a') + assert colX.null_count == 1 + +def test_noncategorical(df_from_dict): + df = df_from_dict({'a': [1, 2, 3]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name('a') + with pytest.raises(TypeError): + colX.describe_categorical + +def test_categorical(df_from_dict): + df = df_from_dict({"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, is_categorical=True) + + colX = df.__dataframe__().get_column_by_name("weekday") + is_ordered, is_dictionary, _ = colX.describe_categorical + assert isinstance(is_ordered, bool) + assert isinstance(is_dictionary, bool) + + +def test_dataframe(df_from_dict): + df = df_from_dict({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) + dfX = df.__dataframe__() + + assert dfX.num_columns() == 3 + assert dfX.num_rows() == 3 + assert dfX.num_chunks() == 1 + assert dfX.column_names() == ["x", "y", "z"] + assert dfX.select_columns((0, 2)).column_names() == dfX.select_columns_by_name(("x", "z")).column_names() + +@pytest.mark.parametrize(["size", "n_chunks"], + [(10, 3), (12, 3), (12, 5)] +) +def test_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +def test_get_chunks(df_from_dict): + df = df_from_dict({"x": [1]}) + dfX = df.__dataframe__() + assert len(list(dfX.get_chunks())) == 1 From b36fd468580eb2d0e815203b1fc96bdc35b7e10b Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 22 Feb 2022 18:20:11 +0300 Subject: [PATCH 08/49] Make DataFrame.__dataframe__ pass protocol tests Signed-off-by: Vasily Litvinov --- pandas/api/exchange/implementation.py | 30 ++++++++++++++++++--------- pandas/tests/api/conftest.py | 14 ++++++++++--- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py index 94000ce576acc..5553ba4e3b68f 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/api/exchange/implementation.py @@ -1,4 +1,4 @@ -import collections +import collections.abc import ctypes from typing import Tuple, Any @@ -14,9 +14,8 @@ def from_dataframe(df : DataFrameXchg, """ Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` """ - # NOTE: commented out for roundtrip testing - # if isinstance(df, pd.DataFrame): - # return df + if isinstance(df, pd.DataFrame): + return df if not hasattr(df, '__dataframe__'): raise ValueError("`df` does not support __dataframe__") @@ -606,20 +605,31 @@ def get_columns(self): for name in self._df.columns] def select_columns(self, indices): - if not isinstance(indices, collections.Sequence): + if not isinstance(indices, collections.abc.Sequence): raise ValueError("`indices` is not a sequence") + if not isinstance(indices, list): + indices = list(indices) - return _PandasDataFrameXchg(self._df.iloc[:, indices]) + return _PandasDataFrameXchg(self._df.iloc[:, indices], self._nan_as_null, self._allow_copy) def select_columns_by_name(self, names): - if not isinstance(names, collections.Sequence): + if not isinstance(names, collections.abc.Sequence): raise ValueError("`names` is not a sequence") + if not isinstance(names, list): + names = list(names) - return 
_PandasDataFrameXchg(self._df.xs(names, axis='columns')) + return _PandasDataFrameXchg(self._df.loc[:, names], self._nan_as_null, self._allow_copy) def get_chunks(self, n_chunks=None): """ Return an iterator yielding the chunks. """ - #TODO: implement chunking when n_chunks > 1 - return (self,) + if n_chunks and n_chunks > 1: + size = len(self._df) + step = size // n_chunks + if size % n_chunks != 0: + step +=1 + for start in range(0, step * n_chunks, step): + yield _PandasDataFrameXchg(self._df.iloc[start:start + step, :], self._nan_as_null, self._allow_copy) + else: + yield self diff --git a/pandas/tests/api/conftest.py b/pandas/tests/api/conftest.py index 5d3c42870cbb8..f4e9f4a3fd524 100644 --- a/pandas/tests/api/conftest.py +++ b/pandas/tests/api/conftest.py @@ -1,8 +1,16 @@ import pytest import pandas as pd +from pandas.api.exchange.implementation import _from_dataframe @pytest.fixture(scope='package') -def create_df_from_dict(): - def maker(dct): - return pd.DataFrame(dct) +def df_from_dict(): + def maker(dct, is_categorical=False): + df = pd.DataFrame(dct) + return df.astype('category') if is_categorical else df + return maker + +@pytest.fixture(scope='package') +def df_from_xchg(): + def maker(xchg): + return _from_dataframe(xchg) return maker From d334b20532cf5f225d08ac67af040699c1eac290 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 11:30:20 +0300 Subject: [PATCH 09/49] Explicitly mark abstract methods in spec Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 34 ++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index a558ef64a38ad..33fbf4427b4dd 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -1,5 +1,6 @@ from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict import enum +from abc import ABC, abstractmethod class DlpackDeviceType(enum.IntEnum): CPU = 1 @@ -20,7 +21,7 @@ class DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 -class ColumnNullType: +class ColumnNullType(enum.IntEnum): NON_NULLABLE = 0 USE_NAN = 1 USE_SENTINEL = 2 @@ -42,7 +43,7 @@ class ColumnBuffers(TypedDict): # an associated offsets buffer -class Buffer: +class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. Note that there is no dtype attribute present, a buffer can be thought of @@ -56,6 +57,7 @@ class Buffer: """ @property + @abstractmethod def bufsize(self) -> int: """ Buffer size in bytes. @@ -63,12 +65,14 @@ def bufsize(self) -> int: pass @property + @abstractmethod def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ pass + @abstractmethod def __dlpack__(self): """ Produce DLPack capsule (see array API standard). @@ -80,6 +84,7 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") + @abstractmethod def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: """ Device type and device ID for where the data in the buffer resides. @@ -89,7 +94,7 @@ def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: pass -class Column: +class Column(ABC): """ A column object, with only the methods and properties required by the interchange protocol defined. @@ -127,6 +132,7 @@ class Column: """ @property + @abstractmethod def size(self) -> Optional[int]: """ Size of the column, in elements. 
@@ -136,6 +142,7 @@ def size(self) -> Optional[int]: pass @property + @abstractmethod def offset(self) -> int: """ Offset of first element. @@ -146,6 +153,7 @@ def offset(self) -> int: pass @property + @abstractmethod def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. @@ -175,6 +183,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: pass @property + @abstractmethod def describe_categorical(self) -> Tuple[bool, bool, dict]: """ If the dtype is categorical, there are two options: @@ -194,6 +203,7 @@ def describe_categorical(self) -> Tuple[bool, bool, dict]: pass @property + @abstractmethod def describe_null(self) -> Tuple[ColumnNullType, Any]: """ Return the missing value (or "null") representation the column dtype @@ -205,6 +215,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: pass @property + @abstractmethod def null_count(self) -> Optional[int]: """ Number of null elements, if known. @@ -213,18 +224,21 @@ def null_count(self) -> Optional[int]: pass @property + @abstractmethod def metadata(self) -> Dict[str, Any]: """ The metadata for the column. See `DataFrame.metadata` for more details. """ pass + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ pass + @abstractmethod def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. @@ -232,6 +246,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ pass + @abstractmethod def get_buffers(self) -> ColumnBuffers: """ Return a dictionary containing the underlying buffers. @@ -261,7 +276,7 @@ def get_buffers(self) -> ColumnBuffers: # pass -class DataFrame: +class DataFrame(ABC): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -276,6 +291,7 @@ class DataFrame: version = 0 # version of the protocol @property + @abstractmethod def metadata(self) -> Dict[str, Any]: """ The metadata for the data frame, as a dictionary with string keys. The @@ -288,12 +304,14 @@ def metadata(self) -> Dict[str, Any]: """ pass + @abstractmethod def num_columns(self) -> int: """ Return the number of columns in the DataFrame. """ pass + @abstractmethod def num_rows(self) -> Optional[int]: # TODO: not happy with Optional, but need to flag it may be expensive # why include it if it may be None - what do we expect consumers @@ -303,48 +321,56 @@ def num_rows(self) -> Optional[int]: """ pass + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the DataFrame consists of. """ pass + @abstractmethod def column_names(self) -> Iterable[str]: """ Return an iterator yielding the column names. """ pass + @abstractmethod def get_column(self, i: int) -> Column: """ Return the column at the indicated position. """ pass + @abstractmethod def get_column_by_name(self, name: str) -> Column: """ Return the column whose name is the indicated name. """ pass + @abstractmethod def get_columns(self) -> Iterable[Column]: """ Return an iterator yielding the columns. """ pass + @abstractmethod def select_columns(self, indices: Sequence[int]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by index. """ pass + @abstractmethod def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": """ Create a new DataFrame by selecting a subset of columns by name. 
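+
+        A sketch of the expected equivalence with positional selection
+        (assuming columns "x" and "z" sit at positions 0 and 2, as in the
+        vendored smoke tests):
+
+        >>> dfX.select_columns_by_name(("x", "z")).column_names()  # doctest: +SKIP
+        ['x', 'z']
+        >>> dfX.select_columns((0, 2)).column_names()  # doctest: +SKIP
+        ['x', 'z']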
""" pass + @abstractmethod def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. From 014165d54a2bf82ad428b8c0ce92e3404e216b85 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 11:44:20 +0300 Subject: [PATCH 10/49] Add more smoke tests Signed-off-by: Vasily Litvinov --- pandas/tests/api/test_protocol.py | 37 +++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py index b4335e83fcf2d..32deac1e1c058 100644 --- a/pandas/tests/api/test_protocol.py +++ b/pandas/tests/api/test_protocol.py @@ -72,18 +72,41 @@ def test_dataframe(df_from_dict): assert dfX.column_names() == ["x", "y", "z"] assert dfX.select_columns((0, 2)).column_names() == dfX.select_columns_by_name(("x", "z")).column_names() -@pytest.mark.parametrize(["size", "n_chunks"], - [(10, 3), (12, 3), (12, 5)] -) -def test_chunks(size, n_chunks, df_from_dict): +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks, df_from_dict): df = df_from_dict({"x": list(range(size))}) dfX = df.__dataframe__() chunks = list(dfX.get_chunks(n_chunks)) assert len(chunks) == n_chunks assert sum(chunk.num_rows() for chunk in chunks) == size +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size for chunk in chunks) == size -def test_get_chunks(df_from_dict): - df = df_from_dict({"x": [1]}) +def test_get_columns(df_from_dict): + df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) + dfX = df.__dataframe__() + for colX in dfX.get_columns(): + assert colX.size == 2 + assert colX.num_chunks() == 1 + assert dfX.get_column(0).dtype[0] == 0 + assert dfX.get_column(1).dtype[0] == 2 + +def test_buffer(df_from_dict): + df = df_from_dict({"a": [0, 1]}) dfX = df.__dataframe__() - assert len(list(dfX.get_chunks())) == 1 + colX = dfX.get_column(0) + bufX = colX.get_buffers() + + dataBuf, dataDtype = bufX['data'] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + assert dataBuf.__dlpack_device__ + + assert dataDtype[0] == 0 From def54bab76deeada0d573a39fc983ceb4669612e Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 11:49:30 +0300 Subject: [PATCH 11/49] Implement column chunking Signed-off-by: Vasily Litvinov --- pandas/api/exchange/implementation.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py index 5553ba4e3b68f..ff69848b77afc 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/api/exchange/implementation.py @@ -405,8 +405,15 @@ def get_chunks(self, n_chunks=None): Return an iterator yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. 
""" - # TODO: implement proper chunking for n_chunks > 1 - return (self,) + if n_chunks and n_chunks > 1: + size = len(self._col) + step = size // n_chunks + if size % n_chunks != 0: + step +=1 + for start in range(0, step * n_chunks, step): + yield _PandasColumn(self._col.iloc[start:start + step], self._allow_copy) + else: + yield self def get_buffers(self): """ From 8e6b88229df57edda4f531f0e7d9869c72a900ac Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 11:53:44 +0300 Subject: [PATCH 12/49] Fix tests formatting Signed-off-by: Vasily Litvinov --- pandas/tests/api/test_protocol.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py index 32deac1e1c058..cb2906c30fe76 100644 --- a/pandas/tests/api/test_protocol.py +++ b/pandas/tests/api/test_protocol.py @@ -3,10 +3,10 @@ @pytest.mark.parametrize("test_data", [ - {'a': ["foo", "bar"], - 'b': ["baz", "qux"]}, - {'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, - {'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]} + {"a": ["foo", "bar"], + "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]} ], ids=["str_data", "float_data", "int_data"]) def test_only_one_dtype(test_data, df_from_dict): @@ -22,12 +22,12 @@ def test_only_one_dtype(test_data, df_from_dict): def test_float_int(df_from_dict): - df = df_from_dict({'a': [1, 2, 3], 'b': [3, 4, 5], - 'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11], - 'e': [True, False, True], - 'f': ["a", "", "c"]}) + df = df_from_dict({"a": [1, 2, 3], "b": [3, 4, 5], + "c": [1.5, 2.5, 3.5], "d": [9, 10, 11], + "e": [True, False, True], + "f": ["a", "", "c"]}) dfX = df.__dataframe__() - columns = {'a': 0, 'b': 0, 'c': 2, 'd': 0, 'e': 20, 'f': 21} + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} for column, kind in columns.items(): colX = dfX.get_column_by_name(column) @@ -41,15 +41,15 @@ def test_float_int(df_from_dict): def test_na_float(df_from_dict): - df = df_from_dict({'a': [1.0, math.nan, 2.0]}) + df = df_from_dict({"a": [1.0, math.nan, 2.0]}) dfX = df.__dataframe__() - colX = dfX.get_column_by_name('a') + colX = dfX.get_column_by_name("a") assert colX.null_count == 1 def test_noncategorical(df_from_dict): - df = df_from_dict({'a': [1, 2, 3]}) + df = df_from_dict({"a": [1, 2, 3]}) dfX = df.__dataframe__() - colX = dfX.get_column_by_name('a') + colX = dfX.get_column_by_name("a") with pytest.raises(TypeError): colX.describe_categorical @@ -103,7 +103,7 @@ def test_buffer(df_from_dict): colX = dfX.get_column(0) bufX = colX.get_buffers() - dataBuf, dataDtype = bufX['data'] + dataBuf, dataDtype = bufX["data"] assert dataBuf.bufsize > 0 assert dataBuf.ptr != 0 From 282c85df4af0a6f54c58b884f3d600c3981fab90 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 12:01:01 +0300 Subject: [PATCH 13/49] Start implementing chunk support in from_df Signed-off-by: Vasily Litvinov --- pandas/api/exchange/implementation.py | 49 ++++++++++++++------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py index ff69848b77afc..1ebdc33642676 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/api/exchange/implementation.py @@ -29,33 +29,34 @@ def _from_dataframe(df : DataFrameXchg) -> pd.DataFrame: only Pandas. Later, we need to implement/test support for categoricals, bit/byte masks, chunk handling, etc. 
""" - # Check number of chunks, if there's more than one we need to iterate - if df.num_chunks() > 1: - raise NotImplementedError - - # We need a dict of columns here, with each column being a numpy array (at - # least for now, deal with non-numpy dtypes later). - columns = dict() _buffers = [] # hold on to buffers, keeps memory alive - for name in df.column_names(): - if not isinstance(name, str): - raise ValueError(f"Column {name} is not a string") - if name in columns: - raise ValueError(f"Column {name} is not unique") - col = df.get_column_by_name(name) - if col.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): - # Simple numerical or bool dtype, turn into numpy array - columns[name], _buf = convert_column_to_ndarray(col) - elif col.dtype[0] == DtypeKind.CATEGORICAL: - columns[name], _buf = convert_categorical_column(col) - elif col.dtype[0] == DtypeKind.STRING: - columns[name], _buf = convert_string_column(col) - else: - raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + result = [] + for chunk in df.get_chunks(): + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + chunk_cols = {} + for name in chunk.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in chunk_cols: + raise ValueError(f"Column {name} is not unique") + col = chunk.get_column_by_name(name) + if col.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): + # Simple numerical or bool dtype, turn into numpy array + chunk_cols[name], _buf = convert_column_to_ndarray(col) + elif col.dtype[0] == DtypeKind.CATEGORICAL: + chunk_cols[name], _buf = convert_categorical_column(col) + elif col.dtype[0] == DtypeKind.STRING: + chunk_cols[name], _buf = convert_string_column(col) + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + + _buffers.append(_buf) - _buffers.append(_buf) + df_new = pd.DataFrame(chunk_cols) + result.append(df_new) - df_new = pd.DataFrame(columns) + df_new = pd.concat(result) df_new._buffers = _buffers return df_new From 9fbb58de042f6dd3113eb2aec2954ca6856460d5 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 19:54:18 +0300 Subject: [PATCH 14/49] Test buffer contents if on CPU Signed-off-by: Vasily Litvinov --- pandas/tests/api/test_protocol.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py index cb2906c30fe76..5bb7bbc7d0f8c 100644 --- a/pandas/tests/api/test_protocol.py +++ b/pandas/tests/api/test_protocol.py @@ -1,5 +1,6 @@ import pytest import math +import ctypes @pytest.mark.parametrize("test_data", [ @@ -98,7 +99,8 @@ def test_get_columns(df_from_dict): assert dfX.get_column(1).dtype[0] == 2 def test_buffer(df_from_dict): - df = df_from_dict({"a": [0, 1]}) + arr = [0, 1, -1] + df = df_from_dict({"a": arr}) dfX = df.__dataframe__() colX = dfX.get_column(0) bufX = colX.get_buffers() @@ -107,6 +109,17 @@ def test_buffer(df_from_dict): assert dataBuf.bufsize > 0 assert dataBuf.ptr != 0 - assert dataBuf.__dlpack_device__ + device, _ = dataBuf.__dlpack_device__ assert dataDtype[0] == 0 + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = {8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64}[bitwidth] + + for idx, truth in enumerate(arr): + val = 
ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" From dd936256a68770eae566174de1173fcb9a392b94 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 20:47:35 +0300 Subject: [PATCH 15/49] Improve spec a bit Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 44 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index 33fbf4427b4dd..b537a52c2c776 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -3,6 +3,8 @@ from abc import ABC, abstractmethod class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + CPU = 1 CUDA = 2 CPU_PINNED = 3 @@ -13,6 +15,27 @@ class DlpackDeviceType(enum.IntEnum): ROCM = 10 class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + INT = 0 UINT = 1 FLOAT = 2 @@ -22,6 +45,23 @@ class DtypeKind(enum.IntEnum): CATEGORICAL = 23 class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN/NaT value. + USE_SENTINEL : int + Sentinel value besides NaN/NaT. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + NON_NULLABLE = 0 USE_NAN = 1 USE_SENTINEL = 2 @@ -85,7 +125,7 @@ def __dlpack__(self): raise NotImplementedError("__dlpack__") @abstractmethod - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ Device type and device ID for where the data in the buffer resides. Uses device type codes matching DLPack. @@ -184,7 +224,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: @property @abstractmethod - def describe_categorical(self) -> Tuple[bool, bool, dict]: + def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. From 07c8fae8527f5163ce5fc4f525f980af5334538d Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 20:54:55 +0300 Subject: [PATCH 16/49] Beautify spec whitespace Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index b537a52c2c776..958a7af5d1c15 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -2,6 +2,7 @@ import enum from abc import ABC, abstractmethod + class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" @@ -14,6 +15,7 @@ class DlpackDeviceType(enum.IntEnum): VPI = 9 ROCM = 10 + class DtypeKind(enum.IntEnum): """ Integer enum for data types. 
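+
+    For example, a ``float64`` column is expected to advertise a dtype tuple
+    beginning ``(DtypeKind.FLOAT, 64, ...)``, i.e. kind 2 with a 64-bit
+    width; this is what the vendored smoke tests check for column "c".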
@@ -44,6 +46,7 @@ class DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 + class ColumnNullType(enum.IntEnum): """ Integer enum for null type representation. @@ -68,6 +71,7 @@ class ColumnNullType(enum.IntEnum): USE_BITMASK = 3 USE_BYTEMASK = 4 + class ColumnBuffers(TypedDict): data: Tuple["Buffer", Any] # first element is a buffer containing the column data; # second element is the data buffer's associated dtype @@ -86,11 +90,13 @@ class ColumnBuffers(TypedDict): class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of as simply a block of memory. However, if the column that the buffer is attached to has a dtype that's supported by DLPack and ``__dlpack__`` is implemented, then that dtype information will be contained in the return value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a buffer and (b) dtypes like variable-length strings which do not have a fixed number of bytes per element. @@ -116,9 +122,12 @@ def ptr(self) -> int: def __dlpack__(self): """ Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because it's not completely trivial to implement for a Python-only library. """ @@ -138,27 +147,33 @@ class Column(ABC): """ A column object, with only the methods and properties required by the interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three buffers - a data buffer, a mask buffer (depending on null representation), and an offsets buffer (if variable-size binary; e.g., variable-length strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. Instead, it seems to use "children" for both columns with a bit mask, and for nested dtypes. Unclear whether this is elegant or confusing. This design requires checking the null representation explicitly. + The Arrow design requires checking: 1. the ARROW_FLAG_NULLABLE (for sentinel values) 2. if a column has two children, combined with one of those children having a null dtype. + Making the mask concept explicit seems useful. One null dtype would not be enough to cover both bit and byte masks, so that would mean even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as multiple buffers per array (= column here). Semantically it may make sense to have both: chunks were meant for example for lazy evaluation of data which doesn't fit in memory, while multiple buffers per column could also come from doing a selection operation on a single contiguous buffer. + Given these concepts, one would expect chunks to be all of the same size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), while multiple buffers could have data-dependent lengths. Not an issue @@ -167,6 +182,7 @@ class Column(ABC): Are multiple chunks *and* multiple buffers per column necessary for the purposes of this interchange protocol, or must producers either reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. """ @@ -176,6 +192,7 @@ class Column(ABC): def size(self) -> Optional[int]: """ Size of the column, in elements. 
+ Corresponds to DataFrame.num_rows() if column is a single chunk; equal to size of this current chunk otherwise. """ @@ -186,6 +203,7 @@ def size(self) -> Optional[int]: def offset(self) -> int: """ Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of equal size M (only the last chunk may be shorter), ``offset = n * M``, ``n = 0 .. N-1``. @@ -197,10 +215,12 @@ def offset(self) -> int: def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. Endianness : current only native endianness (``=``) is supported + Notes: - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) @@ -229,6 +249,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]: If the dtype is categorical, there are two options: - There are only values in the data buffer. - There is a separate dictionary-style encoding for categorical values. + Raises TypeError if the dtype is not categorical Returns the description on how to interpret the data buffer: @@ -238,6 +259,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]: categorical values to other objects exists - "mapping" : dict, Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? """ pass @@ -248,6 +270,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. @@ -259,6 +282,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: def null_count(self) -> Optional[int]: """ Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. """ pass @@ -282,6 +306,7 @@ def num_chunks(self) -> int: def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. """ pass @@ -290,7 +315,9 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: def get_buffers(self) -> ColumnBuffers: """ Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer containing the data and whose second element is the data buffer's associated dtype. @@ -320,14 +347,17 @@ class DataFrame(ABC): """ A data frame class, with only the methods required by the interchange protocol defined. + A "data frame" represents an ordered collection of named columns. A column's "name" must be a unique string. Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and attributes defined on this DataFrame class could be returned from the ``__dataframe__`` method of a public data frame class in a library adhering to the dataframe interchange protocol specification. 
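+
+    A minimal consumer-side sketch (``df`` being any object exposing
+    ``__dataframe__``, here a frame with three columns and three rows):
+
+    >>> dfX = df.__dataframe__()  # doctest: +SKIP
+    >>> dfX.num_columns(), dfX.num_rows()  # doctest: +SKIP
+    (3, 3)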
""" + version = 0 # version of the protocol @property @@ -414,6 +444,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the producer. If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``, meaning the producer must subdivide each chunk From b74c06eb8900e7544dbbbd9505f2b3e362df4953 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 20:56:48 +0300 Subject: [PATCH 17/49] Use constants from spec enums, beautify a bit Signed-off-by: Vasily Litvinov --- pandas/api/exchange/implementation.py | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py index 1ebdc33642676..73e8bcde177b0 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/api/exchange/implementation.py @@ -3,11 +3,17 @@ from typing import Tuple, Any -from .dataframe_protocol import Buffer, Column, DataFrame as DataFrameXchg, DtypeKind, DlpackDeviceType +from .dataframe_protocol import Buffer, Column, ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, DlpackDeviceType import pandas as pd import numpy as np +_NP_DTYPES = { + DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, + DtypeKind.BOOL: {8: bool}, +} def from_dataframe(df : DataFrameXchg, allow_copy : bool = True) -> pd.DataFrame: @@ -65,10 +71,7 @@ def convert_column_to_ndarray(col : Column) -> Tuple[np.ndarray, Buffer]: """ Convert an int, uint, float or bool column to a numpy array. 
""" - if col.offset != 0: - raise NotImplementedError("column.offset > 0 not handled yet") - - if col.describe_null[0] not in (0, 1): + if col.describe_null[0] not in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") @@ -80,14 +83,10 @@ def buffer_to_ndarray(_buffer : Buffer, _dtype) -> np.ndarray: # Handle the dtype kind = _dtype[0] bitwidth = _dtype[1] - if _dtype[0] not in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): - raise RuntimeError("Not a boolean, integer or floating-point dtype") + if kind not in _NP_DTYPES: + raise RuntimeError(f"Unsupported data type: {kind}") - _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} - _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} - _floats = {32: np.float32, 64: np.float64} - _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - column_dtype = _np_dtypes[kind][bitwidth] + column_dtype = _NP_DTYPES[kind][bitwidth] # No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column @@ -124,10 +123,10 @@ def convert_categorical_column(col : Column) -> Tuple[pd.Series, Buffer]: cat = pd.Categorical(values, categories=categories, ordered=ordered) series = pd.Series(cat) null_kind = col.describe_null[0] - if null_kind == 2: # sentinel value + if null_kind == ColumnNullType.USE_SENTINEL: # sentinel value sentinel = col.describe_null[1] series[codes == sentinel] = np.nan - else: + elif null_kind != ColumnNullType.NON_NULLABLE: raise NotImplementedError("Only categorical columns with sentinel " "value supported at the moment") @@ -164,16 +163,16 @@ def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: str_list = [] for i in range(obuf.size-1): # Check for missing values - if null_kind == 3: # bit mask - v = mbuf[i/8] + if null_kind == ColumnNullType.USE_BITMASK: + v = mbuf[i // 8] if null_value == 1: v = ~v - if v & (1<<(i%8)): + if v & (1<<(i % 8)): str_list.append(np.nan) continue - elif null_kind == 4 and mbuf[i] == null_value: # byte mask + elif null_kind == ColumnNullType.USE_BYTEMASK and mbuf[i] == null_value: str_list.append(np.nan) continue @@ -268,8 +267,7 @@ def __init__(self, column : pd.Series, Series/ndarray for now. 
""" if not isinstance(column, pd.Series): - raise NotImplementedError("Columns of type {} not handled " - "yet".format(type(column))) + raise NotImplementedError(f"Columns of type {type(column)} not handled yet") # Store the column as a private attribute self._col = column From 6637a29a063272f17e15f30a6d9f5c07121e50e9 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 21:24:56 +0300 Subject: [PATCH 18/49] Format by black Signed-off-by: Vasily Litvinov --- pandas/api/exchange/dataframe_protocol.py | 35 ++-- pandas/api/exchange/implementation.py | 217 +++++++++++++++------- 2 files changed, 167 insertions(+), 85 deletions(-) diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/api/exchange/dataframe_protocol.py index 958a7af5d1c15..8633426457f3f 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/api/exchange/dataframe_protocol.py @@ -42,7 +42,7 @@ class DtypeKind(enum.IntEnum): UINT = 1 FLOAT = 2 BOOL = 20 - STRING = 21 # UTF-8 + STRING = 21 # UTF-8 DATETIME = 22 CATEGORICAL = 23 @@ -73,18 +73,20 @@ class ColumnNullType(enum.IntEnum): class ColumnBuffers(TypedDict): - data: Tuple["Buffer", Any] # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values - # indicating missing data and second element is - # the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the - # offset values for variable-size binary data - # (e.g., variable-length strings) and - # second element is the offsets buffer's associated dtype. - # None if the data buffer does not have - # an associated offsets buffer + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Any] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Any]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Any]] class Buffer(ABC): @@ -303,7 +305,7 @@ def num_chunks(self) -> int: pass @abstractmethod - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. @@ -335,6 +337,7 @@ def get_buffers(self) -> ColumnBuffers: """ pass + # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator @@ -358,7 +361,7 @@ class DataFrame(ABC): to the dataframe interchange protocol specification. """ - version = 0 # version of the protocol + version = 0 # version of the protocol @property @abstractmethod @@ -441,7 +444,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": pass @abstractmethod - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. 
diff --git a/pandas/api/exchange/implementation.py b/pandas/api/exchange/implementation.py index 73e8bcde177b0..52bb7d4c4762d 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/api/exchange/implementation.py @@ -3,7 +3,14 @@ from typing import Tuple, Any -from .dataframe_protocol import Buffer, Column, ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, DlpackDeviceType +from .dataframe_protocol import ( + Buffer, + Column, + ColumnNullType, + DataFrame as DataFrameXchg, + DtypeKind, + DlpackDeviceType, +) import pandas as pd import numpy as np @@ -15,21 +22,21 @@ DtypeKind.BOOL: {8: bool}, } -def from_dataframe(df : DataFrameXchg, - allow_copy : bool = True) -> pd.DataFrame: + +def from_dataframe(df: DataFrameXchg, allow_copy: bool = True) -> pd.DataFrame: """ Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` """ if isinstance(df, pd.DataFrame): return df - if not hasattr(df, '__dataframe__'): + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df : DataFrameXchg) -> pd.DataFrame: +def _from_dataframe(df: DataFrameXchg) -> pd.DataFrame: """ Note: not all cases are handled yet, only ones that can be implemented with only Pandas. Later, we need to implement/test support for categoricals, @@ -47,7 +54,12 @@ def _from_dataframe(df : DataFrameXchg) -> pd.DataFrame: if name in chunk_cols: raise ValueError(f"Column {name} is not unique") col = chunk.get_column_by_name(name) - if col.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): + if col.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): # Simple numerical or bool dtype, turn into numpy array chunk_cols[name], _buf = convert_column_to_ndarray(col) elif col.dtype[0] == DtypeKind.CATEGORICAL: @@ -67,19 +79,23 @@ def _from_dataframe(df : DataFrameXchg) -> pd.DataFrame: return df_new -def convert_column_to_ndarray(col : Column) -> Tuple[np.ndarray, Buffer]: +def convert_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Buffer]: """ Convert an int, uint, float or bool column to a numpy array. """ - if col.describe_null[0] not in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): - raise NotImplementedError("Null values represented as masks or " - "sentinel values not handled yet") + if col.describe_null[0] not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_NAN, + ): + raise NotImplementedError( + "Null values represented as masks or " "sentinel values not handled yet" + ) _buffer, _dtype = col.get_buffers()["data"] return buffer_to_ndarray(_buffer, _dtype), _buffer -def buffer_to_ndarray(_buffer : Buffer, _dtype) -> np.ndarray: +def buffer_to_ndarray(_buffer: Buffer, _dtype) -> np.ndarray: # Handle the dtype kind = _dtype[0] bitwidth = _dtype[1] @@ -96,19 +112,18 @@ def buffer_to_ndarray(_buffer : Buffer, _dtype) -> np.ndarray: # NOTE: `x` does not own its memory, so the caller of this function must # either make a copy or hold on to a reference of the column or # buffer! (not done yet, this is pretty awful ...) - x = np.ctypeslib.as_array(data_pointer, - shape=(_buffer.bufsize // (bitwidth//8),)) + x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) return x -def convert_categorical_column(col : Column) -> Tuple[pd.Series, Buffer]: +def convert_categorical_column(col: Column) -> Tuple[pd.Series, Buffer]: """ Convert a categorical column to a Series instance. 
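+
+    The dictionary-style ``mapping`` translates codes into categories; a
+    tiny runnable illustration of the lookup this function performs:
+
+    >>> mapping = {0: "Mon", 1: "Tue"}
+    >>> [mapping[code] for code in [0, 1, 1, 0]]
+    ['Mon', 'Tue', 'Tue', 'Mon']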
""" ordered, is_dict, mapping = col.describe_categorical if not is_dict: - raise NotImplementedError('Non-dictionary categoricals not supported yet') + raise NotImplementedError("Non-dictionary categoricals not supported yet") # If you want to cheat for testing (can't use `_col` in real-world code): # categories = col._col.values.categories.values @@ -127,13 +142,14 @@ def convert_categorical_column(col : Column) -> Tuple[pd.Series, Buffer]: sentinel = col.describe_null[1] series[codes == sentinel] = np.nan elif null_kind != ColumnNullType.NON_NULLABLE: - raise NotImplementedError("Only categorical columns with sentinel " - "value supported at the moment") + raise NotImplementedError( + "Only categorical columns with sentinel " "value supported at the moment" + ) return series, codes_buffer -def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: +def convert_string_column(col: Column) -> Tuple[np.ndarray, dict]: """ Convert a string column to a NumPy array. """ @@ -153,7 +169,12 @@ def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: null_kind, null_value = col.describe_null # Convert the buffers to NumPy arrays - dt = (DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + dt = ( + DtypeKind.UINT, + 8, + None, + None, + ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) dbuf = buffer_to_ndarray(dbuffer, dt) obuf = buffer_to_ndarray(obuffer, odtype) @@ -161,14 +182,14 @@ def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: # Assemble the strings from the code units str_list = [] - for i in range(obuf.size-1): + for i in range(obuf.size - 1): # Check for missing values if null_kind == ColumnNullType.USE_BITMASK: v = mbuf[i // 8] if null_value == 1: v = ~v - if v & (1<<(i % 8)): + if v & (1 << (i % 8)): str_list.append(np.nan) continue @@ -177,7 +198,7 @@ def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: continue # Extract a range of code units - units = dbuf[obuf[i]:obuf[i+1]] + units = dbuf[obuf[i] : obuf[i + 1]] # Convert the list of code units to bytes b = bytes(units) @@ -191,15 +212,17 @@ def convert_string_column(col : Column) -> Tuple[np.ndarray, dict]: # Convert the string list to a NumPy array return np.asarray(str_list, dtype="object"), buffers + # Implementation of interchange protocol # -------------------------------------- + class _PandasBuffer(Buffer): """ Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None: + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ @@ -209,8 +232,10 @@ def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None: if allow_copy: x = x.copy() else: - raise RuntimeError("Exports cannot be zero-copy in the case " - "of a non-contiguous buffer") + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) # Store the numpy array in which the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -228,7 +253,7 @@ def ptr(self) -> int: """ Pointer to start of the buffer as an integer. 
""" - return self._x.__array_interface__['data'][0] + return self._x.__array_interface__["data"][0] def __dlpack__(self): """ @@ -243,10 +268,18 @@ def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: return (DlpackDeviceType.CPU, None) def __repr__(self) -> str: - return 'PandasBuffer(' + str({'bufsize': self.bufsize, - 'ptr': self.ptr, - 'device': self.__dlpack_device__()[0].name} - ) + ')' + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) + class _PandasColumn(Column): """ @@ -260,8 +293,7 @@ class _PandasColumn(Column): doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column : pd.Series, - allow_copy : bool = True) -> None: + def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -285,6 +317,7 @@ def offset(self) -> int: """ Offset of first element. Always zero. """ + # FIXME: chunks are implemented now, this should return something! return 0 @property @@ -292,8 +325,8 @@ def dtype(self): dtype = self._col.dtype # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings - if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == 'O': - return (DtypeKind.STRING, 8, 'u', '=') + if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == "O": + return (DtypeKind.STRING, 8, "u", "=") return self._dtype_from_pandasdtype(dtype) @@ -304,27 +337,40 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) - _np_kinds = {"i": DtypeKind.INT, "u": DtypeKind.UINT, "f": DtypeKind.FLOAT, "b": DtypeKind.BOOL, - "U": DtypeKind.STRING, - "M": DtypeKind.DATETIME, "m": DtypeKind.DATETIME} + _np_kinds = { + "i": DtypeKind.INT, + "u": DtypeKind.UINT, + "f": DtypeKind.FLOAT, + "b": DtypeKind.BOOL, + "U": DtypeKind.STRING, + "M": DtypeKind.DATETIME, + "m": DtypeKind.DATETIME, + } kind = _np_kinds.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. Check if it's a categorical maybe if isinstance(dtype, pd.CategoricalDtype): kind = DtypeKind.CATEGORICAL else: - raise ValueError(f"Data type {dtype} not supported by exchange" - "protocol") - - if kind not in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL, DtypeKind.CATEGORICAL, DtypeKind.STRING): + raise ValueError( + f"Data type {dtype} not supported by exchange" "protocol" + ) + + if kind not in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + DtypeKind.CATEGORICAL, + DtypeKind.STRING, + ): raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else '=' + endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else "=" return (kind, bitwidth, format_str, endianness) - @property def describe_categorical(self): """ @@ -341,8 +387,10 @@ def describe_categorical(self): None if not a dictionary-style categorical. 
""" if not self.dtype[0] == DtypeKind.CATEGORICAL: - raise TypeError("`describe_categorical only works on a column with " - "categorical dtype!") + raise TypeError( + "`describe_categorical only works on a column with " + "categorical dtype!" + ) ordered = self._col.dtype.ordered is_dictionary = True @@ -373,7 +421,9 @@ def describe_null(self): value = -1 elif kind == DtypeKind.STRING: null = 4 - value = 0 # follow Arrow in using 1 as valid value and 0 for missing/null value + value = ( + 0 # follow Arrow in using 1 as valid value and 0 for missing/null value + ) else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") @@ -408,9 +458,11 @@ def get_chunks(self, n_chunks=None): size = len(self._col) step = size // n_chunks if size % n_chunks != 0: - step +=1 + step += 1 for start in range(0, step * n_chunks, step): - yield _PandasColumn(self._col.iloc[start:start + step], self._allow_copy) + yield _PandasColumn( + self._col.iloc[start : start + step], self._allow_copy + ) else: yield self @@ -447,18 +499,23 @@ def get_buffers(self): return buffers - def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple + def _get_data_buffer( + self, + ) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. """ - if self.dtype[0] in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL): - buffer = _PandasBuffer( - self._col.to_numpy(), allow_copy=self._allow_copy) + if self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + buffer = _PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values.codes - buffer = _PandasBuffer( - codes, allow_copy=self._allow_copy) + buffer = _PandasBuffer(codes, allow_copy=self._allow_copy) dtype = self._dtype_from_pandasdtype(codes.dtype) elif self.dtype[0] == DtypeKind.STRING: # Marshal the strings from a NumPy object array into a byte array @@ -474,7 +531,12 @@ def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtyp buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = (DtypeKind.STRING, 8, "u", "=") # note: currently only support native endianness + dtype = ( + DtypeKind.STRING, + 8, + "u", + "=", + ) # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -551,9 +613,16 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: buffer = _PandasBuffer(buf) # Assemble the buffer dtype info - dtype = (DtypeKind.INT, 64, 'l', "=") # note: currently only support native endianness + dtype = ( + DtypeKind.INT, + 64, + "l", + "=", + ) # note: currently only support native endianness else: - raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + raise RuntimeError( + "This column has a fixed-length dtype so does not have an offsets buffer" + ) return buffer, dtype @@ -566,8 +635,10 @@ class _PandasDataFrameXchg(DataFrameXchg): ``pd.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. 
""" - def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, - allow_copy : bool = True) -> None: + + def __init__( + self, df: pd.DataFrame, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. @@ -599,16 +670,16 @@ def column_names(self): return self._df.columns.tolist() def get_column(self, i: int) -> _PandasColumn: - return _PandasColumn( - self._df.iloc[:, i], allow_copy=self._allow_copy) + return _PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _PandasColumn: - return _PandasColumn( - self._df[name], allow_copy=self._allow_copy) + return _PandasColumn(self._df[name], allow_copy=self._allow_copy) def get_columns(self): - return [_PandasColumn(self._df[name], allow_copy=self._allow_copy) - for name in self._df.columns] + return [ + _PandasColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns + ] def select_columns(self, indices): if not isinstance(indices, collections.abc.Sequence): @@ -616,7 +687,9 @@ def select_columns(self, indices): if not isinstance(indices, list): indices = list(indices) - return _PandasDataFrameXchg(self._df.iloc[:, indices], self._nan_as_null, self._allow_copy) + return _PandasDataFrameXchg( + self._df.iloc[:, indices], self._nan_as_null, self._allow_copy + ) def select_columns_by_name(self, names): if not isinstance(names, collections.abc.Sequence): @@ -624,7 +697,9 @@ def select_columns_by_name(self, names): if not isinstance(names, list): names = list(names) - return _PandasDataFrameXchg(self._df.loc[:, names], self._nan_as_null, self._allow_copy) + return _PandasDataFrameXchg( + self._df.loc[:, names], self._nan_as_null, self._allow_copy + ) def get_chunks(self, n_chunks=None): """ @@ -634,8 +709,12 @@ def get_chunks(self, n_chunks=None): size = len(self._df) step = size // n_chunks if size % n_chunks != 0: - step +=1 + step += 1 for start in range(0, step * n_chunks, step): - yield _PandasDataFrameXchg(self._df.iloc[start:start + step, :], self._nan_as_null, self._allow_copy) + yield _PandasDataFrameXchg( + self._df.iloc[start : start + step, :], + self._nan_as_null, + self._allow_copy, + ) else: yield self From 088340638a8872f9e077dd6c9ff51497b7fdb6f6 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 24 Feb 2022 21:26:09 +0300 Subject: [PATCH 19/49] Format exchange tests by black Signed-off-by: Vasily Litvinov --- pandas/tests/api/conftest.py | 10 +++-- pandas/tests/api/test_protocol.py | 64 +++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/pandas/tests/api/conftest.py b/pandas/tests/api/conftest.py index f4e9f4a3fd524..cc5ba8cff11ba 100644 --- a/pandas/tests/api/conftest.py +++ b/pandas/tests/api/conftest.py @@ -2,15 +2,19 @@ import pandas as pd from pandas.api.exchange.implementation import _from_dataframe -@pytest.fixture(scope='package') + +@pytest.fixture(scope="package") def df_from_dict(): def maker(dct, is_categorical=False): df = pd.DataFrame(dct) - return df.astype('category') if is_categorical else df + return df.astype("category") if is_categorical else df + return maker -@pytest.fixture(scope='package') + +@pytest.fixture(scope="package") def df_from_xchg(): def maker(xchg): return _from_dataframe(xchg) + return maker diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/api/test_protocol.py index 5bb7bbc7d0f8c..683bfa7577ce9 100644 --- a/pandas/tests/api/test_protocol.py +++ 
b/pandas/tests/api/test_protocol.py @@ -2,14 +2,16 @@ import math import ctypes -@pytest.mark.parametrize("test_data", - [ - {"a": ["foo", "bar"], - "b": ["baz", "qux"]}, - {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, - {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]} - ], - ids=["str_data", "float_data", "int_data"]) + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) def test_only_one_dtype(test_data, df_from_dict): columns = list(test_data.keys()) df = df_from_dict(test_data) @@ -23,10 +25,16 @@ def test_only_one_dtype(test_data, df_from_dict): def test_float_int(df_from_dict): - df = df_from_dict({"a": [1, 2, 3], "b": [3, 4, 5], - "c": [1.5, 2.5, 3.5], "d": [9, 10, 11], - "e": [True, False, True], - "f": ["a", "", "c"]}) + df = df_from_dict( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": [1.5, 2.5, 3.5], + "d": [9, 10, 11], + "e": [True, False, True], + "f": ["a", "", "c"], + } + ) dfX = df.__dataframe__() columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} @@ -47,6 +55,7 @@ def test_na_float(df_from_dict): colX = dfX.get_column_by_name("a") assert colX.null_count == 1 + def test_noncategorical(df_from_dict): df = df_from_dict({"a": [1, 2, 3]}) dfX = df.__dataframe__() @@ -54,8 +63,12 @@ def test_noncategorical(df_from_dict): with pytest.raises(TypeError): colX.describe_categorical + def test_categorical(df_from_dict): - df = df_from_dict({"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, is_categorical=True) + df = df_from_dict( + {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, + is_categorical=True, + ) colX = df.__dataframe__().get_column_by_name("weekday") is_ordered, is_dictionary, _ = colX.describe_categorical @@ -64,14 +77,20 @@ def test_categorical(df_from_dict): def test_dataframe(df_from_dict): - df = df_from_dict({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}) + df = df_from_dict( + {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]} + ) dfX = df.__dataframe__() assert dfX.num_columns() == 3 assert dfX.num_rows() == 3 assert dfX.num_chunks() == 1 assert dfX.column_names() == ["x", "y", "z"] - assert dfX.select_columns((0, 2)).column_names() == dfX.select_columns_by_name(("x", "z")).column_names() + assert ( + dfX.select_columns((0, 2)).column_names() + == dfX.select_columns_by_name(("x", "z")).column_names() + ) + @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) def test_df_get_chunks(size, n_chunks, df_from_dict): @@ -81,6 +100,7 @@ def test_df_get_chunks(size, n_chunks, df_from_dict): assert len(chunks) == n_chunks assert sum(chunk.num_rows() for chunk in chunks) == size + @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) def test_column_get_chunks(size, n_chunks, df_from_dict): df = df_from_dict({"x": list(range(size))}) @@ -89,6 +109,7 @@ def test_column_get_chunks(size, n_chunks, df_from_dict): assert len(chunks) == n_chunks assert sum(chunk.size for chunk in chunks) == size + def test_get_columns(df_from_dict): df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) dfX = df.__dataframe__() @@ -98,6 +119,7 @@ def test_get_columns(df_from_dict): assert dfX.get_column(0).dtype[0] == 0 assert dfX.get_column(1).dtype[0] == 2 + def test_buffer(df_from_dict): arr = [0, 1, -1] df = df_from_dict({"a": arr}) @@ -113,12 +135,14 @@ def 
test_buffer(df_from_dict): assert dataDtype[0] == 0 - if device == 1: # CPU-only as we're going to directly read memory here + if device == 1: # CPU-only as we're going to directly read memory here bitwidth = dataDtype[1] - ctype = {8: ctypes.c_int8, - 16: ctypes.c_int16, - 32: ctypes.c_int32, - 64: ctypes.c_int64}[bitwidth] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] for idx, truth in enumerate(arr): val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value From 49418d27a7e3e10170b3d949dba861de0e96da01 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 30 Mar 2022 18:37:35 +0300 Subject: [PATCH 20/49] Respond to review - move files around Signed-off-by: Vasily Litvinov --- pandas/api/__init__.py | 1 + pandas/api/exchange/__init__.py | 8 ++++++++ pandas/core/exchange/__init__.py | 0 pandas/{api => core}/exchange/dataframe_protocol.py | 4 ++++ pandas/{api => core}/exchange/implementation.py | 2 +- pandas/core/frame.py | 12 ++++++------ pandas/tests/{api => exchange}/conftest.py | 2 +- pandas/tests/{api => exchange}/test_protocol.py | 3 +++ 8 files changed, 24 insertions(+), 8 deletions(-) create mode 100644 pandas/core/exchange/__init__.py rename pandas/{api => core}/exchange/dataframe_protocol.py (99%) rename pandas/{api => core}/exchange/implementation.py (99%) rename pandas/tests/{api => exchange}/conftest.py (85%) rename pandas/tests/{api => exchange}/test_protocol.py (97%) diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 80202b3569862..67fd722c9198b 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,5 +1,6 @@ """ public toolkit API """ from pandas.api import ( # noqa:F401 + exchange, extensions, indexers, types, diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py index e69de29bb2d1d..dfe86230874d5 100644 --- a/pandas/api/exchange/__init__.py +++ b/pandas/api/exchange/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for DataFrame exchange protocol. 
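+
+A sketch of the intended entry point (``some_producer_df`` standing in for
+any object that implements ``__dataframe__``):
+
+    >>> from pandas.api.exchange import from_dataframe
+    >>> pandas_df = from_dataframe(some_producer_df)  # doctest: +SKIP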
+""" + +from pandas.core.exchange.implementation import from_dataframe +from pandas.core.exchange.dataframe_protocol import DataFrame + +__all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/core/exchange/__init__.py b/pandas/core/exchange/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/api/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py similarity index 99% rename from pandas/api/exchange/dataframe_protocol.py rename to pandas/core/exchange/dataframe_protocol.py index 8633426457f3f..ad31e7d8b6653 100644 --- a/pandas/api/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -1,3 +1,7 @@ +""" +A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api +""" + from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict import enum from abc import ABC, abstractmethod diff --git a/pandas/api/exchange/implementation.py b/pandas/core/exchange/implementation.py similarity index 99% rename from pandas/api/exchange/implementation.py rename to pandas/core/exchange/implementation.py index 52bb7d4c4762d..4ebae9d2b2e56 100644 --- a/pandas/api/exchange/implementation.py +++ b/pandas/core/exchange/implementation.py @@ -3,7 +3,7 @@ from typing import Tuple, Any -from .dataframe_protocol import ( +from pandas.core.exchange.dataframe_protocol import ( Buffer, Column, ColumnNullType, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8948bcceb7f0b..174fdc56ae333 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -204,8 +204,6 @@ nargsort, ) -from pandas.api.exchange.dataframe_protocol import DataFrame as DataFrameXchg - from pandas.io.common import get_handle from pandas.io.formats import ( console, @@ -223,7 +221,7 @@ from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.internals import SingleDataManager from pandas.core.resample import Resampler - + from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -814,8 +812,9 @@ def __init__( NDFrame.__init__(self, mgr) # ---------------------------------------------------------------------- - def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> DataFrameXchg: + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrameXchg: """ Return the dataframe exchange object implementing the exchange protocol. 
@@ -825,7 +824,8 @@ def __dataframe__(self, nan_as_null : bool = False, https://data-apis.org/dataframe-protocol/latest/index.html """ - from pandas.api.exchange.implementation import _PandasDataFrameXchg + from pandas.core.exchange.implementation import _PandasDataFrameXchg + return _PandasDataFrameXchg(self, nan_as_null, allow_copy) # ---------------------------------------------------------------------- diff --git a/pandas/tests/api/conftest.py b/pandas/tests/exchange/conftest.py similarity index 85% rename from pandas/tests/api/conftest.py rename to pandas/tests/exchange/conftest.py index cc5ba8cff11ba..688314ac6eca3 100644 --- a/pandas/tests/api/conftest.py +++ b/pandas/tests/exchange/conftest.py @@ -1,6 +1,6 @@ import pytest import pandas as pd -from pandas.api.exchange.implementation import _from_dataframe +from pandas.core.exchange.implementation import _from_dataframe @pytest.fixture(scope="package") diff --git a/pandas/tests/api/test_protocol.py b/pandas/tests/exchange/test_protocol.py similarity index 97% rename from pandas/tests/api/test_protocol.py rename to pandas/tests/exchange/test_protocol.py index 683bfa7577ce9..22111474f18e7 100644 --- a/pandas/tests/api/test_protocol.py +++ b/pandas/tests/exchange/test_protocol.py @@ -1,3 +1,6 @@ +""" +A verbatim copy (vendored) of the spec tests from https://github.com/data-apis/dataframe-api +""" import pytest import math import ctypes From 78aebaaf1b6f71e2bf4b43b2fb130119cf1d1879 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 15:38:17 +0300 Subject: [PATCH 21/49] Separate buffer and column implementations Signed-off-by: Vasily Litvinov --- pandas/core/exchange/buffer.py | 98 +++++ pandas/core/exchange/column.py | 468 ++++++++++++++++++++ pandas/core/exchange/implementation.py | 583 +------------------------ pandas/core/frame.py | 4 +- 4 files changed, 583 insertions(+), 570 deletions(-) create mode 100644 pandas/core/exchange/buffer.py create mode 100644 pandas/core/exchange/column.py diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py new file mode 100644 index 0000000000000..bce9d21ca1469 --- /dev/null +++ b/pandas/core/exchange/buffer.py @@ -0,0 +1,98 @@ +from pandas.core.exchange.dataframe_protocol import Buffer, DlpackDeviceType, DtypeKind +import numpy as np +from typing import Tuple +import ctypes + + +_NP_DTYPES = { + DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, + DtypeKind.BOOL: {8: bool}, +} + + +class PandasBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + if not x.strides == (x.dtype.itemsize,): + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. + if allow_copy: + x = x.copy() + else: + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. 
+ """ + return self._x.__array_interface__["data"][0] + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) + + +def buffer_to_ndarray(_buffer: Buffer, _dtype) -> np.ndarray: + # Handle the dtype + kind = _dtype[0] + bitwidth = _dtype[1] + if kind not in _NP_DTYPES: + raise RuntimeError(f"Unsupported data type: {kind}") + + column_dtype = _NP_DTYPES[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) + + return x diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py new file mode 100644 index 0000000000000..5611539ef884a --- /dev/null +++ b/pandas/core/exchange/column.py @@ -0,0 +1,468 @@ +from pandas.core.exchange.dataframe_protocol import ( + Column, + DtypeKind, + Buffer, + ColumnNullType, +) +from pandas.core.exchange.buffer import PandasBuffer, buffer_to_ndarray +import pandas as pd +import numpy as np +from typing import Tuple, Any + + +class PandasColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, pd.Series): + raise NotImplementedError(f"Columns of type {type(column)} not handled yet") + + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + # FIXME: chunks are implemented now, this should return something! + return 0 + + @property + def dtype(self): + dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == "O": + return (DtypeKind.STRING, 8, "u", "=") + + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. 
+ """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + _np_kinds = { + "i": DtypeKind.INT, + "u": DtypeKind.UINT, + "f": DtypeKind.FLOAT, + "b": DtypeKind.BOOL, + "U": DtypeKind.STRING, + "M": DtypeKind.DATETIME, + "m": DtypeKind.DATETIME, + } + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy dtype. Check if it's a categorical maybe + if isinstance(dtype, pd.CategoricalDtype): + kind = DtypeKind.CATEGORICAL + else: + raise ValueError( + f"Data type {dtype} not supported by exchange" "protocol" + ) + + if kind not in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + DtypeKind.CATEGORICAL, + DtypeKind.STRING, + ): + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else "=" + return (kind, bitwidth, format_str, endianness) + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + Raises RuntimeError if the dtype is not categorical + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == DtypeKind.CATEGORICAL: + raise TypeError( + "`describe_categorical only works on a column with " + "categorical dtype!" + ) + + ordered = self._col.dtype.ordered + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient + codes = self._col.values.codes # ndarray, length `self.size` + # categories.values is ndarray of length n_categories + categories = self._col.values.categories.values + mapping = {ix: val for ix, val in enumerate(categories)} + return ordered, is_dictionary, mapping + + @property + def describe_null(self): + kind = self.dtype[0] + value = None + if kind == DtypeKind.FLOAT: + null = 1 # np.nan + elif kind == DtypeKind.DATETIME: + null = 1 # np.datetime64('NaT') + elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.BOOL): + # TODO: check if extension dtypes are used once support for them is + # implemented in this protocol code + null = 0 # integer and boolean dtypes are non-nullable + elif kind == DtypeKind.CATEGORICAL: + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + null = 2 + value = -1 + elif kind == DtypeKind.STRING: + null = 4 + value = ( + 0 # follow Arrow in using 1 as valid value and 0 for missing/null value + ) + else: + raise NotImplementedError(f"Data type {self.dtype} not yet supported") + + return null, value + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.isna().sum() + + @property + def metadata(self): + """ + Store specific metadata of the column. + """ + return {} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. 
+ """ + return 1 + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + size = len(self._col) + step = size // n_chunks + if size % n_chunks != 0: + step += 1 + for start in range(0, step * n_chunks, step): + yield PandasColumn( + self._col.iloc[start : start + step], self._allow_copy + ) + else: + yield self + + def get_buffers(self): + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + buffers["data"] = self._get_data_buffer() + try: + buffers["validity"] = self._get_validity_buffer() + except: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except: + buffers["offsets"] = None + + return buffers + + def _get_data_buffer( + self, + ) -> Tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's associated dtype. + """ + if self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) + dtype = self.dtype + elif self.dtype[0] == DtypeKind.CATEGORICAL: + codes = self._col.values.codes + buffer = PandasBuffer(codes, allow_copy=self._allow_copy) + dtype = self._dtype_from_pandasdtype(codes.dtype) + elif self.dtype[0] == DtypeKind.STRING: + # Marshal the strings from a NumPy object array into a byte array + buf = self._col.to_numpy() + b = bytearray() + + # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later + for i in range(buf.size): + if type(buf[i]) == str: + b.extend(buf[i].encode(encoding="utf-8")) + + # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store + buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) + + # Define the dtype for the returned buffer + dtype = ( + DtypeKind.STRING, + 8, + "u", + "=", + ) # note: currently only support native endianness + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer, dtype + + def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. + Raises RuntimeError if null representation is not a bit or byte mask. 
+ """ + null, invalid = self.describe_null + + if self.dtype[0] == DtypeKind.STRING: + # For now, have the mask array be comprised of bytes, rather than a bit array + buf = self._col.to_numpy() + mask = [] + + # Determine the encoding for valid values + if invalid == 0: + valid = 1 + else: + valid = 0 + + for i in range(buf.size): + if type(buf[i]) == str: + v = valid + else: + v = invalid + + mask.append(v) + + # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store + buffer = PandasBuffer(np.asarray(mask, dtype="uint8")) + + # Define the dtype of the returned buffer + dtype = (DtypeKind.UINT, 8, "C", "=") + + return buffer, dtype + + if null == 0: + msg = "This column is non-nullable so does not have a mask" + elif null == 1: + msg = "This column uses NaN as null so does not have a separate mask" + else: + raise NotImplementedError("See self.describe_null") + + raise RuntimeError(msg) + + def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + if self.dtype[0] == DtypeKind.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = [ptr] + for v in values: + # For missing values (in this case, `np.nan` values), we don't increment the pointer) + if type(v) == str: + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets.append(ptr) + + # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + buf = np.asarray(offsets, dtype="int64") + + # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + buffer = PandasBuffer(buf) + + # Assemble the buffer dtype info + dtype = ( + DtypeKind.INT, + 64, + "l", + "=", + ) # note: currently only support native endianness + else: + raise RuntimeError( + "This column has a fixed-length dtype so does not have an offsets buffer" + ) + + return buffer, dtype + + +def convert_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Buffer]: + """ + Convert an int, uint, float or bool column to a numpy array. + """ + if col.describe_null[0] not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_NAN, + ): + raise NotImplementedError( + "Null values represented as masks or " "sentinel values not handled yet" + ) + + _buffer, _dtype = col.get_buffers()["data"] + return buffer_to_ndarray(_buffer, _dtype), _buffer + + +def convert_categorical_column(col: Column) -> Tuple[pd.Series, Buffer]: + """ + Convert a categorical column to a Series instance. 
+ """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + # If you want to cheat for testing (can't use `_col` in real-world code): + # categories = col._col.values.categories.values + # codes = col._col.values.codes + categories = np.asarray(list(mapping.values())) + codes_buffer, codes_dtype = col.get_buffers()["data"] + codes = buffer_to_ndarray(codes_buffer, codes_dtype) + values = categories[codes] + + # Seems like Pandas can only construct with non-null values, so need to + # null out the nulls later + cat = pd.Categorical(values, categories=categories, ordered=ordered) + series = pd.Series(cat) + null_kind = col.describe_null[0] + if null_kind == ColumnNullType.USE_SENTINEL: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = np.nan + elif null_kind != ColumnNullType.NON_NULLABLE: + raise NotImplementedError( + "Only categorical columns with sentinel " "value supported at the moment" + ) + + return series, codes_buffer + + +def convert_string_column(col: Column) -> Tuple[np.ndarray, dict]: + """ + Convert a string column to a NumPy array. + """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + dbuffer, bdtype = buffers["data"] + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + obuffer, odtype = buffers["offsets"] + + # Retrieve the mask buffer indicating the presence of missing values + mbuffer, mdtype = buffers["validity"] + + # Retrieve the missing value encoding + null_kind, null_value = col.describe_null + + # Convert the buffers to NumPy arrays + dt = ( + DtypeKind.UINT, + 8, + None, + None, + ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + dbuf = buffer_to_ndarray(dbuffer, dt) + + obuf = buffer_to_ndarray(obuffer, odtype) + mbuf = buffer_to_ndarray(mbuffer, mdtype) + + # Assemble the strings from the code units + str_list = [] + for i in range(obuf.size - 1): + # Check for missing values + if null_kind == ColumnNullType.USE_BITMASK: + v = mbuf[i // 8] + if null_value == 1: + v = ~v + + if v & (1 << (i % 8)): + str_list.append(np.nan) + continue + + elif null_kind == ColumnNullType.USE_BYTEMASK and mbuf[i] == null_value: + str_list.append(np.nan) + continue + + # Extract a range of code units + units = dbuf[obuf[i] : obuf[i + 1]] + + # Convert the list of code units to bytes + b = bytes(units) + + # Create the string + s = b.decode(encoding="utf-8") + + # Add to our list of strings + str_list.append(s) + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers diff --git a/pandas/core/exchange/implementation.py b/pandas/core/exchange/implementation.py index 4ebae9d2b2e56..060d4cac45cfa 100644 --- a/pandas/core/exchange/implementation.py +++ b/pandas/core/exchange/implementation.py @@ -4,23 +4,18 @@ from typing import Tuple, Any from pandas.core.exchange.dataframe_protocol import ( - Buffer, - Column, - ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, - DlpackDeviceType, ) import pandas as pd -import numpy as np -_NP_DTYPES = { - DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, - DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, - DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, - DtypeKind.BOOL: {8: bool}, -} +from pandas.core.exchange.column 
import ( + convert_column_to_ndarray, + convert_categorical_column, + convert_string_column, + PandasColumn, +) def from_dataframe(df: DataFrameXchg, allow_copy: bool = True) -> pd.DataFrame: @@ -79,555 +74,7 @@ def _from_dataframe(df: DataFrameXchg) -> pd.DataFrame: return df_new -def convert_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Buffer]: - """ - Convert an int, uint, float or bool column to a numpy array. - """ - if col.describe_null[0] not in ( - ColumnNullType.NON_NULLABLE, - ColumnNullType.USE_NAN, - ): - raise NotImplementedError( - "Null values represented as masks or " "sentinel values not handled yet" - ) - - _buffer, _dtype = col.get_buffers()["data"] - return buffer_to_ndarray(_buffer, _dtype), _buffer - - -def buffer_to_ndarray(_buffer: Buffer, _dtype) -> np.ndarray: - # Handle the dtype - kind = _dtype[0] - bitwidth = _dtype[1] - if kind not in _NP_DTYPES: - raise RuntimeError(f"Unsupported data type: {kind}") - - column_dtype = _NP_DTYPES[kind][bitwidth] - - # No DLPack yet, so need to construct a new ndarray from the data pointer - # and size in the buffer plus the dtype on the column - ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) - - # NOTE: `x` does not own its memory, so the caller of this function must - # either make a copy or hold on to a reference of the column or - # buffer! (not done yet, this is pretty awful ...) - x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) - - return x - - -def convert_categorical_column(col: Column) -> Tuple[pd.Series, Buffer]: - """ - Convert a categorical column to a Series instance. - """ - ordered, is_dict, mapping = col.describe_categorical - if not is_dict: - raise NotImplementedError("Non-dictionary categoricals not supported yet") - - # If you want to cheat for testing (can't use `_col` in real-world code): - # categories = col._col.values.categories.values - # codes = col._col.values.codes - categories = np.asarray(list(mapping.values())) - codes_buffer, codes_dtype = col.get_buffers()["data"] - codes = buffer_to_ndarray(codes_buffer, codes_dtype) - values = categories[codes] - - # Seems like Pandas can only construct with non-null values, so need to - # null out the nulls later - cat = pd.Categorical(values, categories=categories, ordered=ordered) - series = pd.Series(cat) - null_kind = col.describe_null[0] - if null_kind == ColumnNullType.USE_SENTINEL: # sentinel value - sentinel = col.describe_null[1] - series[codes == sentinel] = np.nan - elif null_kind != ColumnNullType.NON_NULLABLE: - raise NotImplementedError( - "Only categorical columns with sentinel " "value supported at the moment" - ) - - return series, codes_buffer - - -def convert_string_column(col: Column) -> Tuple[np.ndarray, dict]: - """ - Convert a string column to a NumPy array. 
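The converter above rebuilds Python strings from the byte buffer via the offsets; an inverse hedged sketch of that decoding, self-contained with illustrative data:

    data = b"abc"
    offsets = [0, 2, 2, 3]
    strings = [
        data[offsets[i] : offsets[i + 1]].decode("utf-8")
        for i in range(len(offsets) - 1)
    ]
    assert strings == ["ab", "", "c"]   # a validity mask marks "" as null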
- """ - # Retrieve the data buffers - buffers = col.get_buffers() - - # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = buffers["data"] - - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = buffers["offsets"] - - # Retrieve the mask buffer indicating the presence of missing values - mbuffer, mdtype = buffers["validity"] - - # Retrieve the missing value encoding - null_kind, null_value = col.describe_null - - # Convert the buffers to NumPy arrays - dt = ( - DtypeKind.UINT, - 8, - None, - None, - ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) - dbuf = buffer_to_ndarray(dbuffer, dt) - - obuf = buffer_to_ndarray(obuffer, odtype) - mbuf = buffer_to_ndarray(mbuffer, mdtype) - - # Assemble the strings from the code units - str_list = [] - for i in range(obuf.size - 1): - # Check for missing values - if null_kind == ColumnNullType.USE_BITMASK: - v = mbuf[i // 8] - if null_value == 1: - v = ~v - - if v & (1 << (i % 8)): - str_list.append(np.nan) - continue - - elif null_kind == ColumnNullType.USE_BYTEMASK and mbuf[i] == null_value: - str_list.append(np.nan) - continue - - # Extract a range of code units - units = dbuf[obuf[i] : obuf[i + 1]] - - # Convert the list of code units to bytes - b = bytes(units) - - # Create the string - s = b.decode(encoding="utf-8") - - # Add to our list of strings - str_list.append(s) - - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers - - -# Implementation of interchange protocol -# -------------------------------------- - - -class _PandasBuffer(Buffer): - """ - Data in the buffer is guaranteed to be contiguous in memory. - """ - - def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: - """ - Handle only regular columns (= numpy arrays) for now. - """ - if not x.strides == (x.dtype.itemsize,): - # The protocol does not support strided buffers, so a copy is - # necessary. If that's not allowed, we need to raise an exception. - if allow_copy: - x = x.copy() - else: - raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" - ) - - # Store the numpy array in which the data resides as a private - # attribute, so we can use it to retrieve the public attributes - self._x = x - - @property - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - return self._x.size * self._x.dtype.itemsize - - @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - return self._x.__array_interface__["data"][0] - - def __dlpack__(self): - """ - DLPack not implemented in NumPy yet, so leave it out here. - """ - raise NotImplementedError("__dlpack__") - - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: - """ - Device type and device ID for where the data in the buffer resides. - """ - return (DlpackDeviceType.CPU, None) - - def __repr__(self) -> str: - return ( - "PandasBuffer(" - + str( - { - "bufsize": self.bufsize, - "ptr": self.ptr, - "device": self.__dlpack_device__()[0].name, - } - ) - + ")" - ) - - -class _PandasColumn(Column): - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - A column can contain one or more chunks. 
Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - - def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: - """ - Note: doesn't deal with extension arrays yet, just assume a regular - Series/ndarray for now. - """ - if not isinstance(column, pd.Series): - raise NotImplementedError(f"Columns of type {type(column)} not handled yet") - - # Store the column as a private attribute - self._col = column - self._allow_copy = allow_copy - - @property - def size(self) -> int: - """ - Size of the column, in elements. - """ - return self._col.size - - @property - def offset(self) -> int: - """ - Offset of first element. Always zero. - """ - # FIXME: chunks are implemented now, this should return something! - return 0 - - @property - def dtype(self): - dtype = self._col.dtype - - # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings - if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == "O": - return (DtypeKind.STRING, 8, "u", "=") - - return self._dtype_from_pandasdtype(dtype) - - def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: - """ - See `self.dtype` for details. - """ - # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) - _np_kinds = { - "i": DtypeKind.INT, - "u": DtypeKind.UINT, - "f": DtypeKind.FLOAT, - "b": DtypeKind.BOOL, - "U": DtypeKind.STRING, - "M": DtypeKind.DATETIME, - "m": DtypeKind.DATETIME, - } - kind = _np_kinds.get(dtype.kind, None) - if kind is None: - # Not a NumPy dtype. Check if it's a categorical maybe - if isinstance(dtype, pd.CategoricalDtype): - kind = DtypeKind.CATEGORICAL - else: - raise ValueError( - f"Data type {dtype} not supported by exchange" "protocol" - ) - - if kind not in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - DtypeKind.CATEGORICAL, - DtypeKind.STRING, - ): - raise NotImplementedError(f"Data type {dtype} not handled yet") - - bitwidth = dtype.itemsize * 8 - format_str = dtype.str - endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else "=" - return (kind, bitwidth, format_str, endianness) - - @property - def describe_categorical(self): - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. - Raises RuntimeError if the dtype is not categorical - Content of returned dict: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of - categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. - """ - if not self.dtype[0] == DtypeKind.CATEGORICAL: - raise TypeError( - "`describe_categorical only works on a column with " - "categorical dtype!" 
- ) - - ordered = self._col.dtype.ordered - is_dictionary = True - # NOTE: this shows the children approach is better, transforming - # `categories` to a "mapping" dict is inefficient - codes = self._col.values.codes # ndarray, length `self.size` - # categories.values is ndarray of length n_categories - categories = self._col.values.categories.values - mapping = {ix: val for ix, val in enumerate(categories)} - return ordered, is_dictionary, mapping - - @property - def describe_null(self): - kind = self.dtype[0] - value = None - if kind == DtypeKind.FLOAT: - null = 1 # np.nan - elif kind == DtypeKind.DATETIME: - null = 1 # np.datetime64('NaT') - elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.BOOL): - # TODO: check if extension dtypes are used once support for them is - # implemented in this protocol code - null = 0 # integer and boolean dtypes are non-nullable - elif kind == DtypeKind.CATEGORICAL: - # Null values for categoricals are stored as `-1` sentinel values - # in the category date (e.g., `col.values.codes` is int8 np.ndarray) - null = 2 - value = -1 - elif kind == DtypeKind.STRING: - null = 4 - value = ( - 0 # follow Arrow in using 1 as valid value and 0 for missing/null value - ) - else: - raise NotImplementedError(f"Data type {self.dtype} not yet supported") - - return null, value - - @property - def null_count(self) -> int: - """ - Number of null elements. Should always be known. - """ - return self._col.isna().sum() - - @property - def metadata(self): - """ - Store specific metadata of the column. - """ - return {} - - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - return 1 - - def get_chunks(self, n_chunks=None): - """ - Return an iterator yielding the chunks. - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - if n_chunks and n_chunks > 1: - size = len(self._col) - step = size // n_chunks - if size % n_chunks != 0: - step += 1 - for start in range(0, step * n_chunks, step): - yield _PandasColumn( - self._col.iloc[start : start + step], self._allow_copy - ) - else: - yield self - - def get_buffers(self): - """ - Return a dictionary containing the underlying buffers. - The returned dictionary has the following contents: - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ - buffers = {} - buffers["data"] = self._get_data_buffer() - try: - buffers["validity"] = self._get_validity_buffer() - except: - buffers["validity"] = None - - try: - buffers["offsets"] = self._get_offsets_buffer() - except: - buffers["offsets"] = None - - return buffers - - def _get_data_buffer( - self, - ) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple - """ - Return the buffer containing the data and the buffer's associated dtype. 
- """ - if self.dtype[0] in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - ): - buffer = _PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) - dtype = self.dtype - elif self.dtype[0] == DtypeKind.CATEGORICAL: - codes = self._col.values.codes - buffer = _PandasBuffer(codes, allow_copy=self._allow_copy) - dtype = self._dtype_from_pandasdtype(codes.dtype) - elif self.dtype[0] == DtypeKind.STRING: - # Marshal the strings from a NumPy object array into a byte array - buf = self._col.to_numpy() - b = bytearray() - - # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later - for i in range(buf.size): - if type(buf[i]) == str: - b.extend(buf[i].encode(encoding="utf-8")) - - # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store - buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) - - # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - "u", - "=", - ) # note: currently only support native endianness - else: - raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") - - return buffer, dtype - - def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: - """ - Return the buffer containing the mask values indicating missing data and - the buffer's associated dtype. - Raises RuntimeError if null representation is not a bit or byte mask. - """ - null, invalid = self.describe_null - - if self.dtype[0] == DtypeKind.STRING: - # For now, have the mask array be comprised of bytes, rather than a bit array - buf = self._col.to_numpy() - mask = [] - - # Determine the encoding for valid values - if invalid == 0: - valid = 1 - else: - valid = 0 - - for i in range(buf.size): - if type(buf[i]) == str: - v = valid - else: - v = invalid - - mask.append(v) - - # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store - buffer = _PandasBuffer(np.asarray(mask, dtype="uint8")) - - # Define the dtype of the returned buffer - dtype = (DtypeKind.UINT, 8, "C", "=") - - return buffer, dtype - - if null == 0: - msg = "This column is non-nullable so does not have a mask" - elif null == 1: - msg = "This column uses NaN as null so does not have a separate mask" - else: - raise NotImplementedError("See self.describe_null") - - raise RuntimeError(msg) - - def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: - """ - Return the buffer containing the offset values for variable-size binary - data (e.g., variable-length strings) and the buffer's associated dtype. - Raises RuntimeError if the data buffer does not have an associated - offsets buffer. 
- """ - if self.dtype[0] == DtypeKind.STRING: - # For each string, we need to manually determine the next offset - values = self._col.to_numpy() - ptr = 0 - offsets = [ptr] - for v in values: - # For missing values (in this case, `np.nan` values), we don't increment the pointer) - if type(v) == str: - b = v.encode(encoding="utf-8") - ptr += len(b) - - offsets.append(ptr) - - # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) - buf = np.asarray(offsets, dtype="int64") - - # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store - buffer = _PandasBuffer(buf) - - # Assemble the buffer dtype info - dtype = ( - DtypeKind.INT, - 64, - "l", - "=", - ) # note: currently only support native endianness - else: - raise RuntimeError( - "This column has a fixed-length dtype so does not have an offsets buffer" - ) - - return buffer, dtype - - -class _PandasDataFrameXchg(DataFrameXchg): +class PandasDataFrameXchg(DataFrameXchg): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -669,15 +116,15 @@ def num_chunks(self) -> int: def column_names(self): return self._df.columns.tolist() - def get_column(self, i: int) -> _PandasColumn: - return _PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) + def get_column(self, i: int) -> PandasColumn: + return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) - def get_column_by_name(self, name: str) -> _PandasColumn: - return _PandasColumn(self._df[name], allow_copy=self._allow_copy) + def get_column_by_name(self, name: str) -> PandasColumn: + return PandasColumn(self._df[name], allow_copy=self._allow_copy) def get_columns(self): return [ - _PandasColumn(self._df[name], allow_copy=self._allow_copy) + PandasColumn(self._df[name], allow_copy=self._allow_copy) for name in self._df.columns ] @@ -687,7 +134,7 @@ def select_columns(self, indices): if not isinstance(indices, list): indices = list(indices) - return _PandasDataFrameXchg( + return PandasDataFrameXchg( self._df.iloc[:, indices], self._nan_as_null, self._allow_copy ) @@ -697,7 +144,7 @@ def select_columns_by_name(self, names): if not isinstance(names, list): names = list(names) - return _PandasDataFrameXchg( + return PandasDataFrameXchg( self._df.loc[:, names], self._nan_as_null, self._allow_copy ) @@ -711,7 +158,7 @@ def get_chunks(self, n_chunks=None): if size % n_chunks != 0: step += 1 for start in range(0, step * n_chunks, step): - yield _PandasDataFrameXchg( + yield PandasDataFrameXchg( self._df.iloc[start : start + step, :], self._nan_as_null, self._allow_copy, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 174fdc56ae333..b8e0fca21f54c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -824,9 +824,9 @@ def __dataframe__( https://data-apis.org/dataframe-protocol/latest/index.html """ - from pandas.core.exchange.implementation import _PandasDataFrameXchg + from pandas.core.exchange.implementation import PandasDataFrameXchg - return _PandasDataFrameXchg(self, nan_as_null, allow_copy) + return PandasDataFrameXchg(self, nan_as_null, allow_copy) # ---------------------------------------------------------------------- From 1b64ae2b1672e22eae636474b0d0d2b4026b0ed4 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 16:56:47 +0300 Subject: [PATCH 22/49] Mimick what Modin did Signed-off-by: Vasily Litvinov --- pandas/api/exchange/__init__.py | 2 +- 
pandas/core/exchange/buffer.py | 31 -- pandas/core/exchange/column.py | 279 +++------- .../{implementation.py => dataframe.py} | 76 +-- pandas/core/exchange/from_dataframe.py | 481 ++++++++++++++++++ pandas/core/exchange/utils.py | 102 ++++ pandas/core/frame.py | 2 +- pandas/tests/exchange/conftest.py | 2 +- 8 files changed, 664 insertions(+), 311 deletions(-) rename pandas/core/exchange/{implementation.py => dataframe.py} (57%) create mode 100644 pandas/core/exchange/from_dataframe.py create mode 100644 pandas/core/exchange/utils.py diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py index dfe86230874d5..f81c3a90d3506 100644 --- a/pandas/api/exchange/__init__.py +++ b/pandas/api/exchange/__init__.py @@ -2,7 +2,7 @@ Public API for DataFrame exchange protocol. """ -from pandas.core.exchange.implementation import from_dataframe +from pandas.core.exchange.from_dataframe import from_dataframe from pandas.core.exchange.dataframe_protocol import DataFrame __all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py index bce9d21ca1469..e9d14852eedf1 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/exchange/buffer.py @@ -1,15 +1,6 @@ from pandas.core.exchange.dataframe_protocol import Buffer, DlpackDeviceType, DtypeKind import numpy as np from typing import Tuple -import ctypes - - -_NP_DTYPES = { - DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, - DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, - DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, - DtypeKind.BOOL: {8: bool}, -} class PandasBuffer(Buffer): @@ -74,25 +65,3 @@ def __repr__(self) -> str: ) + ")" ) - - -def buffer_to_ndarray(_buffer: Buffer, _dtype) -> np.ndarray: - # Handle the dtype - kind = _dtype[0] - bitwidth = _dtype[1] - if kind not in _NP_DTYPES: - raise RuntimeError(f"Unsupported data type: {kind}") - - column_dtype = _NP_DTYPES[kind][bitwidth] - - # No DLPack yet, so need to construct a new ndarray from the data pointer - # and size in the buffer plus the dtype on the column - ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) - - # NOTE: `x` does not own its memory, so the caller of this function must - # either make a copy or hold on to a reference of the column or - # buffer! (not done yet, this is pretty awful ...) 
- x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) - - return x diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 5611539ef884a..8dae3a1f13662 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -5,9 +5,35 @@ ColumnNullType, ) from pandas.core.exchange.buffer import PandasBuffer, buffer_to_ndarray +from pandas.core.exchange.utils import ArrowCTypes, Endianness, dtype_to_arrow_c_fmt +from pandas.api.types import is_categorical_dtype, is_string_dtype import pandas as pd import numpy as np from typing import Tuple, Any +from functools import cached_property + +_NP_KINDS = { + "i": DtypeKind.INT, + "u": DtypeKind.UINT, + "f": DtypeKind.FLOAT, + "b": DtypeKind.BOOL, + "U": DtypeKind.STRING, + "M": DtypeKind.DATETIME, + "m": DtypeKind.DATETIME, +} + +_NULL_DESCRIPTION = { + DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None), + DtypeKind.DATETIME: (ColumnNullType.USE_NAN, None), + DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None), + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1), + # follow Arrow in using 1 as valid value and 0 for missing/null value + DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0), +} class PandasColumn(Column): @@ -46,18 +72,31 @@ def offset(self) -> int: """ Offset of first element. Always zero. """ - # FIXME: chunks are implemented now, this should return something! + # TODO: chunks are implemented now, probably this should return something return 0 - @property + @cached_property def dtype(self): dtype = self._col.dtype - # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings - if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == "O": - return (DtypeKind.STRING, 8, "u", "=") - - return self._dtype_from_pandasdtype(dtype) + if is_categorical_dtype(dtype): + codes = self._col.values.codes + ( + _, + bitwidth, + c_arrow_dtype_f_str, + _, + ) = self._dtype_from_pandasdtype(codes.dtype) + return ( + DtypeKind.CATEGORICAL, + bitwidth, + c_arrow_dtype_f_str, + "=", + ) + elif is_string_dtype(dtype): + return (DtypeKind.STRING, 8, dtype_to_arrow_c_fmt(dtype), "=") + else: + return self._dtype_from_pandasdtype(dtype) def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: """ @@ -66,39 +105,13 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[DtypeKind, int, str, str]: # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) - _np_kinds = { - "i": DtypeKind.INT, - "u": DtypeKind.UINT, - "f": DtypeKind.FLOAT, - "b": DtypeKind.BOOL, - "U": DtypeKind.STRING, - "M": DtypeKind.DATETIME, - "m": DtypeKind.DATETIME, - } - kind = _np_kinds.get(dtype.kind, None) + + kind = _NP_KINDS.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. 
Check if it's a categorical maybe - if isinstance(dtype, pd.CategoricalDtype): - kind = DtypeKind.CATEGORICAL - else: - raise ValueError( - f"Data type {dtype} not supported by exchange" "protocol" - ) - - if kind not in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - DtypeKind.CATEGORICAL, - DtypeKind.STRING, - ): - raise NotImplementedError(f"Data type {dtype} not handled yet") + raise ValueError(f"Data type {dtype} not supported by exchange protocol") - bitwidth = dtype.itemsize * 8 - format_str = dtype.str - endianness = dtype.byteorder if not kind == DtypeKind.CATEGORICAL else "=" - return (kind, bitwidth, format_str, endianness) + return (kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder) @property def describe_categorical(self): @@ -121,44 +134,23 @@ def describe_categorical(self): "categorical dtype!" ) - ordered = self._col.dtype.ordered - is_dictionary = True - # NOTE: this shows the children approach is better, transforming - # `categories` to a "mapping" dict is inefficient - codes = self._col.values.codes # ndarray, length `self.size` - # categories.values is ndarray of length n_categories - categories = self._col.values.categories.values - mapping = {ix: val for ix, val in enumerate(categories)} - return ordered, is_dictionary, mapping + return { + "is_ordered": self._col.cat.ordered, + "is_dictionary": True, + "mapping": dict(zip(self._col.cat.codes, self._col.cat.categories)), + } @property def describe_null(self): kind = self.dtype[0] - value = None - if kind == DtypeKind.FLOAT: - null = 1 # np.nan - elif kind == DtypeKind.DATETIME: - null = 1 # np.datetime64('NaT') - elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.BOOL): - # TODO: check if extension dtypes are used once support for them is - # implemented in this protocol code - null = 0 # integer and boolean dtypes are non-nullable - elif kind == DtypeKind.CATEGORICAL: - # Null values for categoricals are stored as `-1` sentinel values - # in the category date (e.g., `col.values.codes` is int8 np.ndarray) - null = 2 - value = -1 - elif kind == DtypeKind.STRING: - null = 4 - value = ( - 0 # follow Arrow in using 1 as valid value and 0 for missing/null value - ) - else: - raise NotImplementedError(f"Data type {self.dtype} not yet supported") + try: + null, value = _NULL_DESCRIPTION[kind] + except KeyError: + raise NotImplementedError(f"Data type {kind} not yet supported") return null, value - @property + @cached_property def null_count(self) -> int: """ Number of null elements. Should always be known. @@ -170,7 +162,7 @@ def metadata(self): """ Store specific metadata of the column. 
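The mapping returned by describe_categorical above is positional, dictionary code -> category; a minimal hedged sketch of that shape:

    import pandas as pd

    s = pd.Series(["a", "b", "a"], dtype="category")
    mapping = dict(enumerate(s.cat.categories))   # {0: "a", 1: "b"}
    assert mapping == {0: "a", 1: "b"}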
""" - return {} + return {"index": self._col.index} def num_chunks(self) -> int: """ @@ -252,9 +244,9 @@ def _get_data_buffer( b = bytearray() # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later - for i in range(buf.size): - if type(buf[i]) == str: - b.extend(buf[i].encode(encoding="utf-8")) + for obj in buf: + if isinstance(obj, str): + b.extend(obj.encode(encoding="utf-8")) # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) @@ -263,8 +255,8 @@ def _get_data_buffer( dtype = ( DtypeKind.STRING, 8, - "u", - "=", + ArrowCTypes.STRING, + Endianness.NATIVE, ) # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -285,24 +277,15 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: mask = [] # Determine the encoding for valid values - if invalid == 0: - valid = 1 - else: - valid = 0 + valid = 1 if invalid == 0 else 0 - for i in range(buf.size): - if type(buf[i]) == str: - v = valid - else: - v = invalid - - mask.append(v) + mask = [valid if isinstance(obj, str) else invalid for obj in buf] # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store buffer = PandasBuffer(np.asarray(mask, dtype="uint8")) # Define the dtype of the returned buffer - dtype = (DtypeKind.UINT, 8, "C", "=") + dtype = (DtypeKind.UINT, 8, ArrowCTypes.UINT8, Endianness.NATIVE) return buffer, dtype @@ -326,14 +309,14 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: # For each string, we need to manually determine the next offset values = self._col.to_numpy() ptr = 0 - offsets = [ptr] - for v in values: + offsets = [ptr] + [None] * len(values) + for i, v in enumerate(values): # For missing values (in this case, `np.nan` values), we don't increment the pointer) - if type(v) == str: + if isinstance(v, str): b = v.encode(encoding="utf-8") ptr += len(b) - offsets.append(ptr) + offsets[i + 1] = ptr # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) buf = np.asarray(offsets, dtype="int64") @@ -345,8 +328,8 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: dtype = ( DtypeKind.INT, 64, - "l", - "=", + ArrowCTypes.INT64, + Endianness.NATIVE, ) # note: currently only support native endianness else: raise RuntimeError( @@ -354,115 +337,3 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: ) return buffer, dtype - - -def convert_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Buffer]: - """ - Convert an int, uint, float or bool column to a numpy array. - """ - if col.describe_null[0] not in ( - ColumnNullType.NON_NULLABLE, - ColumnNullType.USE_NAN, - ): - raise NotImplementedError( - "Null values represented as masks or " "sentinel values not handled yet" - ) - - _buffer, _dtype = col.get_buffers()["data"] - return buffer_to_ndarray(_buffer, _dtype), _buffer - - -def convert_categorical_column(col: Column) -> Tuple[pd.Series, Buffer]: - """ - Convert a categorical column to a Series instance. 
- """ - ordered, is_dict, mapping = col.describe_categorical - if not is_dict: - raise NotImplementedError("Non-dictionary categoricals not supported yet") - - # If you want to cheat for testing (can't use `_col` in real-world code): - # categories = col._col.values.categories.values - # codes = col._col.values.codes - categories = np.asarray(list(mapping.values())) - codes_buffer, codes_dtype = col.get_buffers()["data"] - codes = buffer_to_ndarray(codes_buffer, codes_dtype) - values = categories[codes] - - # Seems like Pandas can only construct with non-null values, so need to - # null out the nulls later - cat = pd.Categorical(values, categories=categories, ordered=ordered) - series = pd.Series(cat) - null_kind = col.describe_null[0] - if null_kind == ColumnNullType.USE_SENTINEL: # sentinel value - sentinel = col.describe_null[1] - series[codes == sentinel] = np.nan - elif null_kind != ColumnNullType.NON_NULLABLE: - raise NotImplementedError( - "Only categorical columns with sentinel " "value supported at the moment" - ) - - return series, codes_buffer - - -def convert_string_column(col: Column) -> Tuple[np.ndarray, dict]: - """ - Convert a string column to a NumPy array. - """ - # Retrieve the data buffers - buffers = col.get_buffers() - - # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = buffers["data"] - - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = buffers["offsets"] - - # Retrieve the mask buffer indicating the presence of missing values - mbuffer, mdtype = buffers["validity"] - - # Retrieve the missing value encoding - null_kind, null_value = col.describe_null - - # Convert the buffers to NumPy arrays - dt = ( - DtypeKind.UINT, - 8, - None, - None, - ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) - dbuf = buffer_to_ndarray(dbuffer, dt) - - obuf = buffer_to_ndarray(obuffer, odtype) - mbuf = buffer_to_ndarray(mbuffer, mdtype) - - # Assemble the strings from the code units - str_list = [] - for i in range(obuf.size - 1): - # Check for missing values - if null_kind == ColumnNullType.USE_BITMASK: - v = mbuf[i // 8] - if null_value == 1: - v = ~v - - if v & (1 << (i % 8)): - str_list.append(np.nan) - continue - - elif null_kind == ColumnNullType.USE_BYTEMASK and mbuf[i] == null_value: - str_list.append(np.nan) - continue - - # Extract a range of code units - units = dbuf[obuf[i] : obuf[i + 1]] - - # Convert the list of code units to bytes - b = bytes(units) - - # Create the string - s = b.decode(encoding="utf-8") - - # Add to our list of strings - str_list.append(s) - - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers diff --git a/pandas/core/exchange/implementation.py b/pandas/core/exchange/dataframe.py similarity index 57% rename from pandas/core/exchange/implementation.py rename to pandas/core/exchange/dataframe.py index 060d4cac45cfa..14e07fc2fafed 100644 --- a/pandas/core/exchange/implementation.py +++ b/pandas/core/exchange/dataframe.py @@ -1,77 +1,7 @@ -import collections.abc -import ctypes - -from typing import Tuple, Any - -from pandas.core.exchange.dataframe_protocol import ( - DataFrame as DataFrameXchg, - DtypeKind, -) - +from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.exchange.column import PandasColumn import pandas as pd - -from pandas.core.exchange.column import ( - convert_column_to_ndarray, - 
convert_categorical_column, - convert_string_column, - PandasColumn, -) - - -def from_dataframe(df: DataFrameXchg, allow_copy: bool = True) -> pd.DataFrame: - """ - Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` - """ - if isinstance(df, pd.DataFrame): - return df - - if not hasattr(df, "__dataframe__"): - raise ValueError("`df` does not support __dataframe__") - - return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) - - -def _from_dataframe(df: DataFrameXchg) -> pd.DataFrame: - """ - Note: not all cases are handled yet, only ones that can be implemented with - only Pandas. Later, we need to implement/test support for categoricals, - bit/byte masks, chunk handling, etc. - """ - _buffers = [] # hold on to buffers, keeps memory alive - result = [] - for chunk in df.get_chunks(): - # We need a dict of columns here, with each column being a numpy array (at - # least for now, deal with non-numpy dtypes later). - chunk_cols = {} - for name in chunk.column_names(): - if not isinstance(name, str): - raise ValueError(f"Column {name} is not a string") - if name in chunk_cols: - raise ValueError(f"Column {name} is not unique") - col = chunk.get_column_by_name(name) - if col.dtype[0] in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - ): - # Simple numerical or bool dtype, turn into numpy array - chunk_cols[name], _buf = convert_column_to_ndarray(col) - elif col.dtype[0] == DtypeKind.CATEGORICAL: - chunk_cols[name], _buf = convert_categorical_column(col) - elif col.dtype[0] == DtypeKind.STRING: - chunk_cols[name], _buf = convert_string_column(col) - else: - raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") - - _buffers.append(_buf) - - df_new = pd.DataFrame(chunk_cols) - result.append(df_new) - - df_new = pd.concat(result) - df_new._buffers = _buffers - return df_new +import collections.abc class PandasDataFrameXchg(DataFrameXchg): diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py new file mode 100644 index 0000000000000..09bedd8005567 --- /dev/null +++ b/pandas/core/exchange/from_dataframe.py @@ -0,0 +1,481 @@ +from pandas.core.exchange.dataframe_protocol import ( + Buffer, + Column, + DataFrame, + DtypeKind, + ColumnNullType, +) +from pandas.core.exchange.utils import ArrowCTypes, Endianness +import numpy as np +import pandas as pd +import ctypes +import re +from typing import Tuple, Optional, Any, Union + +_NP_DTYPES = { + DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, + DtypeKind.BOOL: {8: bool}, +} + + +def from_dataframe(df, allow_copy=True): + if isinstance(df, pd.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrame, allow_copy=True): + """ + Build a ``pd.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. + + Parameters + ---------- + df : DataFrame + Object supporting the exchange protocol, i.e. `__dataframe__` method. + n_chunks : int, optional + Number of chunks to split `df`. 
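A hedged usage sketch of the public entry point defined here; third_party_df stands for a hypothetical object from another library that implements __dataframe__:

    import pandas as pd

    from pandas.api.exchange import from_dataframe  # public path per this patch

    df = pd.DataFrame({"x": [1.5, None, 3.0]})
    assert from_dataframe(df) is df   # pandas input is passed through unchanged
    # Any other exporter goes through the protocol:
    # pandas_df = from_dataframe(third_party_df)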
+ + Returns + ------- + pd.DataFrame + """ + pandas_dfs = [] + for chunk in df.get_chunks(): + pandas_df = protocol_df_chunk_to_pandas(chunk) + pandas_dfs.append(pandas_df) + + if not allow_copy and len(pandas_dfs) > 1: + raise RuntimeError( + "To join chunks a copy is required which is forbidden by allow_copy=False" + ) + if len(pandas_dfs) == 1: + pandas_df = pandas_dfs[0] + else: + pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True) + + index_obj = df.metadata.get("pandas.index", None) + if index_obj is not None: + pandas_df.index = index_obj + + return pandas_df + + +def protocol_df_chunk_to_pandas(df): + """ + Convert exchange protocol chunk to ``pd.DataFrame``. + + Parameters + ---------- + df : DataFrame + + Returns + ------- + pd.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). + columns = dict() + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DtypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DtypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DtypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pandas_df = pd.DataFrame(columns) + pandas_df._buffers = buffers + return pandas_df + + +def primitive_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. + """ + buffers = col.get_buffers() + + data_buff, data_dtype = buffers["data"] + data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: + """ + Convert a column holding categorical data to a pandas Series. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of pd.Series holding the data and the memory owner object that keeps the memory alive. 
+ """ + ordered, is_dict, mapping = col.describe_categorical.values() + + if not is_dict: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + categories = np.array(tuple(mapping.values())) + buffers = col.get_buffers() + + codes_buff, codes_dtype = buffers["data"] + codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size) + + # Doing module in order to not get ``IndexError`` for out-of-bounds sentinel values in `codes` + values = categories[codes % len(categories)] + + cat = pd.Categorical(values, categories=categories, ordered=ordered) + data = pd.Series(cat) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column holding string data to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. + """ + null_kind, sentinel_val = col.describe_null + + if null_kind not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_BITMASK, + ColumnNullType.USE_BYTEMASK, + ): + raise NotImplementedError( + f"{null_kind} null kind is not yet supported for string columns." + ) + + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + data_buff, protocol_data_dtype = buffers["data"] + # We're going to reinterpret the buffer as uint8, so making sure we can do it safely + assert protocol_data_dtype[1] == 8 # bitwidth == 8 + assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8 + # Convert the buffers to NumPy arrays, in order to go from STRING to an equivalent ndarray, + # we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) + # Specify zero offset as we don't want to chunk the string data + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + offset_buff, offset_dtype = buffers["offsets"] + # Offsets buffer contains start-stop positions of strings in the data buffer, + # meaning that it has more elements than in the data buffer, do `col.size + 1` here + # to pass a proper offsets buffer size + offsets = buffer_to_ndarray( + offset_buff, offset_dtype, col.offset, length=col.size + 1 + ) + + null_pos = None + if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + valid_buff, valid_dtype = buffers["validity"] + null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) + if sentinel_val == 0: + null_pos = ~null_pos + + # Assemble the strings from the code units + str_list = [None] * col.size + for i in range(col.size): + # Check for missing values + if null_pos is not None and null_pos[i]: + str_list[i] = np.nan + continue + + # Extract a range of code units + units = data[offsets[i] : offsets[i + 1]] + + # Convert the list of code units to bytes + str_bytes = bytes(units) + + # Create the string + string = str_bytes.decode(encoding="utf-8") + + # Add to our list of strings + str_list[i] = string + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers + + +def parse_datetime_format_str(format_str, data): + """Parse datetime `format_str` to interpret the `data`.""" + # timestamp 'ts{unit}:tz' + timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) + if 
timestamp_meta: + unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) + if tz != "": + raise NotImplementedError("Timezones are not supported yet") + if unit != "s": + # the format string describes only a first letter of the unit, add one extra + # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + unit += "s" + data = data.astype(f"datetime64[{unit}]") + return data + + # date 'td{Days/Ms}' + date_meta = re.match(r"td([Dm])", format_str) + if date_meta: + unit = date_meta.group(1) + if unit == "D": + # NumPy doesn't support DAY unit, so converting days to seconds + # (converting to uint64 to avoid overflow) + data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") + elif unit == "m": + data = data.astype("datetime64[ms]") + else: + raise NotImplementedError(f"Date unit is not supported: {unit}") + return data + + raise NotImplementedError(f"DateTime kind is not supported: {format_str}") + + +def datetime_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: + """ + Convert a column holding DateTime data to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. + """ + buffers = col.get_buffers() + + _, _, format_str, _ = col.dtype + dbuf, dtype = buffers["data"] + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( + dbuf, + ( + DtypeKind.UINT, + dtype[1], + getattr(ArrowCTypes, f"UINT{dtype[1]}"), + Endianness.NATIVE, + ), + col.offset, + col.size, + ) + + data = parse_datetime_format_str(format_str, data) + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def buffer_to_ndarray( + buffer: Buffer, + dtype: Tuple[DtypeKind, int, str, str], + offset: int = 0, + length: Optional[int] = None, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + + Parameters + ---------- + buffer : Buffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + + Returns + ------- + np.ndarray + + Notes + ----- + The returned array doesn't own the memory. A user of the function must keep the memory + owner object alive as long as the returned NumPy array is being used. + """ + kind, bit_width, _, _ = dtype + + column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None) + if column_dtype is None: + raise NotImplementedError(f"Convertion for {dtype} is not yet supported.") + + # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports + # it since https://github.com/numpy/numpy/pull/19083 + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) + + if bit_width == 1: + assert length is not None, "`length` must be specified for a bit-mask buffer." 
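+        # A bit-mask buffer is read below as one uint8 element per byte of
+        # `buffer.bufsize`; bitmask_to_bool_ndarray() then unpacks the first
+        # `length` bits into one boolean per value, honoring the bit offset
+        # inside the first byte.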
+ arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) + return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) + else: + return np.ctypeslib.as_array( + data_pointer, shape=(buffer.bufsize // (bit_width // 8),) + ) + + +def bitmask_to_bool_ndarray( + bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 +) -> np.ndarray: + """ + Convert bit-mask to a boolean NumPy array. + + Parameters + ---------- + bitmask : np.ndarray[uint8] + NumPy array of uint8 dtype representing the bitmask. + mask_length : int + Number of elements in the mask to interpret. + first_byte_offset : int, default: 0 + Number of elements to offset from the start of the first byte. + + Returns + ------- + np.ndarray[bool] + """ + bytes_to_skip = first_byte_offset // 8 + bitmask = bitmask[bytes_to_skip:] + first_byte_offset %= 8 + + bool_mask = np.zeros(mask_length, dtype=bool) + + # Proccessing the first byte separately as it has its own offset + val = bitmask[0] + mask_idx = 0 + bits_in_first_byte = min(8 - first_byte_offset, mask_length) + for j in range(bits_in_first_byte): + if val & (1 << (j + first_byte_offset)): + bool_mask[mask_idx] = True + mask_idx += 1 + + # `mask_length // 8` describes how many full bytes to process + for i in range((mask_length - bits_in_first_byte) // 8): + # doing `+ 1` as we already processed the first byte + val = bitmask[i + 1] + for j in range(8): + if val & (1 << j): + bool_mask[mask_idx] = True + mask_idx += 1 + + if len(bitmask) > 1: + # Processing reminder of last byte + val = bitmask[-1] + for j in range(len(bool_mask) - mask_idx): + if val & (1 << j): + bool_mask[mask_idx] = True + mask_idx += 1 + + return bool_mask + + +def set_nulls( + data: Union[np.ndarray, pd.Series], + col: Column, + validity: Tuple[Buffer, Tuple[DtypeKind, int, str, str]], + allow_modify_inplace: bool = True, +): + """ + Set null values for the data according to the column null kind. + + Parameters + ---------- + data : np.ndarray or pd.Series + Data to set nulls in. + col : Column + Column object that describes the `data`. + validity : tuple(Buffer, dtype) or None + The return value of ``col.buffers()``. We do not access the ``col.buffers()`` + here to not take the ownership of the memory of buffer objects. + allow_modify_inplace : bool, default: True + Whether to modify the `data` inplace when zero-copy is possible (True) or always + modify a copy of the `data` (False). + + Returns + ------- + np.ndarray or pd.Series + Data with the nulls being set. + """ + null_kind, sentinel_val = col.describe_null + null_pos = None + + if null_kind == ColumnNullType.USE_SENTINEL: + null_pos = data == sentinel_val + elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) + if sentinel_val == 0: + null_pos = ~null_pos + elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): + pass + else: + raise NotImplementedError(f"Null kind {null_kind} is not yet supported.") + + if null_pos is not None and np.any(null_pos): + if not allow_modify_inplace: + data = data.copy() + try: + data[null_pos] = None + except TypeError: + # TypeError happens if the `data` dtype appears to be non-nullable in numpy notation + # (bool, int, uint), if such happens, cast the `data` to nullable float dtype. 
+            data = data.astype(float)
+            data[null_pos] = None
+
+    return data
diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py
new file mode 100644
index 0000000000000..a60a26aabbb39
--- /dev/null
+++ b/pandas/core/exchange/utils.py
@@ -0,0 +1,102 @@
+"""
+Utility functions and objects for implementing the exchange API.
+"""
+
+import enum
+import pandas as pd
+import re
+import numpy as np
+from pandas.api.types import is_datetime64_dtype
+
+
+@enum.unique
+class ArrowCTypes(enum.Enum):
+    """
+    Enum for Apache Arrow C type format strings.
+
+    The Arrow C data interface:
+    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
+    """
+
+    NULL = "n"
+    BOOL = "b"
+    INT8 = "c"
+    UINT8 = "C"
+    INT16 = "s"
+    UINT16 = "S"
+    INT32 = "i"
+    UINT32 = "I"
+    INT64 = "l"
+    UINT64 = "L"
+    FLOAT16 = "e"
+    FLOAT32 = "f"
+    FLOAT64 = "g"
+    STRING = "u"  # utf-8
+    DATE32 = "tdD"
+    DATE64 = "tdm"
+    # Resoulution:
+    #   - seconds -> 's'
+    #   - miliseconds -> 'm'
+    #   - microseconds -> 'u'
+    #   - nanoseconds -> 'n'
+    TIMESTAMP = "ts{resolution}:{tz}"
+    TIME = "tt{resolution}"
+
+
+class Endianness:
+    """Enum indicating the byte-order of a data-type."""
+
+    LITTLE = "<"
+    BIG = ">"
+    NATIVE = "="
+    NA = "|"
+
+
+def dtype_to_arrow_c_fmt(dtype) -> str:
+    """
+    Represent pandas `dtype` as a format string in Apache Arrow C notation.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+        Datatype of pandas DataFrame to represent.
+
+    Returns
+    -------
+    str
+        Format string in Apache Arrow C notation of the given `dtype`.
+    """
+    if isinstance(dtype, pd.CategoricalDtype):
+        return ArrowCTypes.INT64
+    elif dtype == np.dtype("O"):
+        return ArrowCTypes.STRING
+
+    format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
+    if format_str is not None:
+        return format_str
+
+    if is_datetime64_dtype(dtype):
+        # Selecting the first char of resolution string:
+        # dtype.str -> '<M8[ns]'
+        resolution = re.findall(r"\[(.*)\]", dtype.str)[0][:1]
+        return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")
+
+    raise NotImplementedError(
+        f"Convertion of {dtype} to Arrow C format string is not implemented."
+    )
+
+
+def raise_copy_alert(copy_reason=None):
+    """
+    Raise a ``RuntimeError`` mentioning that there's a copy required.
+
+    Parameters
+    ----------
+    copy_reason : str, optional
+        The reason of making a copy. Should fit to the following format:
+        'The copy occured due to {copy_reason}.'.
+    """
+    msg = "Copy required but 'allow_copy=False' is set."
+    if copy_reason:
+        msg += f" The copy occured due to {copy_reason}."
+    raise RuntimeError(msg)

From: Vasily Litvinov
Date: Thu, 31 Mar 2022 17:04:39 +0300
Subject: [PATCH 23/49] Make spec tests pass

Signed-off-by: Vasily Litvinov
---
 pandas/core/exchange/buffer.py                     | 2 +-
 pandas/core/exchange/column.py                     | 2 +-
 .../{test_protocol.py => test_spec_conformance.py} | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename pandas/tests/exchange/{test_protocol.py => test_spec_conformance.py} (95%)

diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py
index e9d14852eedf1..fdc48c7eca1a7 100644
--- a/pandas/core/exchange/buffer.py
+++ b/pandas/core/exchange/buffer.py
@@ -1,4 +1,4 @@
-from pandas.core.exchange.dataframe_protocol import Buffer, DlpackDeviceType, DtypeKind
+from pandas.core.exchange.dataframe_protocol import Buffer, DlpackDeviceType
 import numpy as np
 from typing import Tuple
diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py
index 8dae3a1f13662..0d0e522917a75 100644
--- a/pandas/core/exchange/column.py
+++ b/pandas/core/exchange/column.py
@@ -4,7 +4,7 @@
     Buffer,
     ColumnNullType,
 )
-from pandas.core.exchange.buffer import PandasBuffer, buffer_to_ndarray
+from pandas.core.exchange.buffer import PandasBuffer
 from pandas.core.exchange.utils import ArrowCTypes, Endianness, dtype_to_arrow_c_fmt
 from pandas.api.types import is_categorical_dtype, is_string_dtype
 import pandas as pd
diff --git a/pandas/tests/exchange/test_protocol.py b/pandas/tests/exchange/test_spec_conformance.py
similarity index 95%
rename from pandas/tests/exchange/test_protocol.py
rename to pandas/tests/exchange/test_spec_conformance.py
index 22111474f18e7..10d6614ba3e12 100644
---
a/pandas/tests/exchange/test_protocol.py +++ b/pandas/tests/exchange/test_spec_conformance.py @@ -74,9 +74,9 @@ def test_categorical(df_from_dict): ) colX = df.__dataframe__().get_column_by_name("weekday") - is_ordered, is_dictionary, _ = colX.describe_categorical - assert isinstance(is_ordered, bool) - assert isinstance(is_dictionary, bool) + categorical = colX.describe_categorical + assert isinstance(categorical["is_ordered"], bool) + assert isinstance(categorical["is_dictionary"], bool) def test_dataframe(df_from_dict): @@ -134,7 +134,7 @@ def test_buffer(df_from_dict): assert dataBuf.bufsize > 0 assert dataBuf.ptr != 0 - device, _ = dataBuf.__dlpack_device__ + device, _ = dataBuf.__dlpack_device__() assert dataDtype[0] == 0 From edefc8f41c6d4d106927d1e44bf950af8e2021fb Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 17:10:35 +0300 Subject: [PATCH 24/49] Add tests for dtype_to_arrow_c_fmt Signed-off-by: Vasily Litvinov --- pandas/core/exchange/utils.py | 4 +--- pandas/tests/exchange/test_utils.py | 35 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/exchange/test_utils.py diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py index a60a26aabbb39..1e6378ff76024 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/exchange/utils.py @@ -2,15 +2,13 @@ Utility functions and objects for implementing the exchange API. """ -import enum import pandas as pd import re import numpy as np from pandas.api.types import is_datetime64_dtype -@enum.unique -class ArrowCTypes(enum.Enum): +class ArrowCTypes: """ Enum for Apache Arrow C type format strings. diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/exchange/test_utils.py new file mode 100644 index 0000000000000..26edbffe9a8af --- /dev/null +++ b/pandas/tests/exchange/test_utils.py @@ -0,0 +1,35 @@ +import pandas as pd +import numpy as np +import pytest + +from pandas.core.exchange.utils import dtype_to_arrow_c_fmt + +# TODO: use ArrowSchema to get reference C-string. +# At the time, there is no way to access ArrowSchema holding a type format string from python. 
+# The only way to 'touch' it is to export the structure to a C-pointer: +# https://github.com/apache/arrow/blob/5680d209fd870f99134e2d7299b47acd90fabb8e/python/pyarrow/types.pxi#L230-L239 +@pytest.mark.parametrize( + "pandas_dtype, c_string", + [ + (np.dtype("bool"), "b"), + (np.dtype("int8"), "c"), + (np.dtype("uint8"), "C"), + (np.dtype("int16"), "s"), + (np.dtype("uint16"), "S"), + (np.dtype("int32"), "i"), + (np.dtype("uint32"), "I"), + (np.dtype("int64"), "l"), + (np.dtype("uint64"), "L"), + (np.dtype("float16"), "e"), + (np.dtype("float32"), "f"), + (np.dtype("float64"), "g"), + (pd.Series(["a"]).dtype, "u"), + ( + pd.Series([0]).astype("datetime64[ns]").dtype, + "tsn:", + ), + ], +) +def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # noqa PR01 + """Test ``dtype_to_arrow_c_fmt`` utility function.""" + assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string From 7144cf243def0e25d1d5c1de959aaf8cd498eb29 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 18:33:34 +0300 Subject: [PATCH 25/49] Fix test declarations, some impl bugs remain Signed-off-by: Vasily Litvinov --- pandas/core/exchange/dataframe.py | 3 + pandas/core/exchange/dataframe_protocol.py | 18 +- pandas/tests/exchange/test_impl.py | 191 +++++++++++++++++++++ 3 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/exchange/test_impl.py diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/exchange/dataframe.py index 14e07fc2fafed..4e7edf753c901 100644 --- a/pandas/core/exchange/dataframe.py +++ b/pandas/core/exchange/dataframe.py @@ -28,6 +28,9 @@ def __init__( self._nan_as_null = nan_as_null self._allow_copy = allow_copy + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + return PandasDataFrameXchg(self._df, nan_as_null, allow_copy) + @property def metadata(self): # `index` isn't a regular column, and the protocol doesn't support row diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py index ad31e7d8b6653..964424ec7de16 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -93,6 +93,15 @@ class ColumnBuffers(TypedDict): offsets: Optional[Tuple["Buffer", Any]] +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. + mapping: Optional[dict] + + class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -250,7 +259,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: @property @abstractmethod - def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]: + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. @@ -258,7 +267,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]: Raises TypeError if the dtype is not categorical - Returns the description on how to interpret the data buffer: + Returns the dictionary with description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. 
- "is_dictionary" : bool, whether a dictionary-style mapping of @@ -367,6 +376,11 @@ class DataFrame(ABC): version = 0 # version of the protocol + @abstractmethod + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """Construct a new exchange object, potentially changing the parameters.""" + pass + @property @abstractmethod def metadata(self) -> Dict[str, Any]: diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py new file mode 100644 index 0000000000000..94b35c999ee3e --- /dev/null +++ b/pandas/tests/exchange/test_impl.py @@ -0,0 +1,191 @@ +import pandas as pd +import numpy as np +import pytest +import random + +from pandas.testing import assert_frame_equal +from pandas.core.exchange.dataframe_protocol import DtypeKind, ColumnNullType +from pandas.core.exchange.from_dataframe import from_dataframe + +test_data_categorical = { + "ordered": pd.Categorical(list("testdata") * 30, ordered=True), + "unordered": pd.Categorical(list("testdata") * 30, ordered=False), +} + +NCOLS, NROWS = 100, 200 + +int_data = { + "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + random.randint(0, 100) for _ in range(NROWS) + ] + for i in range(NCOLS) +} + +bool_data = { + "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + random.choice([True, False]) for _ in range(NROWS) + ] + for i in range(NCOLS) +} + +float_data = { + "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + random.random() for _ in range(NROWS) + ] + for i in range(NCOLS) +} + +string_data = { + "separator data": [ + "abC|DeF,Hik", + "234,3245.67", + "gSaf,qWer|Gre", + "asd3,4sad|", + np.NaN, + ] +} + + +@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)]) +def test_categorical_dtype(data): + df = pd.DataFrame({"A": (test_data_categorical[data[0]])}) + + col = df.__dataframe__().get_column_by_name("A") + assert col.dtype[0] == DtypeKind.CATEGORICAL + assert col.null_count == 0 + assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1) + assert col.num_chunks() == 1 + assert col.describe_categorical == { + "is_ordered": data[1], + "is_dictionary": True, + "mapping": {4: "s", 2: "d", 3: "e", 1: "t"}, + } + + assert assert_frame_equal(df, from_dataframe(df.__dataframe__())) + + +@pytest.mark.parametrize("data", [int_data, float_data, bool_data]) +def test_dataframe(data): + df = pd.DataFrame(data) + + df2 = df.__dataframe__() + + assert df2._allow_copy is True + assert df2.num_columns() == NCOLS + assert df2.num_rows() == NROWS + + assert list(df2.column_names()) == list(data.keys()) + + assert assert_frame_equal( + from_dataframe(df2.select_columns((0, 2))), + from_dataframe(df2.select_columns_by_name(("col33", "col35"))), + ) + assert assert_frame_equal( + from_dataframe(df2.select_columns((0, 2))), + from_dataframe(df2.select_columns_by_name(("col33", "col35"))), + ) + + +def test_missing_from_masked(): + df = pd.DataFrame( + { + "x": np.array([1, 2, 3, 4, 0]), + "y": np.array([1.5, 2.5, 3.5, 4.5, 0]), + "z": np.array([True, False, True, True, True]), + } + ) + + df2 = df.__dataframe__() + + # for col_name in df.columns: + # assert convert_column_to_array(df2.get_column_by_name(col_name) == df[col_name].tolist() + # assert df[col_name].dtype == convert_column_to_array(df2.get_column_by_name(col_name)).dtype + + rng = np.random.RandomState(42) + dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns} + for col, num_nulls in dict_null.items(): + null_idx = df.index[ + rng.choice(np.arange(len(df)), size=num_nulls, replace=False) + ] + 
df.loc[null_idx, col] = None + + df2 = df.__dataframe__() + + assert df2.get_column_by_name("x").null_count == dict_null["x"] + assert df2.get_column_by_name("y").null_count == dict_null["y"] + assert df2.get_column_by_name("z").null_count == dict_null["z"] + + +@pytest.mark.parametrize( + "data", + [ + {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]}, + {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]}, + { + "x": np.array([True, True, False]), + "y": np.array([1, 2, 0]), + "z": np.array([9.2, 10.5, 11.8]), + }, + ], +) +def test_mixed_data(data): + df = pd.DataFrame(data) + df2 = df.__dataframe__() + + for col_name in df.columns: + assert df2.get_column_by_name(col_name).null_count == 0 + + +def test_mixed_missing(): + df = pd.DataFrame( + { + "x": np.array([True, None, False, None, True]), + "y": np.array([None, 2, None, 1, 2]), + "z": np.array([9.2, 10.5, None, 11.8, None]), + } + ) + + df2 = df.__dataframe__() + + for col_name in df.columns: + assert df2.get_column_by_name(col_name).null_count == 2 + + +def test_select_columns_error(): + df = pd.DataFrame(int_data) + + df2 = df.__dataframe__() + + with pytest.raises(ValueError): + assert from_dataframe(df2.select_columns(np.array([0, 2]))) == from_dataframe( + df2.select_columns_by_name(("col33", "col35")) + ) + + +def test_select_columns_by_name_error(): + df = pd.DataFrame(int_data) + + df2 = df.__dataframe__() + + with pytest.raises(ValueError): + assert from_dataframe( + df2.select_columns_by_name(np.array(["col33", "col35"])) + ) == from_dataframe(df2.select_columns((0, 2))) + + +def test_string(): + test_str_data = string_data["separator data"] + [""] + df = pd.DataFrame({"A": test_str_data}) + col = df.__dataframe__().get_column_by_name("A") + + assert col.size == 6 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.STRING + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + + df_sliced = df[1:] + col = df_sliced.__dataframe__().get_column_by_name("A") + assert col.size == 5 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.STRING + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) From 0dc1e584b6ceced739bc000a76a4142c62f989b5 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 19:00:32 +0300 Subject: [PATCH 26/49] Fix .describe_categoricals and some tests Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 2 +- pandas/tests/exchange/test_impl.py | 21 ++++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 0d0e522917a75..8d062e8e9b10f 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -137,7 +137,7 @@ def describe_categorical(self): return { "is_ordered": self._col.cat.ordered, "is_dictionary": True, - "mapping": dict(zip(self._col.cat.codes, self._col.cat.categories)), + "mapping": dict(enumerate(self._col.cat.categories)), } @property diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index 94b35c999ee3e..c5d5e1b9a1e74 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -58,10 +58,10 @@ def test_categorical_dtype(data): assert col.describe_categorical == { "is_ordered": data[1], "is_dictionary": True, - "mapping": {4: "s", 2: "d", 3: "e", 1: "t"}, + "mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"}, } - assert assert_frame_equal(df, from_dataframe(df.__dataframe__())) + assert_frame_equal(df, from_dataframe(df.__dataframe__())) 
@pytest.mark.parametrize("data", [int_data, float_data, bool_data]) @@ -76,13 +76,12 @@ def test_dataframe(data): assert list(df2.column_names()) == list(data.keys()) - assert assert_frame_equal( - from_dataframe(df2.select_columns((0, 2))), - from_dataframe(df2.select_columns_by_name(("col33", "col35"))), - ) - assert assert_frame_equal( - from_dataframe(df2.select_columns((0, 2))), - from_dataframe(df2.select_columns_by_name(("col33", "col35"))), + indices = (0, 2) + names = tuple(list(data.keys())[idx] for idx in indices) + + assert_frame_equal( + from_dataframe(df2.select_columns(indices)), + from_dataframe(df2.select_columns_by_name(names)), ) @@ -97,10 +96,6 @@ def test_missing_from_masked(): df2 = df.__dataframe__() - # for col_name in df.columns: - # assert convert_column_to_array(df2.get_column_by_name(col_name) == df[col_name].tolist() - # assert df[col_name].dtype == convert_column_to_array(df2.get_column_by_name(col_name)).dtype - rng = np.random.RandomState(42) dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): From 0f7c6548eec92c1aa83c884036df36bbe3655a72 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 19:17:37 +0300 Subject: [PATCH 27/49] Auto-fix some pre-commit checks Signed-off-by: Vasily Litvinov --- pandas/api/exchange/__init__.py | 2 +- pandas/core/exchange/buffer.py | 9 ++++-- pandas/core/exchange/column.py | 30 +++++++++++++------ pandas/core/exchange/dataframe.py | 7 +++-- pandas/core/exchange/dataframe_protocol.py | 15 ++++++++-- pandas/core/exchange/from_dataframe.py | 24 ++++++++++----- pandas/core/exchange/utils.py | 4 ++- pandas/core/frame.py | 3 +- pandas/tests/exchange/conftest.py | 9 +----- pandas/tests/exchange/test_impl.py | 18 ++++++----- .../tests/exchange/test_spec_conformance.py | 5 ++-- pandas/tests/exchange/test_utils.py | 5 ++-- 12 files changed, 86 insertions(+), 45 deletions(-) diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py index f81c3a90d3506..6760d81f60ac7 100644 --- a/pandas/api/exchange/__init__.py +++ b/pandas/api/exchange/__init__.py @@ -2,7 +2,7 @@ Public API for DataFrame exchange protocol. 
""" -from pandas.core.exchange.from_dataframe import from_dataframe from pandas.core.exchange.dataframe_protocol import DataFrame +from pandas.core.exchange.from_dataframe import from_dataframe __all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py index fdc48c7eca1a7..d527a540de371 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/exchange/buffer.py @@ -1,7 +1,12 @@ -from pandas.core.exchange.dataframe_protocol import Buffer, DlpackDeviceType -import numpy as np from typing import Tuple +import numpy as np + +from pandas.core.exchange.dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) + class PandasBuffer(Buffer): """ diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 8d062e8e9b10f..190f2b04097b2 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -1,16 +1,28 @@ +from functools import cached_property +from typing import ( + Any, + Tuple, +) + +import numpy as np + +import pandas as pd +from pandas.api.types import ( + is_categorical_dtype, + is_string_dtype, +) +from pandas.core.exchange.buffer import PandasBuffer from pandas.core.exchange.dataframe_protocol import ( - Column, - DtypeKind, Buffer, + Column, ColumnNullType, + DtypeKind, +) +from pandas.core.exchange.utils import ( + ArrowCTypes, + Endianness, + dtype_to_arrow_c_fmt, ) -from pandas.core.exchange.buffer import PandasBuffer -from pandas.core.exchange.utils import ArrowCTypes, Endianness, dtype_to_arrow_c_fmt -from pandas.api.types import is_categorical_dtype, is_string_dtype -import pandas as pd -import numpy as np -from typing import Tuple, Any -from functools import cached_property _NP_KINDS = { "i": DtypeKind.INT, diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/exchange/dataframe.py index 4e7edf753c901..76e1e16c921b8 100644 --- a/pandas/core/exchange/dataframe.py +++ b/pandas/core/exchange/dataframe.py @@ -1,8 +1,9 @@ -from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg -from pandas.core.exchange.column import PandasColumn -import pandas as pd import collections.abc +import pandas as pd +from pandas.core.exchange.column import PandasColumn +from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg + class PandasDataFrameXchg(DataFrameXchg): """ diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py index 964424ec7de16..acf8b0723cf4a 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -2,9 +2,20 @@ A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api """ -from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict +from abc import ( + ABC, + abstractmethod, +) import enum -from abc import ABC, abstractmethod +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) class DlpackDeviceType(enum.IntEnum): diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py index 09bedd8005567..94ad58fed55c2 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/exchange/from_dataframe.py @@ -1,16 +1,26 @@ +import ctypes +import re +from typing import ( + Any, + Optional, + Tuple, + Union, +) + +import numpy as np + +import pandas as pd from pandas.core.exchange.dataframe_protocol import ( Buffer, Column, + ColumnNullType, DataFrame, DtypeKind, - ColumnNullType, ) -from 
pandas.core.exchange.utils import ArrowCTypes, Endianness -import numpy as np -import pandas as pd -import ctypes -import re -from typing import Tuple, Optional, Any, Union +from pandas.core.exchange.utils import ( + ArrowCTypes, + Endianness, +) _NP_DTYPES = { DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py index 1e6378ff76024..83ecabd4cbffd 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/exchange/utils.py @@ -2,9 +2,11 @@ Utility functions and objects for implementing the exchange API. """ -import pandas as pd import re + import numpy as np + +import pandas as pd from pandas.api.types import is_datetime64_dtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5eb37b6ded3d5..1a994154ab332 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -218,10 +218,11 @@ if TYPE_CHECKING: + from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.internals import SingleDataManager from pandas.core.resample import Resampler - from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- diff --git a/pandas/tests/exchange/conftest.py b/pandas/tests/exchange/conftest.py index 5773ce8114548..42fdc7bca28d4 100644 --- a/pandas/tests/exchange/conftest.py +++ b/pandas/tests/exchange/conftest.py @@ -1,4 +1,5 @@ import pytest + import pandas as pd from pandas.core.exchange.from_dataframe import _from_dataframe @@ -10,11 +11,3 @@ def maker(dct, is_categorical=False): return df.astype("category") if is_categorical else df return maker - - -@pytest.fixture(scope="package") -def df_from_xchg(): - def maker(xchg): - return _from_dataframe(xchg) - - return maker diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index c5d5e1b9a1e74..6aede877cde04 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -1,11 +1,15 @@ -import pandas as pd +import random + import numpy as np import pytest -import random -from pandas.testing import assert_frame_equal -from pandas.core.exchange.dataframe_protocol import DtypeKind, ColumnNullType +import pandas as pd +from pandas.core.exchange.dataframe_protocol import ( + ColumnNullType, + DtypeKind, +) from pandas.core.exchange.from_dataframe import from_dataframe +from pandas.testing import assert_frame_equal test_data_categorical = { "ordered": pd.Categorical(list("testdata") * 30, ordered=True), @@ -15,21 +19,21 @@ NCOLS, NROWS = 100, 200 int_data = { - "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ random.randint(0, 100) for _ in range(NROWS) ] for i in range(NCOLS) } bool_data = { - "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ random.choice([True, False]) for _ in range(NROWS) ] for i in range(NCOLS) } float_data = { - "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ random.random() for _ in range(NROWS) ] for i in range(NCOLS) diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py index 10d6614ba3e12..7b29db9c57ba0 100644 --- a/pandas/tests/exchange/test_spec_conformance.py +++ b/pandas/tests/exchange/test_spec_conformance.py @@ -1,9 +1,10 @@ """ A 
verbatim copy (vendored) of the spec tests from https://github.com/data-apis/dataframe-api """ -import pytest -import math import ctypes +import math + +import pytest @pytest.mark.parametrize( diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/exchange/test_utils.py index 26edbffe9a8af..bd938c4464095 100644 --- a/pandas/tests/exchange/test_utils.py +++ b/pandas/tests/exchange/test_utils.py @@ -1,9 +1,10 @@ -import pandas as pd import numpy as np import pytest +import pandas as pd from pandas.core.exchange.utils import dtype_to_arrow_c_fmt + # TODO: use ArrowSchema to get reference C-string. # At the time, there is no way to access ArrowSchema holding a type format string from python. # The only way to 'touch' it is to export the structure to a C-pointer: @@ -30,6 +31,6 @@ ), ], ) -def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # noqa PR01 +def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # PR01 """Test ``dtype_to_arrow_c_fmt`` utility function.""" assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string From 522a66a3dcd42be6be90bb8011b728b309d8953f Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 19:41:49 +0300 Subject: [PATCH 28/49] Fix more issues found by commit checks Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 44 +++++++------ pandas/core/exchange/dataframe.py | 6 +- pandas/core/exchange/dataframe_protocol.py | 3 +- pandas/core/exchange/from_dataframe.py | 61 +++++++++++-------- pandas/core/exchange/utils.py | 20 ++---- pandas/tests/exchange/conftest.py | 1 - pandas/tests/exchange/test_impl.py | 26 ++++---- .../tests/exchange/test_spec_conformance.py | 5 +- pandas/tests/exchange/test_utils.py | 10 +-- 9 files changed, 92 insertions(+), 84 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 190f2b04097b2..c73bf48d577dc 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -13,7 +13,6 @@ ) from pandas.core.exchange.buffer import PandasBuffer from pandas.core.exchange.dataframe_protocol import ( - Buffer, Column, ColumnNullType, DtypeKind, @@ -21,6 +20,7 @@ from pandas.core.exchange.utils import ( ArrowCTypes, Endianness, + NoBufferPresent, dtype_to_arrow_c_fmt, ) @@ -142,8 +142,7 @@ def describe_categorical(self): """ if not self.dtype[0] == DtypeKind.CATEGORICAL: raise TypeError( - "`describe_categorical only works on a column with " - "categorical dtype!" + "describe_categorical only works on a column with categorical dtype!" ) return { @@ -222,12 +221,12 @@ def get_buffers(self): buffers["data"] = self._get_data_buffer() try: buffers["validity"] = self._get_validity_buffer() - except: + except NoBufferPresent: buffers["validity"] = None try: buffers["offsets"] = self._get_offsets_buffer() - except: + except NoBufferPresent: buffers["offsets"] = None return buffers @@ -260,7 +259,8 @@ def _get_data_buffer( if isinstance(obj, str): b.extend(obj.encode(encoding="utf-8")) - # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store + # Convert the byte array to a Pandas "buffer" using + # a NumPy array as the backing store buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer @@ -279,12 +279,13 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. - Raises RuntimeError if null representation is not a bit or byte mask. 
+ Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null if self.dtype[0] == DtypeKind.STRING: - # For now, have the mask array be comprised of bytes, rather than a bit array + # For now, use byte array as the mask. + # TODO: maybe store as bit array to save space?.. buf = self._col.to_numpy() mask = [] @@ -293,7 +294,8 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: mask = [valid if isinstance(obj, str) else invalid for obj in buf] - # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store + # Convert the mask array to a Pandas "buffer" using + # a NumPy array as the backing store buffer = PandasBuffer(np.asarray(mask, dtype="uint8")) # Define the dtype of the returned buffer @@ -301,20 +303,21 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: return buffer, dtype - if null == 0: + if null == ColumnNullType.NON_NULLABLE: msg = "This column is non-nullable so does not have a mask" - elif null == 1: + elif null == ColumnNullType.USE_NAN: msg = "This column uses NaN as null so does not have a separate mask" else: + # TODO: implement for other bit/byte masks? raise NotImplementedError("See self.describe_null") - raise RuntimeError(msg) + raise NoBufferPresent(msg) def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. - Raises RuntimeError if the data buffer does not have an associated + Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. """ if self.dtype[0] == DtypeKind.STRING: @@ -323,17 +326,21 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: ptr = 0 offsets = [ptr] + [None] * len(values) for i, v in enumerate(values): - # For missing values (in this case, `np.nan` values), we don't increment the pointer) + # For missing values (in this case, `np.nan` values) + # we don't increment the pointer if isinstance(v, str): b = v.encode(encoding="utf-8") ptr += len(b) offsets[i + 1] = ptr - # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + # Convert the list of offsets to a NumPy array of signed 64-bit integers + # (note: Arrow allows the offsets array to be either `int32` or `int64`; + # here, we default to the latter) buf = np.asarray(offsets, dtype="int64") - # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + # Convert the offsets to a Pandas "buffer" using + # the NumPy array as the backing store buffer = PandasBuffer(buf) # Assemble the buffer dtype info @@ -344,8 +351,9 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: Endianness.NATIVE, ) # note: currently only support native endianness else: - raise RuntimeError( - "This column has a fixed-length dtype so does not have an offsets buffer" + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" ) return buffer, dtype diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/exchange/dataframe.py index 76e1e16c921b8..18e275604ff96 100644 --- a/pandas/core/exchange/dataframe.py +++ b/pandas/core/exchange/dataframe.py @@ -1,4 +1,4 @@ -import collections.abc +from collections import abc import pandas as pd from pandas.core.exchange.column import PandasColumn @@ -63,7 +63,7 @@ def get_columns(self): ] def 
select_columns(self, indices): - if not isinstance(indices, collections.abc.Sequence): + if not isinstance(indices, abc.Sequence): raise ValueError("`indices` is not a sequence") if not isinstance(indices, list): indices = list(indices) @@ -73,7 +73,7 @@ def select_columns(self, indices): ) def select_columns_by_name(self, names): - if not isinstance(names, collections.abc.Sequence): + if not isinstance(names, abc.Sequence): raise ValueError("`names` is not a sequence") if not isinstance(names, list): names = list(names) diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py index acf8b0723cf4a..667e856c200e7 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -109,7 +109,8 @@ class CategoricalDescription(TypedDict): is_ordered: bool # whether a dictionary-style mapping of categorical values to other objects exists is_dictionary: bool - # Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. mapping: Optional[dict] diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py index 94ad58fed55c2..111fa68ec6029 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/exchange/from_dataframe.py @@ -14,7 +14,7 @@ Buffer, Column, ColumnNullType, - DataFrame, + DataFrame as DataFrameXchg, DtypeKind, ) from pandas.core.exchange.utils import ( @@ -40,13 +40,13 @@ def from_dataframe(df, allow_copy=True): return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df: DataFrame, allow_copy=True): +def _from_dataframe(df: DataFrameXchg, allow_copy=True): """ - Build a ``pd.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. + Build a ``pd.DataFrame`` from the DataFrame exchange object. Parameters ---------- - df : DataFrame + df : DataFrameXchg Object supporting the exchange protocol, i.e. `__dataframe__` method. n_chunks : int, optional Number of chunks to split `df`. @@ -76,13 +76,13 @@ def _from_dataframe(df: DataFrame, allow_copy=True): return pandas_df -def protocol_df_chunk_to_pandas(df): +def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: """ Convert exchange protocol chunk to ``pd.DataFrame``. Parameters ---------- - df : DataFrame + df : DataFrameXchg Returns ------- @@ -90,7 +90,7 @@ def protocol_df_chunk_to_pandas(df): """ # We need a dict of columns here, with each column being a NumPy array (at # least for now, deal with non-NumPy dtypes later). - columns = dict() + columns = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): if not isinstance(name, str): @@ -124,7 +124,9 @@ def protocol_df_chunk_to_pandas(df): def primitive_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: """ - Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. + Convert a column holding one of the primitive dtypes to a NumPy array. + + A primitive type is one of: int, uint, float, bool. Parameters ---------- @@ -133,7 +135,8 @@ def primitive_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. 
""" buffers = col.get_buffers() @@ -155,7 +158,8 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: Returns ------- tuple - Tuple of pd.Series holding the data and the memory owner object that keeps the memory alive. + Tuple of pd.Series holding the data and the memory owner object + that keeps the memory alive. """ ordered, is_dict, mapping = col.describe_categorical.values() @@ -168,7 +172,8 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: codes_buff, codes_dtype = buffers["data"] codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size) - # Doing module in order to not get ``IndexError`` for out-of-bounds sentinel values in `codes` + # Doing module in order to not get ``IndexError`` for + # out-of-bounds sentinel values in `codes` values = categories[codes % len(categories)] cat = pd.Categorical(values, categories=categories, ordered=ordered) @@ -189,7 +194,8 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. """ null_kind, sentinel_val = col.describe_null @@ -206,11 +212,11 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: # Retrieve the data buffer containing the UTF-8 code units data_buff, protocol_data_dtype = buffers["data"] - # We're going to reinterpret the buffer as uint8, so making sure we can do it safely + # We're going to reinterpret the buffer as uint8, so make sure we can do it safely assert protocol_data_dtype[1] == 8 # bitwidth == 8 assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8 - # Convert the buffers to NumPy arrays, in order to go from STRING to an equivalent ndarray, - # we claim that the buffer is uint8 (i.e., a byte array) + # Convert the buffers to NumPy arrays. In order to go from STRING to + # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) data_dtype = ( DtypeKind.UINT, 8, @@ -220,7 +226,8 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and the ending of each string offset_buff, offset_dtype = buffers["offsets"] # Offsets buffer contains start-stop positions of strings in the data buffer, # meaning that it has more elements than in the data buffer, do `col.size + 1` here @@ -269,8 +276,9 @@ def parse_datetime_format_str(format_str, data): if tz != "": raise NotImplementedError("Timezones are not supported yet") if unit != "s": - # the format string describes only a first letter of the unit, add one extra - # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + # the format string describes only a first letter of the unit, so + # add one extra letter to convert the unit to numpy-style: + # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' unit += "s" data = data.astype(f"datetime64[{unit}]") return data @@ -303,7 +311,8 @@ def datetime_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. 
+ Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. """ buffers = col.get_buffers() @@ -354,14 +363,15 @@ def buffer_to_ndarray( Notes ----- - The returned array doesn't own the memory. A user of the function must keep the memory - owner object alive as long as the returned NumPy array is being used. + The returned array doesn't own the memory. The caller of this function is + responsible for keeping the memory owner object alive as long as + the returned NumPy array is being used. """ kind, bit_width, _, _ = dtype column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None) if column_dtype is None: - raise NotImplementedError(f"Convertion for {dtype} is not yet supported.") + raise NotImplementedError(f"Conversion for {dtype} is not yet supported.") # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports @@ -406,7 +416,7 @@ def bitmask_to_bool_ndarray( bool_mask = np.zeros(mask_length, dtype=bool) - # Proccessing the first byte separately as it has its own offset + # Processing the first byte separately as it has its own offset val = bitmask[0] mask_idx = 0 bits_in_first_byte = min(8 - first_byte_offset, mask_length) @@ -483,8 +493,9 @@ def set_nulls( try: data[null_pos] = None except TypeError: - # TypeError happens if the `data` dtype appears to be non-nullable in numpy notation - # (bool, int, uint), if such happens, cast the `data` to nullable float dtype. + # TypeError happens if the `data` dtype appears to be non-nullable + # in numpy notation (bool, int, uint). If this happens, + # cast the `data` to nullable float dtype. data = data.astype(float) data[null_pos] = None diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py index 83ecabd4cbffd..b3a89a1b1f484 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/exchange/utils.py @@ -36,7 +36,7 @@ class ArrowCTypes: DATE64 = "tdm" # Resoulution: # - seconds -> 's' - # - miliseconds -> 'm' + # - milliseconds -> 'm' # - microseconds -> 'u' # - nanoseconds -> 'n' TIMESTAMP = "ts{resolution}:{tz}" @@ -82,21 +82,9 @@ def dtype_to_arrow_c_fmt(dtype) -> str: return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="") raise NotImplementedError( - f"Convertion of {dtype} to Arrow C format string is not implemented." + f"Conversion of {dtype} to Arrow C format string is not implemented." ) -def raise_copy_alert(copy_reason=None): - """ - Raise a ``RuntimeError`` mentioning that there's a copy required. - - Parameters - ---------- - copy_reason : str, optional - The reason of making a copy. Should fit to the following format: - 'The copy occured due to {copy_reason}.'. - """ - msg = "Copy required but 'allow_copy=False' is set." - if copy_reason: - msg += f" The copy occured due to {copy_reason}." 
- raise RuntimeError(msg) +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" diff --git a/pandas/tests/exchange/conftest.py b/pandas/tests/exchange/conftest.py index 42fdc7bca28d4..033f44984b551 100644 --- a/pandas/tests/exchange/conftest.py +++ b/pandas/tests/exchange/conftest.py @@ -1,7 +1,6 @@ import pytest import pandas as pd -from pandas.core.exchange.from_dataframe import _from_dataframe @pytest.fixture(scope="package") diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index 6aede877cde04..e4dbf14164961 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -4,12 +4,12 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.exchange.dataframe_protocol import ( ColumnNullType, DtypeKind, ) from pandas.core.exchange.from_dataframe import from_dataframe -from pandas.testing import assert_frame_equal test_data_categorical = { "ordered": pd.Categorical(list("testdata") * 30, ordered=True), @@ -33,9 +33,7 @@ } float_data = { - f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ - random.random() for _ in range(NROWS) - ] + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [random.random() for _ in range(NROWS)] for i in range(NCOLS) } @@ -65,7 +63,7 @@ def test_categorical_dtype(data): "mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"}, } - assert_frame_equal(df, from_dataframe(df.__dataframe__())) + tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) @pytest.mark.parametrize("data", [int_data, float_data, bool_data]) @@ -83,7 +81,7 @@ def test_dataframe(data): indices = (0, 2) names = tuple(list(data.keys())[idx] for idx in indices) - assert_frame_equal( + tm.assert_frame_equal( from_dataframe(df2.select_columns(indices)), from_dataframe(df2.select_columns_by_name(names)), ) @@ -155,10 +153,10 @@ def test_select_columns_error(): df2 = df.__dataframe__() - with pytest.raises(ValueError): - assert from_dataframe(df2.select_columns(np.array([0, 2]))) == from_dataframe( - df2.select_columns_by_name(("col33", "col35")) - ) + # with pytest.raises(ValueError): + assert from_dataframe(df2.select_columns(np.array([0, 2]))) == from_dataframe( + df2.select_columns_by_name(("col33", "col35")) + ) def test_select_columns_by_name_error(): @@ -166,10 +164,10 @@ def test_select_columns_by_name_error(): df2 = df.__dataframe__() - with pytest.raises(ValueError): - assert from_dataframe( - df2.select_columns_by_name(np.array(["col33", "col35"])) - ) == from_dataframe(df2.select_columns((0, 2))) + # with pytest.raises(ValueError): + assert from_dataframe( + df2.select_columns_by_name(np.array(["col33", "col35"])) + ) == from_dataframe(df2.select_columns((0, 2))) def test_string(): diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py index 7b29db9c57ba0..3bce96982b155 100644 --- a/pandas/tests/exchange/test_spec_conformance.py +++ b/pandas/tests/exchange/test_spec_conformance.py @@ -1,5 +1,6 @@ """ -A verbatim copy (vendored) of the spec tests from https://github.com/data-apis/dataframe-api +A verbatim copy (vendored) of the spec tests. 
+Taken from https://github.com/data-apis/dataframe-api """ import ctypes import math @@ -64,7 +65,7 @@ def test_noncategorical(df_from_dict): df = df_from_dict({"a": [1, 2, 3]}) dfX = df.__dataframe__() colX = dfX.get_column_by_name("a") - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=".*categorical.*"): colX.describe_categorical diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/exchange/test_utils.py index bd938c4464095..a1341ef57feec 100644 --- a/pandas/tests/exchange/test_utils.py +++ b/pandas/tests/exchange/test_utils.py @@ -4,11 +4,13 @@ import pandas as pd from pandas.core.exchange.utils import dtype_to_arrow_c_fmt - # TODO: use ArrowSchema to get reference C-string. -# At the time, there is no way to access ArrowSchema holding a type format string from python. -# The only way to 'touch' it is to export the structure to a C-pointer: -# https://github.com/apache/arrow/blob/5680d209fd870f99134e2d7299b47acd90fabb8e/python/pyarrow/types.pxi#L230-L239 +# At the time, there is no way to access ArrowSchema holding a type format string +# from python. The only way to access it is to export the structure to a C-pointer, +# see DataType._export_to_c() method defined in +# https://github.com/apache/arrow/blob/master/python/pyarrow/types.pxi + + @pytest.mark.parametrize( "pandas_dtype, c_string", [ From 15253201d1b482dbafd4ddb5fe2ae9951ad8a46d Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 20:11:59 +0300 Subject: [PATCH 29/49] Fix categorical-related test failures Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 14 +++++++++----- pandas/tests/exchange/test_impl.py | 10 +++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index c73bf48d577dc..785f7f02fbbe1 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -47,6 +47,12 @@ DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0), } +_NO_VALIDITY_BUFFER = { + ColumnNullType.NON_NULLABLE: "This column is non-nullable", + ColumnNullType.USE_NAN: "This column uses NaN as null", + ColumnNullType.USE_SENTINEL: "This column uses a sentinel value", +} + class PandasColumn(Column): """ @@ -303,11 +309,9 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: return buffer, dtype - if null == ColumnNullType.NON_NULLABLE: - msg = "This column is non-nullable so does not have a mask" - elif null == ColumnNullType.USE_NAN: - msg = "This column uses NaN as null so does not have a separate mask" - else: + try: + msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask" + except KeyError: # TODO: implement for other bit/byte masks? 
             raise NotImplementedError("See self.describe_null")

diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py
index e4dbf14164961..c40421da44709 100644
--- a/pandas/tests/exchange/test_impl.py
+++ b/pandas/tests/exchange/test_impl.py
@@ -153,10 +153,8 @@ def test_select_columns_error():
 
     df2 = df.__dataframe__()
 
-    # with pytest.raises(ValueError):
-    assert from_dataframe(df2.select_columns(np.array([0, 2]))) == from_dataframe(
-        df2.select_columns_by_name(("col33", "col35"))
-    )
+    with pytest.raises(ValueError, match="is not a sequence"):
+        df2.select_columns(np.array([0, 2]))
 
 
 def test_select_columns_by_name_error():
@@ -164,10 +162,8 @@ def test_select_columns_by_name_error():
 
     df2 = df.__dataframe__()
 
-    # with pytest.raises(ValueError):
-    assert from_dataframe(
+    with pytest.raises(ValueError, match="is not a sequence"):
         df2.select_columns_by_name(np.array(["col33", "col35"]))
-    ) == from_dataframe(df2.select_columns((0, 2)))
 
 
 def test_string():

From 0054c15285c6cb078900c909478aa2961db16c57 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Thu, 31 Mar 2022 20:34:49 +0300
Subject: [PATCH 30/49] Add a whatsnew entry

Signed-off-by: Vasily Litvinov
---
 doc/source/reference/frame.rst             |  1 +
 doc/source/reference/general_functions.rst |  7 +++++++
 doc/source/whatsnew/v1.5.0.rst             | 18 ++++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index 9a1ebc8d670dc..ea27d1efbb235 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -391,3 +391,4 @@ Serialization / IO / conversion
    DataFrame.to_clipboard
    DataFrame.to_markdown
    DataFrame.style
+   DataFrame.__dataframe__
diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst
index 4b2fadcb367a1..156d49b6df4fa 100644
--- a/doc/source/reference/general_functions.rst
+++ b/doc/source/reference/general_functions.rst
@@ -78,3 +78,10 @@ Hashing
 
    util.hash_array
    util.hash_pandas_object
+
+Importing from other dataframe libraries
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   api.exchange.from_dataframe
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 08500019143ed..01bf97f35702a 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -14,6 +14,24 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_150.enhancements.dataframe_exchange:
+
+DataFrame exchange protocol implementation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas now implements the DataFrame exchange API spec.
+See the full details of the API at https://data-apis.org/dataframe-protocol/latest/index.html
+
+The protocol consists of two parts:
+
+  - New method :meth:`DataFrame.__dataframe__` which produces the exchange object.
+    It effectively "exports" the pandas DataFrame as an exchange object, so
+    any other library which implements the protocol can "import" that dataframe
+    without knowing anything about the producer except that it produces an exchange object.
+  - New function :func:`pandas.api.exchange.from_dataframe` which can take
+    an arbitrary exchange object from any conformant library and construct a
+    pandas DataFrame out of it.
+
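+Below is a minimal sketch of the round trip with pandas on both sides
+(``xchg`` and ``df_roundtrip`` are illustrative names only; in practice the
+producer and the consumer would typically be different libraries):
+
+.. code-block:: python
+
+    import pandas as pd
+    from pandas.api.exchange import from_dataframe
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [9.2, 10.5, 11.8]})
+
+    # Producer side: "export" the DataFrame as an exchange object.
+    xchg = df.__dataframe__()
+
+    # Consumer side: "import" the exchange object; any library implementing
+    # the protocol could have produced ``xchg``.
+    df_roundtrip = from_dataframe(xchg)
+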
 .. _whatsnew_150.enhancements.styler:
 
 Styler

From f8badc6621a0b4cb347297e3eca429ba7a8401d6 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Thu, 31 Mar 2022 21:13:10 +0300
Subject: [PATCH 31/49] Fix rst linting

Signed-off-by: Vasily Litvinov
---
 doc/source/reference/general_functions.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst
index 156d49b6df4fa..a42d54b7e50ef 100644
--- a/doc/source/reference/general_functions.rst
+++ b/doc/source/reference/general_functions.rst
@@ -79,7 +79,7 @@ Hashing
    util.hash_array
    util.hash_pandas_object
 
-Importing from other dataframe libraries
+Importing from other DataFrame libraries
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autosummary::
    :toctree: api/

From 86005d440a2d78f4d44fb71fb434f221300a9ea3 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Thu, 31 Mar 2022 21:13:39 +0300
Subject: [PATCH 32/49] Fix DataFrame.__dataframe__ docstring

Signed-off-by: Vasily Litvinov
---
 pandas/core/frame.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1a994154ab332..a9ca18d036d77 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -819,8 +819,8 @@ def __dataframe__(
         """
         Return the dataframe exchange object implementing the exchange protocol.
 
-        See Also
-        --------
+        Notes
+        -----
         Details on the exchange protocol:
         https://data-apis.org/dataframe-protocol/latest/index.html
         """

From 9ab797b069d018d154be5c0e78a50edeb6484f5a Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Thu, 31 Mar 2022 21:48:15 +0300
Subject: [PATCH 33/49] Fix DataFrame.__dataframe__ docstring more

Signed-off-by: Vasily Litvinov
---
 pandas/core/frame.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a9ca18d036d77..9f11f17f77a4b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -819,10 +819,27 @@ def __dataframe__(
         """
         Return the dataframe exchange object implementing the exchange protocol.
 
+        Parameters
+        ----------
+        nan_as_null : bool, default False
+            Whether to tell the DataFrame to overwrite null values in the data
+            with ``NaN`` (or ``NaT``).
+        allow_copy : bool, default True
+            Whether to allow memory copying when exporting. If set to
+            ``False``, non-zero-copy exports will fail.
+
+        Returns
+        -------
+        DataFrame exchange object
+            The object which a consuming library can use to ingest the dataframe.
+
         Notes
         -----
         Details on the exchange protocol:
         https://data-apis.org/dataframe-protocol/latest/index.html
+
+        `nan_as_null` currently has no effect; once support for nullable extension
+        dtypes is added, this value should be propagated to columns.
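+
+        Examples
+        --------
+        A minimal sketch of the intended round trip (``xchg`` and ``df_new``
+        are illustrative names only; ``pandas.api.exchange.from_dataframe``
+        is the consumer-side counterpart of this method):
+
+        >>> df = pd.DataFrame({"a": [1, 2, 3]})
+        >>> xchg = df.__dataframe__()
+        >>> df_new = pd.api.exchange.from_dataframe(xchg)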
""" from pandas.core.exchange.dataframe import PandasDataFrameXchg From 7a54b203c0ce502b833277f01b8cf3047232de61 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 22:00:23 +0300 Subject: [PATCH 34/49] Fix test_api::TestApi Signed-off-by: Vasily Litvinov --- pandas/tests/api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 2e306c76d246c..1bc2cf5085f1a 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -274,7 +274,7 @@ def test_np(): class TestApi(Base): - allowed = ["types", "extensions", "indexers"] + allowed = ["types", "extensions", "indexers", "exchange"] def test_api(self): self.check(api, self.allowed) From 65a5370759aff07d3f5bdcb31df3fe461ab6a104 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 31 Mar 2022 23:18:02 +0300 Subject: [PATCH 35/49] Try to fix typecheck issues Signed-off-by: Vasily Litvinov --- pandas/core/exchange/buffer.py | 7 ++++-- pandas/core/exchange/column.py | 15 ++++++++----- pandas/core/exchange/dataframe_protocol.py | 2 +- pandas/core/exchange/from_dataframe.py | 25 +++++++++++++++------- pandas/tests/exchange/test_impl.py | 19 ---------------- 5 files changed, 33 insertions(+), 35 deletions(-) diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py index d527a540de371..6b0f28437c694 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/exchange/buffer.py @@ -1,4 +1,7 @@ -from typing import Tuple +from typing import ( + Optional, + Tuple, +) import numpy as np @@ -52,7 +55,7 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ Device type and device ID for where the data in the buffer resides. """ diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 785f7f02fbbe1..c265b8c38ab05 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -14,6 +14,7 @@ from pandas.core.exchange.buffer import PandasBuffer from pandas.core.exchange.dataframe_protocol import ( Column, + ColumnBuffers, ColumnNullType, DtypeKind, ) @@ -223,17 +224,21 @@ def get_buffers(self): if the data buffer does not have an associated offsets buffer. 
""" - buffers = {} - buffers["data"] = self._get_data_buffer() + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + try: buffers["validity"] = self._get_validity_buffer() except NoBufferPresent: - buffers["validity"] = None + pass try: buffers["offsets"] = self._get_offsets_buffer() except NoBufferPresent: - buffers["offsets"] = None + pass return buffers @@ -328,7 +333,7 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: # For each string, we need to manually determine the next offset values = self._col.to_numpy() ptr = 0 - offsets = [ptr] + [None] * len(values) + offsets = [ptr] + [0] * len(values) for i, v in enumerate(values): # For missing values (in this case, `np.nan` values) # we don't increment the pointer diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py index 667e856c200e7..ee2ae609e73f9 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -216,7 +216,7 @@ class Column(ABC): @property @abstractmethod - def size(self) -> Optional[int]: + def size(self) -> int: """ Size of the column, in elements. diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py index 111fa68ec6029..e8f9d462cc85f 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/exchange/from_dataframe.py @@ -2,6 +2,8 @@ import re from typing import ( Any, + Dict, + List, Optional, Tuple, Union, @@ -22,7 +24,7 @@ Endianness, ) -_NP_DTYPES = { +_NP_DTYPES: Dict[DtypeKind, Dict[int, Any]] = { DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, @@ -90,7 +92,7 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: """ # We need a dict of columns here, with each column being a NumPy array (at # least for now, deal with non-NumPy dtypes later). - columns = {} + columns: Dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): if not isinstance(name, str): @@ -161,12 +163,14 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: Tuple of pd.Series holding the data and the memory owner object that keeps the memory alive. 
""" - ordered, is_dict, mapping = col.describe_categorical.values() + categorical = col.describe_categorical - if not is_dict: + if not categorical["is_dictionary"]: raise NotImplementedError("Non-dictionary categoricals not supported yet") - categories = np.array(tuple(mapping.values())) + mapping = categorical["mapping"] + assert isinstance(mapping, dict), "Categorical mapping must be a dict" + categories = np.array(tuple(mapping[k] for k in sorted(mapping))) buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] @@ -176,7 +180,9 @@ def categorical_column_to_series(col: Column) -> Tuple[pd.Series, Any]: # out-of-bounds sentinel values in `codes` values = categories[codes % len(categories)] - cat = pd.Categorical(values, categories=categories, ordered=ordered) + cat = pd.Categorical( + values, categories=categories, ordered=categorical["is_ordered"] + ) data = pd.Series(cat) data = set_nulls(data, col, buffers["validity"]) @@ -210,6 +216,7 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: buffers = col.get_buffers() + assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units data_buff, protocol_data_dtype = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely @@ -238,13 +245,14 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]: null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + assert buffers["validity"], "Validity buffers cannot be empty for masks" valid_buff, valid_dtype = buffers["validity"] null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) if sentinel_val == 0: null_pos = ~null_pos # Assemble the strings from the code units - str_list = [None] * col.size + str_list: List[Union[None, float, str]] = [None] * col.size for i in range(col.size): # Check for missing values if null_pos is not None and null_pos[i]: @@ -448,7 +456,7 @@ def bitmask_to_bool_ndarray( def set_nulls( data: Union[np.ndarray, pd.Series], col: Column, - validity: Tuple[Buffer, Tuple[DtypeKind, int, str, str]], + validity: Optional[Tuple[Buffer, Tuple[DtypeKind, int, str, str]]], allow_modify_inplace: bool = True, ): """ @@ -478,6 +486,7 @@ def set_nulls( if null_kind == ColumnNullType.USE_SENTINEL: null_pos = data == sentinel_val elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) if sentinel_val == 0: diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index c40421da44709..3920234f635e8 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -72,7 +72,6 @@ def test_dataframe(data): df2 = df.__dataframe__() - assert df2._allow_copy is True assert df2.num_columns() == NCOLS assert df2.num_rows() == NROWS @@ -148,24 +147,6 @@ def test_mixed_missing(): assert df2.get_column_by_name(col_name).null_count == 2 -def test_select_columns_error(): - df = pd.DataFrame(int_data) - - df2 = df.__dataframe__() - - with pytest.raises(ValueError, match="is not a sequence"): - df2.select_columns(np.array([0, 2])) - - -def test_select_columns_by_name_error(): - df = pd.DataFrame(int_data) - - df2 = df.__dataframe__() - - with pytest.raises(ValueError, match="is not a sequence"): - df2.select_columns_by_name(np.array(["col33", 
"col35"])) - - def test_string(): test_str_data = string_data["separator data"] + [""] df = pd.DataFrame({"A": test_str_data}) From 594ac531ffc690c2c2b9603f8ba8983eb0d84f1c Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 14 Apr 2022 17:03:15 +0300 Subject: [PATCH 36/49] Respond to review comments Signed-off-by: Vasily Litvinov --- pandas/core/exchange/buffer.py | 23 ++++++++++++++++++----- pandas/core/exchange/column.py | 15 +++++++++++---- pandas/core/exchange/utils.py | 4 +++- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py index 6b0f28437c694..bfd2538a6e2f0 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/exchange/buffer.py @@ -4,12 +4,15 @@ ) import numpy as np +from packaging import version from pandas.core.exchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) +_NUMPY_DLPACK = version.parse("1.22.0") + class PandasBuffer(Buffer): """ @@ -49,11 +52,21 @@ def ptr(self) -> int: """ return self._x.__array_interface__["data"][0] - def __dlpack__(self): - """ - DLPack not implemented in NumPy yet, so leave it out here. - """ - raise NotImplementedError("__dlpack__") + if version.parse(np.__version__) >= _NUMPY_DLPACK: + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + return self._x.__dlpack__() + + else: + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError("__dlpack__") def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index c265b8c38ab05..085be1831d594 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -1,4 +1,3 @@ -from functools import cached_property from typing import ( Any, Tuple, @@ -6,6 +5,8 @@ import numpy as np +from pandas.util._decorators import cache_readonly + import pandas as pd from pandas.api.types import ( is_categorical_dtype, @@ -94,7 +95,7 @@ def offset(self) -> int: # TODO: chunks are implemented now, probably this should return something return 0 - @cached_property + @cache_readonly def dtype(self): dtype = self._col.dtype @@ -113,6 +114,9 @@ def dtype(self): "=", ) elif is_string_dtype(dtype): + # TODO: is_string_dtype() can return True for non-string dtypes like + # numpy arrays, because they all have an "object" dtype. + # Think on how to improve the check here. return (DtypeKind.STRING, 8, dtype_to_arrow_c_fmt(dtype), "=") else: return self._dtype_from_pandasdtype(dtype) @@ -168,7 +172,7 @@ def describe_null(self): return null, value - @cached_property + @cache_readonly def null_count(self) -> int: """ Number of null elements. Should always be known. @@ -180,7 +184,7 @@ def metadata(self): """ Store specific metadata of the column. 
""" - return {"index": self._col.index} + return {"pandas.index": self._col.index} def num_chunks(self) -> int: """ @@ -253,6 +257,7 @@ def _get_data_buffer( DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL, + DtypeKind.DATETIME, ): buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy) dtype = self.dtype @@ -281,6 +286,8 @@ def _get_data_buffer( ArrowCTypes.STRING, Endianness.NATIVE, ) # note: currently only support native endianness + elif self.dtype[0] == DtypeKind.DATETIME: + pass else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py index b3a89a1b1f484..d10be57582f9a 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/exchange/utils.py @@ -6,6 +6,8 @@ import numpy as np +from pandas._typing import DtypeObj + import pandas as pd from pandas.api.types import is_datetime64_dtype @@ -52,7 +54,7 @@ class Endianness: NA = "|" -def dtype_to_arrow_c_fmt(dtype) -> str: +def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: """ Represent pandas `dtype` as a format string in Apache Arrow C notation. From 62c43afc9e1c7495cfc3bc1495f0b700d8ede13f Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 14 Apr 2022 18:14:10 +0300 Subject: [PATCH 37/49] Fix mypy error Signed-off-by: Vasily Litvinov --- pandas/core/exchange/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/exchange/utils.py b/pandas/core/exchange/utils.py index d10be57582f9a..0c746113babee 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/exchange/utils.py @@ -3,6 +3,7 @@ """ import re +import typing import numpy as np @@ -80,7 +81,7 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if is_datetime64_dtype(dtype): # Selecting the first char of resolution string: # dtype.str -> ' Date: Thu, 14 Apr 2022 18:35:50 +0300 Subject: [PATCH 38/49] Change check for dlpack Signed-off-by: Vasily Litvinov --- pandas/core/exchange/buffer.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py index bfd2538a6e2f0..098c596bff4cd 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/exchange/buffer.py @@ -11,7 +11,7 @@ DlpackDeviceType, ) -_NUMPY_DLPACK = version.parse("1.22.0") +_NUMPY_HAS_DLPACK = version.parse(np.__version__) >= version.parse("1.22.0") class PandasBuffer(Buffer): @@ -52,21 +52,13 @@ def ptr(self) -> int: """ return self._x.__array_interface__["data"][0] - if version.parse(np.__version__) >= _NUMPY_DLPACK: - - def __dlpack__(self): - """ - Represent this structure as DLPack interface. - """ + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + if _NUMPY_HAS_DLPACK: return self._x.__dlpack__() - - else: - - def __dlpack__(self): - """ - Represent this structure as DLPack interface. 
- """ - raise NotImplementedError("__dlpack__") + raise NotImplementedError("__dlpack__") def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: """ From 804aa895521158b0a3e49c89e27dd8da757b4fcc Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 18 Apr 2022 23:16:25 +0300 Subject: [PATCH 39/49] Address review comments Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 25 +++++++++++-------------- pandas/core/exchange/dataframe.py | 2 +- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 085be1831d594..b94894b607367 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -111,13 +111,13 @@ def dtype(self): DtypeKind.CATEGORICAL, bitwidth, c_arrow_dtype_f_str, - "=", + Endianness.NATIVE, ) elif is_string_dtype(dtype): # TODO: is_string_dtype() can return True for non-string dtypes like # numpy arrays, because they all have an "object" dtype. # Think on how to improve the check here. - return (DtypeKind.STRING, 8, dtype_to_arrow_c_fmt(dtype), "=") + return (DtypeKind.STRING, 8, dtype_to_arrow_c_fmt(dtype), Endianness.NATIVE) else: return self._dtype_from_pandasdtype(dtype) @@ -308,16 +308,18 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: mask = [] # Determine the encoding for valid values - valid = 1 if invalid == 0 else 0 - - mask = [valid if isinstance(obj, str) else invalid for obj in buf] + valid = invalid == 0 + invalid = not valid + mask = np.zeros(shape=(len(buf),), dtype=np.bool8) + for i, obj in enumerate(buf): + mask[i] = valid if isinstance(obj, str) else invalid # Convert the mask array to a Pandas "buffer" using # a NumPy array as the backing store - buffer = PandasBuffer(np.asarray(mask, dtype="uint8")) + buffer = PandasBuffer(mask) # Define the dtype of the returned buffer - dtype = (DtypeKind.UINT, 8, ArrowCTypes.UINT8, Endianness.NATIVE) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) return buffer, dtype @@ -340,7 +342,7 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: # For each string, we need to manually determine the next offset values = self._col.to_numpy() ptr = 0 - offsets = [ptr] + [0] * len(values) + offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64) for i, v in enumerate(values): # For missing values (in this case, `np.nan` values) # we don't increment the pointer @@ -350,14 +352,9 @@ def _get_offsets_buffer(self) -> Tuple[PandasBuffer, Any]: offsets[i + 1] = ptr - # Convert the list of offsets to a NumPy array of signed 64-bit integers - # (note: Arrow allows the offsets array to be either `int32` or `int64`; - # here, we default to the latter) - buf = np.asarray(offsets, dtype="int64") - # Convert the offsets to a Pandas "buffer" using # the NumPy array as the backing store - buffer = PandasBuffer(buf) + buffer = PandasBuffer(offsets) # Assemble the buffer dtype info dtype = ( diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/exchange/dataframe.py index 18e275604ff96..c8a89184b34c6 100644 --- a/pandas/core/exchange/dataframe.py +++ b/pandas/core/exchange/dataframe.py @@ -48,7 +48,7 @@ def num_chunks(self) -> int: return 1 def column_names(self): - return self._df.columns.tolist() + return self._df.columns def get_column(self, i: int) -> PandasColumn: return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) From d1c0d56db0d4bd25fe0f88446e2498d7f270afe2 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 19 Apr 2022 12:22:13 +0300 
Subject: [PATCH 40/49] Remove dead elif branch Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index b94894b607367..9d7986dd68cd6 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -286,8 +286,6 @@ def _get_data_buffer( ArrowCTypes.STRING, Endianness.NATIVE, ) # note: currently only support native endianness - elif self.dtype[0] == DtypeKind.DATETIME: - pass else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") From 5d98ebf41b6370556c0ff977ca2be997fed8823e Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 19 Apr 2022 12:34:23 +0300 Subject: [PATCH 41/49] Fix tests broken by .column_names change Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 2 +- pandas/tests/exchange/test_spec_conformance.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index 9d7986dd68cd6..c6f105072b72f 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -303,11 +303,11 @@ def _get_validity_buffer(self) -> Tuple[PandasBuffer, Any]: # For now, use byte array as the mask. # TODO: maybe store as bit array to save space?.. buf = self._col.to_numpy() - mask = [] # Determine the encoding for valid values valid = invalid == 0 invalid = not valid + mask = np.zeros(shape=(len(buf),), dtype=np.bool8) for i, obj in enumerate(buf): mask[i] = valid if isinstance(obj, str) else invalid diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py index 3bce96982b155..e097cf8a407b7 100644 --- a/pandas/tests/exchange/test_spec_conformance.py +++ b/pandas/tests/exchange/test_spec_conformance.py @@ -90,10 +90,9 @@ def test_dataframe(df_from_dict): assert dfX.num_columns() == 3 assert dfX.num_rows() == 3 assert dfX.num_chunks() == 1 - assert dfX.column_names() == ["x", "y", "z"] - assert ( - dfX.select_columns((0, 2)).column_names() - == dfX.select_columns_by_name(("x", "z")).column_names() + assert list(dfX.column_names()) == ["x", "y", "z"] + assert list(dfX.select_columns((0, 2)).column_names()) == list( + dfX.select_columns_by_name(("x", "z")).column_names() ) From 60379e50b5ccb57d9f3b98329db570962704dd52 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 20 Apr 2022 16:35:07 +0300 Subject: [PATCH 42/49] Add tests for datetime dtype Signed-off-by: Vasily Litvinov --- pandas/tests/exchange/test_impl.py | 35 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index 3920234f635e8..c9bc2d466fae1 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -1,3 +1,4 @@ +from datetime import datetime import random import numpy as np @@ -18,24 +19,24 @@ NCOLS, NROWS = 100, 200 -int_data = { - f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ - random.randint(0, 100) for _ in range(NROWS) - ] - for i in range(NCOLS) -} -bool_data = { - f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [ - random.choice([True, False]) for _ in range(NROWS) - ] - for i in range(NCOLS) -} +def _make_data(make_one): + return { + f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [make_one() for _ in range(NROWS)] + for i in range(NCOLS) + } -float_data = { - f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [random.random() for _ in range(NROWS)] - for i in 
range(NCOLS)
-}
+
+int_data = _make_data(lambda: random.randint(0, 100))
+bool_data = _make_data(lambda: random.choice([True, False]))
+float_data = _make_data(lambda: random.random())
+datetime_data = _make_data(
+    lambda: datetime(
+        year=random.randint(1900, 2100),
+        month=random.randint(1, 12),
+        day=random.randint(1, 20),
+    )
+)
 
 string_data = {
     "separator data": [
@@ -66,7 +67,7 @@ def test_categorical_dtype(data):
     tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
 
 
-@pytest.mark.parametrize("data", [int_data, float_data, bool_data])
+@pytest.mark.parametrize("data", [int_data, float_data, bool_data, datetime_data])
 def test_dataframe(data):
     df = pd.DataFrame(data)
 

From 497ca2422456581571a6b211e50b003ac4fc5b37 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Wed, 20 Apr 2022 16:48:58 +0300
Subject: [PATCH 43/49] Fix from_dataframe docstring

Signed-off-by: Vasily Litvinov
---
 pandas/core/exchange/from_dataframe.py | 22 +++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py
index e8f9d462cc85f..18d8c9c257af4 100644
--- a/pandas/core/exchange/from_dataframe.py
+++ b/pandas/core/exchange/from_dataframe.py
@@ -33,6 +33,21 @@
 
 
 def from_dataframe(df, allow_copy=True):
+    """
+    Build a ``pd.DataFrame`` from any DataFrame supporting the exchange protocol.
+
+    Parameters
+    ----------
+    df : DataFrameXchg
+        Object supporting the exchange protocol, i.e. `__dataframe__` method.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if ``False``, a zero-copy approach is requested).
+
+    Returns
+    -------
+    pd.DataFrame
+    """
     if isinstance(df, pd.DataFrame):
         return df
 
@@ -50,8 +65,9 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True):
     ----------
     df : DataFrameXchg
         Object supporting the exchange protocol, i.e. `__dataframe__` method.
-    n_chunks : int, optional
-        Number of chunks to split `df`.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if ``False``, a zero-copy approach is requested).
Returns ------- @@ -69,7 +85,7 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): if len(pandas_dfs) == 1: pandas_df = pandas_dfs[0] else: - pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True) + pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False) index_obj = df.metadata.get("pandas.index", None) if index_obj is not None: From 39f5a5cb3b93bfab0203b9be3d6b116f3a289ec7 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 21 Apr 2022 18:05:33 +0300 Subject: [PATCH 44/49] Add tests for uint dtype Signed-off-by: Vasily Litvinov --- pandas/tests/exchange/test_impl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index c9bc2d466fae1..dbb629832f2cb 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -27,7 +27,8 @@ def _make_data(make_one): } -int_data = _make_data(lambda: random.randint(0, 100)) +int_data = _make_data(lambda: random.randint(-100, 100)) +uint_data = _make_data(lambda: random.randint(1, 100)) bool_data = _make_data(lambda: random.choice([True, False])) float_data = _make_data(lambda: random.random()) datetime_data = _make_data( @@ -67,7 +68,9 @@ def test_categorical_dtype(data): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -@pytest.mark.parametrize("data", [int_data, float_data, bool_data, datetime_data]) +@pytest.mark.parametrize( + "data", [int_data, uint_data, float_data, bool_data, datetime_data] +) def test_dataframe(data): df = pd.DataFrame(data) From d73558a1390130a3526a5d19a48295ca0997fc61 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Apr 2022 19:42:27 +0300 Subject: [PATCH 45/49] Handle string dtype better Signed-off-by: Vasily Litvinov --- pandas/core/exchange/column.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index c6f105072b72f..ae24c5d295cc9 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -5,6 +5,7 @@ import numpy as np +from pandas._libs.lib import infer_dtype from pandas.util._decorators import cache_readonly import pandas as pd @@ -114,10 +115,14 @@ def dtype(self): Endianness.NATIVE, ) elif is_string_dtype(dtype): - # TODO: is_string_dtype() can return True for non-string dtypes like - # numpy arrays, because they all have an "object" dtype. - # Think on how to improve the check here. 
-            return (DtypeKind.STRING, 8, dtype_to_arrow_c_fmt(dtype), Endianness.NATIVE)
+            if infer_dtype(self._col) == "string":
+                return (
+                    DtypeKind.STRING,
+                    8,
+                    dtype_to_arrow_c_fmt(dtype),
+                    Endianness.NATIVE,
+                )
+            raise NotImplementedError("Non-string object dtypes are not supported yet")
         else:
             return self._dtype_from_pandasdtype(dtype)

From 4ed35bf01b2fcb5a239a71491ca1d05c41e662ef Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Fri, 22 Apr 2022 19:47:14 +0300
Subject: [PATCH 46/49] Add test for mixed object dtype

Signed-off-by: Vasily Litvinov
---
 pandas/tests/exchange/test_impl.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py
index dbb629832f2cb..a40e5fef789ad 100644
--- a/pandas/tests/exchange/test_impl.py
+++ b/pandas/tests/exchange/test_impl.py
@@ -167,3 +167,10 @@ def test_string():
     assert col.null_count == 1
     assert col.dtype[0] == DtypeKind.STRING
     assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
+
+
+def test_nonstring_object():
+    df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
+    col = df.__dataframe__().get_column_by_name("A")
+    with pytest.raises(NotImplementedError, match="not supported yet"):
+        col.dtype

From 2fca3c062a0a7e40b50d19a781c2c6efa53ec9d2 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Sat, 23 Apr 2022 10:35:52 +0300
Subject: [PATCH 47/49] Rename spec test for clarity

Signed-off-by: Vasily Litvinov
---
 pandas/tests/exchange/test_spec_conformance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py
index e097cf8a407b7..93650553aec80 100644
--- a/pandas/tests/exchange/test_spec_conformance.py
+++ b/pandas/tests/exchange/test_spec_conformance.py
@@ -29,7 +29,7 @@ def test_only_one_dtype(test_data, df_from_dict):
         assert dfX.get_column_by_name(column).offset == 0
 
 
-def test_float_int(df_from_dict):
+def test_mixed_dtypes(df_from_dict):
     df = df_from_dict(
         {
             "a": [1, 2, 3],

From f030d9f27a99b8a91141810d58738b89be2ca1da Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Sun, 24 Apr 2022 12:52:27 +0300
Subject: [PATCH 48/49] Add missing test cases in test_dtype_to_arrow_c_fmt

Signed-off-by: Vasily Litvinov
---
 pandas/tests/exchange/test_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/exchange/test_utils.py
index a1341ef57feec..4c80ecf0d23a0 100644
--- a/pandas/tests/exchange/test_utils.py
+++ b/pandas/tests/exchange/test_utils.py
@@ -31,6 +31,8 @@
             pd.Series([0]).astype("datetime64[ns]").dtype,
             "tsn:",
         ),
+        (pd.CategoricalDtype(["a"]), "l"),
+        (np.dtype("O"), "u"),
     ],
 )
 def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string):  # PR01

From cc94e57a886e1bc638e913d1196bf49cd324f73c Mon Sep 17 00:00:00 2001
From: Vasily Litvinov
Date: Sun, 24 Apr 2022 13:02:53 +0300
Subject: [PATCH 49/49] Add comments explaining magic dtype numbers

Signed-off-by: Vasily Litvinov
---
 .../tests/exchange/test_spec_conformance.py   | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/exchange/test_spec_conformance.py
index 93650553aec80..f5b8bb569f35e 100644
--- a/pandas/tests/exchange/test_spec_conformance.py
+++ b/pandas/tests/exchange/test_spec_conformance.py
@@ -32,15 +32,18 @@ def test_mixed_dtypes(df_from_dict):
     df = df_from_dict(
         {
-            "a": [1, 2, 3],
-            "b": [3, 4, 5],
-            "c": [1.5, 2.5, 
3.5], - "d": [9, 10, 11], - "e": [True, False, True], - "f": ["a", "", "c"], + "a": [1, 2, 3], # dtype kind INT = 0 + "b": [3, 4, 5], # dtype kind INT = 0 + "c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2 + "d": [9, 10, 11], # dtype kind INT = 0 + "e": [True, False, True], # dtype kind BOOLEAN = 20 + "f": ["a", "", "c"], # dtype kind STRING = 21 } ) dfX = df.__dataframe__() + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere*; + # values for dtype[0] are explained above columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} for column, kind in columns.items(): @@ -120,8 +123,10 @@ def test_get_columns(df_from_dict): for colX in dfX.get_columns(): assert colX.size == 2 assert colX.num_chunks() == 1 - assert dfX.get_column(0).dtype[0] == 0 - assert dfX.get_column(1).dtype[0] == 2 + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dfX.get_column(0).dtype[0] == 0 # INT + assert dfX.get_column(1).dtype[0] == 2 # FLOAT def test_buffer(df_from_dict): @@ -137,7 +142,9 @@ def test_buffer(df_from_dict): assert dataBuf.ptr != 0 device, _ = dataBuf.__dlpack_device__() - assert dataDtype[0] == 0 + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dataDtype[0] == 0 # INT if device == 1: # CPU-only as we're going to directly read memory here bitwidth = dataDtype[1]