From a8dbbb2c6e49af74bc8058397adaea944973e6cb Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 2 Oct 2022 23:49:10 -0400 Subject: [PATCH] Use true ASCII attributes in dataframes --- apis/python/src/tiledbsoma/soma_dataframe.py | 8 ++--- .../src/tiledbsoma/soma_indexed_dataframe.py | 6 +--- apis/python/src/tiledbsoma/util_arrow.py | 34 +++++-------------- apis/python/src/tiledbsoma/util_pandas.py | 13 ------- apis/python/tests/test_type_system.py | 2 +- 5 files changed, 14 insertions(+), 49 deletions(-) delete mode 100644 apis/python/src/tiledbsoma/util_pandas.py diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 7ce148e5bf..d5bffe50ce 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,7 +5,7 @@ import pyarrow as pa import tiledb -from . import util, util_arrow, util_pandas, util_tiledb +from . import util, util_arrow, util_tiledb from .logging import log_io from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray @@ -226,7 +226,7 @@ def read( # Also: don't materialize these on read # TODO: get the arrow syntax for drop # df.drop(ROWID, axis=1) - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, @@ -363,7 +363,7 @@ def read_as_pandas( # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on # write' part. # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - df = util_pandas.ascii_to_unicode_pandas_readback(df) + df = df if id_column_name is not None: df.reset_index(inplace=True) @@ -477,7 +477,7 @@ def write_from_pandas( dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: # Force ASCII storage if string, in order to make obs/var columns queryable. - column_types[column_name] = np.dtype("S") + column_types[column_name] = "ascii" tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 27e8130b00..539d4e5140 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -259,11 +259,7 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99. - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index cb0620f913..8ceb7d4627 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -23,9 +23,7 @@ # # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table. # - pa.string(): np.dtype( - "S" - ), # XXX TODO: temporary work-around until UTF8 support is native. GH #338. + pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.binary(): np.dtype("S"), pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", @@ -39,7 +37,7 @@ } -def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: +def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]: """ Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. Building block for Arrow-to-TileDB schema translation. @@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: arrow_type = ARROW_TO_TDB[t] if isinstance(arrow_type, Exception): raise arrow_type - return np.dtype(arrow_type) + if arrow_type == "ascii": + return arrow_type + else: + return np.dtype(arrow_type) if not pa.types.is_primitive(t): raise TypeError(f"Type {str(t)} - unsupported type") @@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: raise TypeError("Unsupported Arrow type") from exc -def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType: +def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType: """ TODO: COMMENT """ - if tiledb_dtype.name == "bytes": + if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes": # XXX TODO: temporary work-around until UTF8 support is native. GH #338. return pa.string() else: @@ -119,22 +120,3 @@ def get_arrow_schema_from_tiledb_uri( arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype) return pa.schema(arrow_schema_dict) - - -def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic - """ - # TODO: COMMENT/LINK HEAVILY - names = [ofield.name for ofield in table.schema] - new_fields = [] - for name in names: - old_field = table[name] - if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar): - nfield = pa.array( - [element.as_py().decode("utf-8") for element in old_field] - ) - new_fields.append(nfield) - else: - new_fields.append(old_field) - return pa.Table.from_arrays(new_fields, names=names) diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py deleted file mode 100644 index b5caf01255..0000000000 --- a/apis/python/src/tiledbsoma/util_pandas.py +++ /dev/null @@ -1,13 +0,0 @@ -import pandas as pd - - -def ascii_to_unicode_pandas_readback(df: pd.DataFrame) -> pd.DataFrame: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic. - """ - # TODO: COMMENT/LINK HEAVILY - for k in df: - dfk = df[k] - if len(dfk) > 0 and type(dfk.iat[0]) == bytes: - df[k] = dfk.map(lambda e: e.decode()) - return df diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py index 74af8fb5bb..4e0d515be4 100644 --- a/apis/python/tests/test_type_system.py +++ b/apis/python/tests/test_type_system.py @@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type): pytest.xfail("Awaiting UTF-8 support - see issue #338") tdb_dtype = tiledb_type_from_arrow_type(arrow_type) - assert isinstance(tdb_dtype, np.dtype) + assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) assert isinstance(rt_arrow_type, pa.DataType) assert arrow_type == rt_arrow_type