From 0086e2f843b8a57a7ee5bcf0437d9e5fa3471454 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 11:24:24 -0400 Subject: [PATCH] Use true ASCII attributes in dataframes --- .github/workflows/ci.yml | 1 - .github/workflows/cpp-ci.yml | 1 + apis/python/src/tiledbsoma/soma_dataframe.py | 48 +++---------------- .../src/tiledbsoma/soma_indexed_dataframe.py | 6 +-- apis/python/src/tiledbsoma/util_arrow.py | 34 ++++--------- apis/python/src/tiledbsoma/util_pandas.py | 13 ----- apis/python/tests/test_type_system.py | 2 +- 7 files changed, 17 insertions(+), 88 deletions(-) delete mode 100644 apis/python/src/tiledbsoma/util_pandas.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8b0cb87f1d..3d9e23d64a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,6 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 - # Pending https://github.com/actions/runner-images/issues/6350 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml index e22f826532..0b0345e0b1 100644 --- a/.github/workflows/cpp-ci.yml +++ b/.github/workflows/cpp-ci.yml @@ -18,6 +18,7 @@ jobs: cc: gcc-11 cxx: g++-11 # Pending https://github.com/actions/runner-images/issues/6350 + # - runs-on: macos-12 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 7ce148e5bf..eab8621cc7 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,7 +5,7 @@ import pyarrow as pa import tiledb -from . import util, util_arrow, util_pandas, util_tiledb +from . import util, util_arrow, util_tiledb from .logging import log_io from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray @@ -218,15 +218,10 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - # - # Also: don't materialize these on read + # Don't materialize these on read # TODO: get the arrow syntax for drop # df.drop(ROWID, axis=1) - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, @@ -360,11 +355,6 @@ def read_as_pandas( for df in iterator: - # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on - # write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - df = util_pandas.ascii_to_unicode_pandas_readback(df) - if id_column_name is not None: df.reset_index(inplace=True) df.set_index(id_column_name, inplace=True) @@ -445,39 +435,13 @@ def write_from_pandas( dataframe.set_index(ROWID, inplace=True) - # ISSUE: - # - # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB - # QueryCondition API. While this needs to be addressed -- global collaborators will want to - # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible - # to query, we need to store these as ASCII. - # - # This is (besides collation) a storage-level issue not a presentation-level issue: At write - # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since - # SOMA is an API: utf8-decode those strings when a query is done & give the user back - # "α,β,γ". - # - # CONTEXT: - # https://github.com/single-cell-data/TileDB-SOMA/issues/99 - # https://github.com/single-cell-data/TileDB-SOMA/pull/101 - # https://github.com/single-cell-data/TileDB-SOMA/issues/106 - # https://github.com/single-cell-data/TileDB-SOMA/pull/117 - # - # IMPLEMENTATION: - # Python types -- float, string, what have you -- appear as dtype('O') which is not useful. - # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a - # particular if they shouldn't be. - # - # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas`` - # is going to be doing anyway, namely, type-inferring to see what is going to be a string. - # - # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. + # Force ASCII storage if string, in order to make obs/var columns queryable. + # TODO: when UTF-8 attributes are fully supported we can remove this. column_types = {} for column_name in dataframe.keys(): dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: - # Force ASCII storage if string, in order to make obs/var columns queryable. - column_types[column_name] = np.dtype("S") + column_types[column_name] = "ascii" tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 27e8130b00..539d4e5140 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -259,11 +259,7 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99. - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index cb0620f913..8ceb7d4627 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -23,9 +23,7 @@ # # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table. # - pa.string(): np.dtype( - "S" - ), # XXX TODO: temporary work-around until UTF8 support is native. GH #338. + pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.binary(): np.dtype("S"), pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", @@ -39,7 +37,7 @@ } -def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: +def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]: """ Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. Building block for Arrow-to-TileDB schema translation. @@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: arrow_type = ARROW_TO_TDB[t] if isinstance(arrow_type, Exception): raise arrow_type - return np.dtype(arrow_type) + if arrow_type == "ascii": + return arrow_type + else: + return np.dtype(arrow_type) if not pa.types.is_primitive(t): raise TypeError(f"Type {str(t)} - unsupported type") @@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: raise TypeError("Unsupported Arrow type") from exc -def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType: +def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType: """ TODO: COMMENT """ - if tiledb_dtype.name == "bytes": + if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes": # XXX TODO: temporary work-around until UTF8 support is native. GH #338. return pa.string() else: @@ -119,22 +120,3 @@ def get_arrow_schema_from_tiledb_uri( arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype) return pa.schema(arrow_schema_dict) - - -def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic - """ - # TODO: COMMENT/LINK HEAVILY - names = [ofield.name for ofield in table.schema] - new_fields = [] - for name in names: - old_field = table[name] - if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar): - nfield = pa.array( - [element.as_py().decode("utf-8") for element in old_field] - ) - new_fields.append(nfield) - else: - new_fields.append(old_field) - return pa.Table.from_arrays(new_fields, names=names) diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py deleted file mode 100644 index b5caf01255..0000000000 --- a/apis/python/src/tiledbsoma/util_pandas.py +++ /dev/null @@ -1,13 +0,0 @@ -import pandas as pd - - -def ascii_to_unicode_pandas_readback(df: pd.DataFrame) -> pd.DataFrame: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic. - """ - # TODO: COMMENT/LINK HEAVILY - for k in df: - dfk = df[k] - if len(dfk) > 0 and type(dfk.iat[0]) == bytes: - df[k] = dfk.map(lambda e: e.decode()) - return df diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py index 74af8fb5bb..4e0d515be4 100644 --- a/apis/python/tests/test_type_system.py +++ b/apis/python/tests/test_type_system.py @@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type): pytest.xfail("Awaiting UTF-8 support - see issue #338") tdb_dtype = tiledb_type_from_arrow_type(arrow_type) - assert isinstance(tdb_dtype, np.dtype) + assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) assert isinstance(rt_arrow_type, pa.DataType) assert arrow_type == rt_arrow_type