Skip to content

Commit

Permalink
Use true ASCII attributes in dataframes
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Oct 4, 2022
1 parent 1d08b4c commit d236a9b
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 88 deletions.
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ jobs:
- runs-on: ubuntu-22.04
cc: gcc-11
cxx: g++-11
# Pending https://github.com/actions/runner-images/issues/6350
- runs-on: macos-11
cc: gcc-11
cxx: g++-11
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/cpp-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jobs:
cc: gcc-11
cxx: g++-11
# Pending https://github.com/actions/runner-images/issues/6350
# - runs-on: macos-12
- runs-on: macos-11
cc: gcc-11
cxx: g++-11
Expand Down
48 changes: 6 additions & 42 deletions apis/python/src/tiledbsoma/soma_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pyarrow as pa
import tiledb

from . import util, util_arrow, util_pandas, util_tiledb
from . import util, util_arrow, util_tiledb
from .logging import log_io
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
Expand Down Expand Up @@ -218,15 +218,10 @@ def read(
iterator = query.df[ids]

for table in iterator:
# XXX COMMENT MORE
# This is the 'decode on read' part of our logic; in dim_select we have the
# 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
#
# Also: don't materialize these on read
# Don't materialize these on read
# TODO: get the arrow syntax for drop
# df.drop(ROWID, axis=1)
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
yield table

def read_all(
self,
Expand Down Expand Up @@ -360,11 +355,6 @@ def read_as_pandas(

for df in iterator:

# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
# write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
df = util_pandas.ascii_to_unicode_pandas_readback(df)

if id_column_name is not None:
df.reset_index(inplace=True)
df.set_index(id_column_name, inplace=True)
Expand Down Expand Up @@ -445,39 +435,13 @@ def write_from_pandas(

dataframe.set_index(ROWID, inplace=True)

# ISSUE:
#
# TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
# QueryCondition API. While this needs to be addressed -- global collaborators will want to
# write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
# to query, we need to store these as ASCII.
#
# This is (besides collation) a storage-level issue not a presentation-level issue: At write
# time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
# SOMA is an API: utf8-decode those strings when a query is done & give the user back
# "α,β,γ".
#
# CONTEXT:
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
# https://github.com/single-cell-data/TileDB-SOMA/pull/101
# https://github.com/single-cell-data/TileDB-SOMA/issues/106
# https://github.com/single-cell-data/TileDB-SOMA/pull/117
#
# IMPLEMENTATION:
# Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
# Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ columns to a
# particular type even if they shouldn't be.
#
# Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
# is going to be doing anyway, namely, type-inferring to see what is going to be a string.
#
# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
# Force ASCII storage if string, in order to make obs/var columns queryable.
# TODO: when UTF-8 attributes are fully supported we can remove this.
column_types = {}
for column_name in dataframe.keys():
dfc = dataframe[column_name]
if len(dfc) > 0 and type(dfc[0]) == str:
# Force ASCII storage if string, in order to make obs/var columns queryable.
column_types[column_name] = np.dtype("S")
column_types[column_name] = "ascii"

tiledb.from_pandas(
uri=self.uri,
Expand Down
6 changes: 1 addition & 5 deletions apis/python/src/tiledbsoma/soma_indexed_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
# XXX COMMENT MORE
# This is the 'decode on read' part of our logic; in dim_select we have the
# 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
yield table

def read_all(
self,
Expand Down
34 changes: 8 additions & 26 deletions apis/python/src/tiledbsoma/util_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
#
pa.string(): np.dtype(
"S"
), # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.binary(): np.dtype("S"),
pa.timestamp("s"): "datetime64[s]",
pa.timestamp("ms"): "datetime64[ms]",
Expand All @@ -39,7 +37,7 @@
}


def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
"""
Given an Arrow type, return the corresponding TileDB type: a NumPy dtype, or the string "ascii" for Arrow string types.
Building block for Arrow-to-TileDB schema translation.
Expand All @@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
arrow_type = ARROW_TO_TDB[t]
if isinstance(arrow_type, Exception):
raise arrow_type
return np.dtype(arrow_type)
if arrow_type == "ascii":
return arrow_type
else:
return np.dtype(arrow_type)

if not pa.types.is_primitive(t):
raise TypeError(f"Type {str(t)} - unsupported type")
Expand All @@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
raise TypeError("Unsupported Arrow type") from exc


def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
"""
TODO: COMMENT
"""
if tiledb_dtype.name == "bytes":
if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
return pa.string()
else:
Expand Down Expand Up @@ -119,22 +120,3 @@ def get_arrow_schema_from_tiledb_uri(
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)

return pa.schema(arrow_schema_dict)


def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
    """
    Implements the 'decode on read' part of our ASCII/Unicode logic.

    Every column whose first element is a ``pa.LargeBinaryScalar`` is rebuilt as an
    array of UTF-8-decoded Python strings. All other columns, including empty ones,
    are passed through untouched. The result is a new table assembled from those
    columns under the original column names.
    """
    # TODO: COMMENT/LINK HEAVILY
    column_names = [field.name for field in table.schema]

    def _decode_if_binary(column):
        # Only the first element is inspected to decide whether the column is binary.
        holds_binary = len(column) > 0 and isinstance(
            column[0], pa.LargeBinaryScalar
        )
        if not holds_binary:
            return column
        return pa.array([value.as_py().decode("utf-8") for value in column])

    decoded_columns = [_decode_if_binary(table[name]) for name in column_names]
    return pa.Table.from_arrays(decoded_columns, names=column_names)
13 changes: 0 additions & 13 deletions apis/python/src/tiledbsoma/util_pandas.py

This file was deleted.

2 changes: 1 addition & 1 deletion apis/python/tests/test_type_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type):
pytest.xfail("Awaiting UTF-8 support - see issue #338")

tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
assert isinstance(tdb_dtype, np.dtype)
assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
assert isinstance(rt_arrow_type, pa.DataType)
assert arrow_type == rt_arrow_type
Expand Down

0 comments on commit d236a9b

Please sign in to comment.