[python] Use true ASCII attributes in dataframes #359

Merged
merged 9 commits into from
Oct 11, 2022

Changes from all commits
49 changes: 8 additions & 41 deletions apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,7 +5,7 @@
import pyarrow as pa
import tiledb

from . import util, util_arrow, util_pandas, util_tiledb
from . import util, util_arrow, util_tiledb
from .logging import log_io
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
@@ -205,11 +205,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
# XXX COMMENT MORE
# This is the 'decode on read' part of our logic; in dim_select we have the
# 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
yield table

def read_all(
self,
@@ -275,7 +271,7 @@ def write(self, values: pa.Table) -> None:
if name != ROWID:
attr_cols_map[name] = np.asarray(
values.column(name).to_pandas(
types_mapper=util_arrow.tiledb_type_from_arrow_type,
types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write,
)
)

@@ -343,11 +339,6 @@ def read_as_pandas(

for df in iterator:

# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
# write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
df = util_pandas.ascii_to_unicode_pandas_readback(df)

if id_column_name is not None:
df.reset_index(inplace=True)
df.set_index(id_column_name, inplace=True)
@@ -428,39 +419,15 @@ def write_from_pandas(

dataframe.set_index(ROWID, inplace=True)

# ISSUE:
#
# TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
# QueryCondition API. While this needs to be addressed -- global collaborators will want to
# write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
# to query, we need to store these as ASCII.
#
# This is (besides collation) a storage-level issue not a presentation-level issue: At write
# time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
# SOMA is an API: utf8-decode those strings when a query is done & give the user back
# "α,β,γ".
#
# CONTEXT:
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
# https://github.com/single-cell-data/TileDB-SOMA/pull/101
# https://github.com/single-cell-data/TileDB-SOMA/issues/106
# https://github.com/single-cell-data/TileDB-SOMA/pull/117
#
# IMPLEMENTATION:
# Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
# Also, ``tiledb.from_pandas`` has ``column_types``, but that _forces_ things to a
# particular type even when they shouldn't be.
#
# Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
# is going to be doing anyway, namely, type-inferring to see what is going to be a string.
#
# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
# Force ASCII storage if string, in order to make obs/var columns queryable.
# TODO: when UTF-8 attributes are fully supported we can remove this.
column_types = {}
for column_name in dataframe.keys():
dfc = dataframe[column_name]
if len(dfc) > 0 and type(dfc[0]) == str:
# Force ASCII storage if string, in order to make obs/var columns queryable.
column_types[column_name] = np.dtype("S")
column_types[column_name] = "ascii"
if len(dfc) > 0 and type(dfc[0]) == bytes:
column_types[column_name] = "bytes"

tiledb.from_pandas(
uri=self.uri,
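
Note on the write path above: ``write_from_pandas`` now decides storage types by sniffing the first element of each column, declaring ``"ascii"`` for ``str`` data and ``"bytes"`` for ``bytes`` data, which is what keeps obs/var columns usable in TileDB-Py ``QueryCondition`` filters until native UTF-8 query support lands. A minimal standalone sketch of that convention (the URI, the data, and ``sparse=True`` are placeholders for this sketch, not the PR's truncated ``from_pandas`` call verbatim):

```python
import numpy as np
import pandas as pd
import tiledb

df = pd.DataFrame(
    {
        "soma_rowid": np.arange(3, dtype=np.uint64),
        "name": ["alpha", "beta", "gamma"],  # Python str -> store as "ascii"
        "blob": [b"a", b"b", b"c"],          # Python bytes -> store as "bytes"
    }
)
df.set_index("soma_rowid", inplace=True)

# Mirror the diff's type-inference loop: look at the first element of each
# column and pick the TileDB storage type string accordingly.
column_types = {}
for column_name in df.keys():
    dfc = df[column_name]
    if len(dfc) > 0 and type(dfc[0]) == str:
        column_types[column_name] = "ascii"
    if len(dfc) > 0 and type(dfc[0]) == bytes:
        column_types[column_name] = "bytes"

# "/tmp/example-sdf" is an assumed URI; the diff view truncates the real call.
tiledb.from_pandas(
    uri="/tmp/example-sdf",
    dataframe=df,
    sparse=True,
    column_types=column_types,
)
```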
6 changes: 1 addition & 5 deletions apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -250,11 +250,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
# XXX COMMENT MORE
# This is the 'decode on read' part of our logic; in dim_select we have the
# 'encode on write' part.
# Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
yield table

def read_all(
self,
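
Both ``read`` paths now yield tables unchanged. A pure-Arrow sketch of why this is safe, based on the deleted readback shim whose full body is visible in the util_arrow.py diff below: under the old ``np.dtype("S")`` storage, string data came back as ``large_binary`` and needed per-element UTF-8 decoding on every read, while ASCII-typed attributes surface as ``large_string`` directly:

```python
import pyarrow as pa

# Old storage: np.dtype("S") attributes read back as large_binary, so every
# table needed the element-by-element decode pass that this PR deletes.
old_style = pa.table(
    {"baz": pa.array([b"\xce\xb1", b"\xce\xb2"], type=pa.large_binary())}
)
decoded = pa.array([e.as_py().decode("utf-8") for e in old_style["baz"]])
print(decoded.to_pylist())  # ['α', 'β']

# New storage: "ascii" attributes read back as large_string, so the iterator
# can yield each table as-is.
new_style = pa.table(
    {"baz": pa.array(["alpha", "beta"], type=pa.large_string())}
)
print(new_style["baz"].type)  # large_string
```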
68 changes: 34 additions & 34 deletions apis/python/src/tiledbsoma/util_arrow.py
@@ -9,24 +9,28 @@
of representing full type semantics, and correctly performing a
round trip conversion (eg, T == to_arrow(to_tiledb(T)))

Most primitive types are simple - eg, uint8. Of particular challenge
Most primitive types are simple -- e.g., uint8. Of particular challenge
are datetime/timestamps as TileDB has no distinction between a "datetime" and
a "timedelta". The best Arrow match is TimestampType, as long as that
TimestampType instance does NOT have a timezone set.

Because of our round-trip requirement, all other Arrow temporal types
are unsupported (even though they are just int64 under the covers).

We auto-promote Arrow's string and binary to large_string and large_binary,
respectively, as this is what TileDB stores -- a sequence of bytes preceded
by a 64-bit (not 32-bit) length int.
"""
ARROW_TO_TDB = {
# Dict of types unsupported by to_pandas_dtype, which require overrides.
# If the value is an instance of Exception, it will be raised.
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
#
pa.string(): np.dtype(
"S"
), # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.binary(): np.dtype("S"),
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.timestamp("s"): "datetime64[s]",
pa.timestamp("ms"): "datetime64[ms]",
pa.timestamp("us"): "datetime64[us]",
@@ -39,7 +43,21 @@
}


def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
"""
Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array.
The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column,
we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to
use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.
"""
retval = tiledb_type_from_arrow_type(t)
if retval == "ascii":
return str
else:
return retval


def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
"""
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
Building block for Arrow-to-TileDB schema translation.
@@ -61,6 +79,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
arrow_type = ARROW_TO_TDB[t]
if isinstance(arrow_type, Exception):
raise arrow_type
if arrow_type == "ascii":
return arrow_type
if arrow_type == "bytes":
return arrow_type # np.int8()
return np.dtype(arrow_type)

if not pa.types.is_primitive(t):
@@ -83,15 +105,16 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
raise TypeError("Unsupported Arrow type") from exc


def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
"""
TODO: COMMENT
"""
if tiledb_dtype.name == "bytes":
if tiledb_dtype == "bytes":
return pa.large_binary()
if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
return pa.string()
else:
return pa.from_numpy_dtype(tiledb_dtype)
return pa.large_string()
return pa.from_numpy_dtype(tiledb_dtype)


def get_arrow_schema_from_tiledb_uri(
@@ -119,26 +142,3 @@ def get_arrow_schema_from_tiledb_uri(
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)

return pa.schema(arrow_schema_dict)


def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
"""
Implements the 'decode on read' part of our ASCII/Unicode logic
"""
# TODO: COMMENT/LINK HEAVILY
names = [ofield.name for ofield in table.schema]
new_fields = []
for name in names:
old_field = table[name]
# Preferred syntax:
# if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
# but:
# AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
nfield = pa.array(
[element.as_py().decode("utf-8") for element in old_field]
)
new_fields.append(nfield)
else:
new_fields.append(old_field)
return pa.Table.from_arrays(new_fields, names=names)
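
Taken together, the util_arrow.py changes form a small table-driven mapping. A condensed, self-contained sketch of that behavior (approximating the functions above rather than importing them, keeping only the string/binary rows of ``ARROW_TO_TDB`` plus a primitive fallback):

```python
from typing import Union

import numpy as np
import pyarrow as pa

# Approximation of ARROW_TO_TDB for the cases this diff changes.
ARROW_TO_TDB = {
    pa.string(): "ascii",
    pa.large_string(): "ascii",
    pa.binary(): "bytes",
    pa.large_binary(): "bytes",
}

def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
    if t in ARROW_TO_TDB:
        return ARROW_TO_TDB[t]  # "ascii"/"bytes" pass through as strings
    return np.dtype(t.to_pandas_dtype())  # primitive fallback

def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
    # Schema creation wants the literal "ascii"; writing data wants str.
    retval = tiledb_type_from_arrow_type(t)
    return str if retval == "ascii" else retval

def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
    if tiledb_dtype == "bytes":
        return pa.large_binary()
    if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
        return pa.large_string()
    return pa.from_numpy_dtype(tiledb_dtype)

assert tiledb_type_from_arrow_type(pa.string()) == "ascii"
assert tiledb_type_from_arrow_type_for_write(pa.string()) is str
# Round trip widens to the large_ variant, as the module docstring promises:
assert get_arrow_type_from_tiledb_dtype("ascii") == pa.large_string()
assert get_arrow_type_from_tiledb_dtype(np.dtype("int32")) == pa.int32()
```

The asymmetry is deliberate: both ``string`` and ``large_string`` narrow to ``"ascii"`` on the way in, and widen back to ``large_string`` on the way out, matching the docstring's note that TileDB stores byte sequences with 64-bit length prefixes.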
13 changes: 0 additions & 13 deletions apis/python/src/tiledbsoma/util_pandas.py

This file was deleted.

4 changes: 2 additions & 2 deletions apis/python/tests/test_soma_collection.py
@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)

@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):

elif class_name == "SOMADataFrame":
so = soma.SOMADataFrame(uri=uri)
so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))

elif class_name == "SOMAIndexedDataFrame":
so = soma.SOMAIndexedDataFrame(uri=uri)
38 changes: 29 additions & 9 deletions apis/python/tests/test_soma_dataframe.py
@@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path):
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)
sdf.create(schema=asch)
@@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path):
("soma_rowid", pa.uint64()),
("A", pa.int64()),
("B", pa.float64()),
("C", pa.string()),
("C", pa.large_string()),
]
)
sdf = t.SOMADataFrame(uri=tmp_path.as_posix())
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
schema, sdf, n_data = simple_soma_data_frame
assert sdf.exists()

def _check_tbl(tbl, col_names, ids):
def _check_tbl(tbl, col_names, ids, *, demote):
assert tbl.num_columns == (
len(schema.names) if col_names is None else len(col_names)
)
assert tbl.num_rows == (n_data if ids is None else len(ids))
assert tbl.schema == pa.schema(
[
schema.field(f)
for f in (col_names if col_names is not None else schema.names)
]
)

if demote:
assert tbl.schema == pa.schema(
[
pa.field(schema.field(f).name, pa.string())
if schema.field(f).type == pa.large_string()
else schema.field(f)
for f in (col_names if col_names is not None else schema.names)
]
)
else:
assert tbl.schema == pa.schema(
[
schema.field(f)
for f in (col_names if col_names is not None else schema.names)
]
)

# TileDB ASCII -> Arrow large_string
_check_tbl(
sdf.read_all(ids=ids, column_names=col_names),
col_names,
ids,
demote=False,
)

_check_tbl(
sdf.read_all(column_names=col_names),
col_names,
None,
demote=False,
)

# TileDB ASCII -> Pandas string -> Arrow string (not large_string)
_check_tbl(
pa.Table.from_pandas(
pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
),
col_names,
ids,
demote=True,
)

_check_tbl(
pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
col_names,
None,
demote=True,
)
4 changes: 2 additions & 2 deletions apis/python/tests/test_soma_experiment_basic.py
@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)

@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:

var_arrow_schema = pa.schema(
[
("quux", pa.string()),
("quux", pa.large_string()),
("xyzzy", pa.float64()),
]
)
10 changes: 9 additions & 1 deletion apis/python/tests/test_soma_indexed_dataframe.py
@@ -21,6 +21,14 @@ def _schema():
def test_soma_indexed_dataframe(tmp_path, arrow_schema):
sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())

asch = pa.schema(
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.large_string()),
]
)

# Create
asch = arrow_schema()
sdf.create(schema=asch, index_column_names=["foo"])
@@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path):
("index", pa.uint64()),
("A", pa.int64()),
("B", pa.float64()),
("C", pa.string()),
("C", pa.large_string()),
]
)
index_column_names = ["index"]
2 changes: 1 addition & 1 deletion apis/python/tests/test_soma_metadata.py
@@ -33,7 +33,7 @@ def soma_object(request, tmp_path):

elif class_name == "SOMADataFrame":
so = soma.SOMADataFrame(uri=uri)
so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))

elif class_name == "SOMAIndexedDataFrame":
so = soma.SOMAIndexedDataFrame(uri=uri)
Expand Down