
Commit

[python] Enable Unicode non-indexed columns (#777)
* remove xfails from unicode tests

* remove unicode work-arounds for non-indexed columns

* lint

* remove incorrect code

* additional comments

* PR feedback
Bruce Martin authored Jan 22, 2023
1 parent f959d2f commit 096f716
Showing 3 changed files with 98 additions and 28 deletions.
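In short: SOMA DataFrame value (non-indexed) columns now accept and round-trip Unicode, while index columns (TileDB dimensions) remain ASCII-only for now. Below is a minimal sketch of the behavior this commit enables, using the same API calls exercised in test_unicode.py further down; the import alias and the on-disk path are illustrative assumptions, not part of this change.

import pyarrow as pa
import tiledbsoma as soma  # alias assumed; the tests below refer to the package as `soma`

# Illustrative data: the required soma_joinid index plus a Unicode value column.
tbl = pa.Table.from_pydict(
    {
        "soma_joinid": pa.array([0, 1], type=pa.int64()),
        "unicode": pa.array(["Ê", "Δ"], type=pa.large_string()),
    }
)

sdf = soma.DataFrame("/tmp/unicode-example")  # hypothetical path
sdf.create(tbl.schema, index_column_names=["soma_joinid"])
sdf.write(tbl)

# With this change the Unicode attribute is stored as UTF-8 rather than being
# coerced to ASCII, so it reads back unchanged (see test_dataframe_unicode_columns).
round_trip = sdf.read().concat()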
4 changes: 3 additions & 1 deletion apis/python/src/tiledbsoma/dataframe.py
@@ -88,7 +88,9 @@ def _create_empty(
dims = []
for index_column_name in index_column_names:
pa_type = schema.field(index_column_name).type
dtype = util_arrow.tiledb_type_from_arrow_type(pa_type)
dtype = util_arrow.tiledb_type_from_arrow_type(
pa_type, is_indexed_column=True
)
domain: Tuple[Any, Any]
if isinstance(dtype, str):
domain = None, None
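For context, the keyword added above selects the dimension-specific conversion rules defined in util_arrow.py (next file): string index columns still come back as the plain str "ascii", which is exactly the case the isinstance(dtype, str) branch handles by leaving the domain unbounded. A small illustrative call follows; the printed values are read off the mapping tables below rather than verified, and the import path is assumed from this package layout.

import pyarrow as pa
from tiledbsoma import util_arrow  # assumed import path

# String index columns map to the TileDB dimension type "ascii" (a plain str),
# which is why the domain above is left as (None, None).
dim_dtype = util_arrow.tiledb_type_from_arrow_type(pa.large_string(), is_indexed_column=True)
print(dim_dtype, isinstance(dim_dtype, str))  # ascii True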
53 changes: 39 additions & 14 deletions apis/python/src/tiledbsoma/util_arrow.py
@@ -21,17 +21,25 @@
We auto-promote Arrow's string and binary to large_string and large_binary,
respectively, as this is what TileDB stores -- a sequence of bytes preceded
by a 64-bit (not 32-bit) length int.
DataFrame-specific note: currently (as of 2.14), TileDB does not support
Unicode array dimensions. All Arrow string types used as DataFrame index
columns (i.e., TileDB dimensions) are coerced to ASCII. This requirement for
ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
fully supported in SOMA DataFrame non-indexed columns.
"""
ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = {
# Dict of types unsupported by to_pandas_dtype, which require overrides.
_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
# Dict of types unsupported by to_pandas_dtype, which require overrides for
# use in TileDB Attributes (aka DataFrame non-indexed columns).
#
# If the value is an instance of Exception, it will be raised.
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
#
pa.string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.string(): "U1",
pa.large_string(): "U1",
pa.binary(): "bytes",
pa.large_binary(): "bytes",
pa.timestamp("s"): "datetime64[s]",
pa.timestamp("ms"): "datetime64[ms]",
pa.timestamp("us"): "datetime64[us]",
@@ -43,33 +51,50 @@
pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
}

# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
# Any type system differences from the base-case Attr should be added here.
_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
_ARROW_TO_TDB_DIM.update(
{
pa.string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available.
pa.large_string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available.
}
)

def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:

def tiledb_type_from_arrow_type(
t: pa.DataType, is_indexed_column: bool = False
) -> npt.DTypeLike:
"""
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
Given an Arrow type, return the corresponding TileDB type as a NumPy dtype.
Building block for Arrow-to-TileDB schema translation.
TileDB currently has different Unicode handling for dimensions and attributes.
Set the ``is_indexed_column`` parameter to True for indexed-column (AKA dimension)
rules, which currently require all strings to be ASCII.
If the type is unsupported, a TypeError exception will be raised.
Parameters
----------
t : pyarrow.DataType
Arrow DataType instance, e.g., pyarrow.int8()
is_indexed_column : bool
Use TileDB dimension type conversion rules.
Returns
-------
numpy.dtype
The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will
be raised for unsupported types.
"""
if t in ARROW_TO_TDB:
arrow_type = ARROW_TO_TDB[t]
arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
if t in arrow_to_tdb:
arrow_type = arrow_to_tdb[t]
if isinstance(arrow_type, Exception):
raise arrow_type
if arrow_type == "ascii":
if arrow_type in ["ascii", "bytes"]:
return arrow_type
if arrow_type == "bytes":
return arrow_type # np.int8()
return np.dtype(arrow_type)

if not pa.types.is_primitive(t):
@@ -105,7 +130,7 @@ def get_arrow_type_from_tiledb_dtype(
return pa.large_string()
else:
return pa.large_binary()
elif tiledb_dtype == "ascii":
elif tiledb_dtype == "ascii" or tiledb_dtype == str:
return pa.large_string()
else:
return pa.from_numpy_dtype(tiledb_dtype)
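To make the two lookup tables and the new is_indexed_column switch concrete, here is a brief usage sketch (the dimension case was sketched after dataframe.py above). The expected values are read off the diff rather than verified against a build, and the import path is an assumption based on the package layout.

import numpy as np
import pyarrow as pa
from tiledbsoma import util_arrow  # assumed import path

# Attribute (non-indexed column) rules from _ARROW_TO_TDB_ATTR: Unicode strings
# now map to a NumPy Unicode dtype instead of being coerced to "ascii".
assert util_arrow.tiledb_type_from_arrow_type(pa.large_string()) == np.dtype("U1")
assert util_arrow.tiledb_type_from_arrow_type(pa.large_binary()) == "bytes"
assert util_arrow.tiledb_type_from_arrow_type(pa.timestamp("us")) == np.dtype("datetime64[us]")

# Table entries holding an Exception instance are raised rather than returned.
try:
    util_arrow.tiledb_type_from_arrow_type(pa.date64())
except TypeError as err:
    print(err)  # 64-bit date - unsupported type (use TimestampType)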
69 changes: 56 additions & 13 deletions apis/python/tests/test_unicode.py
@@ -22,7 +22,7 @@ def sample_arrow_table():
],
"ascii": ["aa", "bbb", "cccccc"],
"bytes": [b"aa", b"bbb", b"ccc"],
"float32": np.array([0.0, 1.1, 2.2], np.float32),
"float32": np.array([0.0, 1.0, 2.0], np.float32),
}
)

@@ -40,29 +40,72 @@ def sample_arrow_table():
return pa.Table.from_pandas(df, schema)


# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
#
@pytest.mark.xfail
def test_dataframe_unicode(tmp_path, sample_arrow_table):
"""Verify round-trip of unicode in DataFrame attributes"""
@pytest.fixture
def sample_soma_dataframe(tmp_path, sample_arrow_table):
sdf = soma.DataFrame(tmp_path.as_posix())
sdf.create(sample_arrow_table.schema)
sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
sdf.write(sample_arrow_table)
assert sdf.read().concat().equals(sample_arrow_table)
assert sdf.exists()
return sdf


# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
#
@pytest.mark.xfail
def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
def test_dataframe_unicode_columns(tmp_path, sample_arrow_table):
"""Verify round-trip of unicode in DataFrame value columns"""
sdf = soma.DataFrame(tmp_path.as_posix())
sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
sdf.write(sample_arrow_table)

assert sample_arrow_table.schema == sdf.schema
assert sdf.read().concat().equals(sample_arrow_table)


# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed
def test_dataframe_unicode_value_filter(sample_soma_dataframe):
"""Verify that value_filter works correctly"""

# filter on ascii
assert sample_soma_dataframe.read(
value_filter="ascii in ['aa', 'cccccc']"
).concat().to_pydict() == {
"soma_joinid": [0, 2],
"unicode": [
"\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
"クン キン おし.える よ.む くん.ずる",
],
"ascii": ["aa", "cccccc"],
"bytes": [b"aa", b"ccc"],
"float32": [0.0, 2.0],
}

# filter on unicode, equality
assert sample_soma_dataframe.read(
value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'"
).concat().to_pydict() == {
"soma_joinid": [0],
"unicode": [
"\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
],
"ascii": ["aa"],
"bytes": [b"aa"],
"float32": [0.0],
}

# filter on unicode, ordering
assert sample_soma_dataframe.read(
value_filter="unicode > 'a'"
).concat().to_pydict() == {
"soma_joinid": [1, 2],
"unicode": [
"a \N{GREEK CAPITAL LETTER DELTA} test",
"クン キン おし.える よ.む くん.ずる",
],
"ascii": ["bbb", "cccccc"],
"bytes": [b"bbb", b"ccc"],
"float32": [1.0, 2.0],
}


# TODO: Remove the `xfail` annotation when TileDB core supports Unicode
# dimensions, aka SOMA index columns.
#
@pytest.mark.xfail
def test_dataframe_unicode_index(tmp_path, sample_arrow_table):
