Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Enable Unicode non-indexed columns #777

Merged
merged 10 commits into from
Jan 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apis/python/src/tiledbsoma/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def _create_empty(
dims = []
for index_column_name in index_column_names:
pa_type = schema.field(index_column_name).type
dtype = util_arrow.tiledb_type_from_arrow_type(pa_type)
dtype = util_arrow.tiledb_type_from_arrow_type(
pa_type, is_indexed_column=True
)
domain: Tuple[Any, Any]
if isinstance(dtype, str):
domain = None, None
Expand Down
53 changes: 39 additions & 14 deletions apis/python/src/tiledbsoma/util_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,25 @@
We auto-promote Arrow's string and binary to large_string and large_binary,
respectively, as this is what TileDB stores -- a sequence of bytes preceded
by a 64-bit (not 32-bit) length int.

DataFrame-specific note: currently (as of 2.14), TileDB does not support
Unicode array dimensions. All Arrow string types used in a DataFrame index
columns (i.e., TileDB dimension) are coerced to ASCII. This requirement for
ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
fully supported in SOMA DataFrame non-indexed columns.
"""
ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = {
# Dict of types unsupported by to_pandas_dtype, which require overrides.
_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
# Dict of types unsupported by to_pandas_dtype, which require overrides for
# use in TileDB Attributes (aka DataFrame non-indexed columns).
#
# If the value is an instance of Exception, it will be raised.
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
#
pa.string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338.
pa.string(): "U1",
pa.large_string(): "U1",
pa.binary(): "bytes",
pa.large_binary(): "bytes",
pa.timestamp("s"): "datetime64[s]",
pa.timestamp("ms"): "datetime64[ms]",
pa.timestamp("us"): "datetime64[us]",
Expand All @@ -43,33 +51,50 @@
pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
}

# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
# Any type system differences from the base-case Attr should be added here.
_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
_ARROW_TO_TDB_DIM.update(
    {
        # Dimensions diverge from attributes only for strings: TileDB does not
        # yet support Unicode dimensions, so string index columns fall back to
        # ASCII rather than the attribute table's Unicode ("U1") mapping.
        pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
        pa.large_string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
    }
)

def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:

def tiledb_type_from_arrow_type(
t: pa.DataType, is_indexed_column: bool = False
) -> npt.DTypeLike:
"""
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
Given an Arrow type, return the corresponding TileDB type as a NumPy dtype.
Building block for Arrow-to-TileDB schema translation.

TileDB currently has different Unicode handling for dimensions and attributes.
Set the ``is_indexed_column`` parameter to True for indexed-column (AKA dimension)
rules, which currently require all strings to be ASCII.

If the type is unsupported, a TypeError exception will be raised.

Parameters
----------
t : pyarrow.DataType
Arrow DataType instance, e.g., pyarrow.int8()
is_indexed_column : bool
Use TileDB dimension type conversion rules.

Returns
-------
numpy.dtype
The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will
be raised for unsupported types.
"""
if t in ARROW_TO_TDB:
arrow_type = ARROW_TO_TDB[t]
arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
if t in arrow_to_tdb:
arrow_type = arrow_to_tdb[t]
if isinstance(arrow_type, Exception):
raise arrow_type
if arrow_type == "ascii":
if arrow_type in ["ascii", "bytes"]:
return arrow_type
if arrow_type == "bytes":
return arrow_type # np.int8()
return np.dtype(arrow_type)

if not pa.types.is_primitive(t):
Expand Down Expand Up @@ -105,7 +130,7 @@ def get_arrow_type_from_tiledb_dtype(
return pa.large_string()
else:
return pa.large_binary()
elif tiledb_dtype == "ascii":
elif tiledb_dtype == "ascii" or tiledb_dtype == str:
return pa.large_string()
else:
return pa.from_numpy_dtype(tiledb_dtype)
Expand Down
69 changes: 56 additions & 13 deletions apis/python/tests/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def sample_arrow_table():
],
"ascii": ["aa", "bbb", "cccccc"],
"bytes": [b"aa", b"bbb", b"ccc"],
"float32": np.array([0.0, 1.1, 2.2], np.float32),
"float32": np.array([0.0, 1.0, 2.0], np.float32),
}
)

Expand All @@ -40,29 +40,72 @@ def sample_arrow_table():
return pa.Table.from_pandas(df, schema)


# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
#
@pytest.mark.xfail
def test_dataframe_unicode(tmp_path, sample_arrow_table):
"""Verify round-trip of unicode in DataFrame attributes"""
@pytest.fixture
def sample_soma_dataframe(tmp_path, sample_arrow_table):
sdf = soma.DataFrame(tmp_path.as_posix())
sdf.create(sample_arrow_table.schema)
sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
sdf.write(sample_arrow_table)
assert sdf.read().concat().equals(sample_arrow_table)
assert sdf.exists()
return sdf


# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
#
@pytest.mark.xfail
def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
def test_dataframe_unicode_columns(tmp_path, sample_arrow_table):
    """Verify round-trip of unicode in DataFrame value columns"""
    # Index on soma_joinid (non-string) so every string column is stored as a
    # non-indexed column (TileDB attribute), where Unicode is supported.
    sdf = soma.DataFrame(tmp_path.as_posix())
    sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
    sdf.write(sample_arrow_table)

    # Both the schema and the data must survive the write/read round trip
    # unchanged, including the Unicode string values.
    assert sample_arrow_table.schema == sdf.schema
    assert sdf.read().concat().equals(sample_arrow_table)


# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed
def test_dataframe_unicode_value_filter(sample_soma_dataframe):
    """Verify that value_filter works correctly

    Exercises query-condition filtering on ASCII and Unicode non-indexed
    columns: membership, equality, and ordered comparison.
    """

    # filter on ascii
    assert sample_soma_dataframe.read(
        value_filter="ascii in ['aa', 'cccccc']"
    ).concat().to_pydict() == {
        "soma_joinid": [0, 2],
        "unicode": [
            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
            "クン キン おし.える よ.む くん.ずる",
        ],
        "ascii": ["aa", "cccccc"],
        "bytes": [b"aa", b"ccc"],
        "float32": [0.0, 2.0],
    }

    # filter on unicode, equality
    assert sample_soma_dataframe.read(
        value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'"
    ).concat().to_pydict() == {
        "soma_joinid": [0],
        "unicode": [
            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
        ],
        "ascii": ["aa"],
        "bytes": [b"aa"],
        "float32": [0.0],
    }

    # filter on unicode, ordering
    # NOTE(review): 'a' < all three sample unicode values except the row-0
    # combining-accent string — presumably ordering here follows TileDB's
    # string comparison; confirm it matches expected collation semantics.
    assert sample_soma_dataframe.read(
        value_filter="unicode > 'a'"
    ).concat().to_pydict() == {
        "soma_joinid": [1, 2],
        "unicode": [
            "a \N{GREEK CAPITAL LETTER DELTA} test",
            "クン キン おし.える よ.む くん.ずる",
        ],
        "ascii": ["bbb", "cccccc"],
        "bytes": [b"bbb", b"ccc"],
        "float32": [1.0, 2.0],
    }


# TODO: Remove the `xfail` annotation when TileDB core supports Unicode
# dimensions, aka SOMA index columns.
#
@pytest.mark.xfail
def test_dataframe_unicode_index(tmp_path, sample_arrow_table):
Expand Down