From 096f716c0cb7c65422d1b6c77c7de2f9b515c44c Mon Sep 17 00:00:00 2001 From: Bruce Martin Date: Sun, 22 Jan 2023 13:24:25 -0800 Subject: [PATCH] [python] Enable Unicode non-indexed columns (#777) * remove xfails from unicode tests * remove unicode work-arounds for non-indexed columns * lint * remove incorrect code * additional comments * PR feedback --- apis/python/src/tiledbsoma/dataframe.py | 4 +- apis/python/src/tiledbsoma/util_arrow.py | 53 +++++++++++++----- apis/python/tests/test_unicode.py | 69 +++++++++++++++++++----- 3 files changed, 98 insertions(+), 28 deletions(-) diff --git a/apis/python/src/tiledbsoma/dataframe.py b/apis/python/src/tiledbsoma/dataframe.py index 9089ab2f87..8ea8a47ee7 100644 --- a/apis/python/src/tiledbsoma/dataframe.py +++ b/apis/python/src/tiledbsoma/dataframe.py @@ -88,7 +88,9 @@ def _create_empty( dims = [] for index_column_name in index_column_names: pa_type = schema.field(index_column_name).type - dtype = util_arrow.tiledb_type_from_arrow_type(pa_type) + dtype = util_arrow.tiledb_type_from_arrow_type( + pa_type, is_indexed_column=True + ) domain: Tuple[Any, Any] if isinstance(dtype, str): domain = None, None diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index 788c8b00bb..f1bb47cc8c 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -21,17 +21,25 @@ We auto-promote Arrow's string and binary to large_string and large_binary, respectively, as this is what TileDB stores -- a sequence of bytes preceded by a 64-bit (not 32-bit) length int. + +DataFrame-specific note: currently (as of 2.14), TileDB does not support +Unicode array dimensions. All Arrow string types used in a DataFrame index +columns (i.e., TileDB dimension) are coerced to ASCII. This requirement for +ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is +fully supported in SOMA DataFrame non-indexed columns. 
""" -ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = { - # Dict of types unsupported by to_pandas_dtype, which require overrides. +_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = { + # Dict of types unsupported by to_pandas_dtype, which require overrides for + # use in TileDB Attributes (aka DataFrame non-indexed columns). + # # If the value is an instance of Exception, it will be raised. # # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table. # - pa.string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338. + pa.string(): "U1", + pa.large_string(): "U1", + pa.binary(): "bytes", + pa.large_binary(): "bytes", pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", pa.timestamp("us"): "datetime64[us]", @@ -43,18 +51,36 @@ pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"), } +# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions. +# Any type system differences from the base-case Attr should be added here. +_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy() +_ARROW_TO_TDB_DIM.update( + { + pa.string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. + pa.large_string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. + } +) -def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike: + +def tiledb_type_from_arrow_type( + t: pa.DataType, is_indexed_column: bool = False +) -> npt.DTypeLike: """ - Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. 
+ Given an Arrow type, return the corresponding TileDB type as a NumPy dtype. Building block for Arrow-to-TileDB schema translation. + TileDB currently has different Unicode handling for dimensions and attributes. + Set the ``is_indexed_column`` parameter to True for indexed-column (AKA dimension) + rules, which currently require all strings to be ASCII. + If type is unsupported, with raise a TypeError exception. Parameters ---------- t : pyarrow.DataType Arrow DataType instance, e.g., pyarrow.int8() + is_indexed_column : bool + Use TileDB dimension type conversion rules. Returns ------- @@ -62,14 +88,13 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike: The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will be raised for unsupported types. """ - if t in ARROW_TO_TDB: - arrow_type = ARROW_TO_TDB[t] + arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR + if t in arrow_to_tdb: + arrow_type = arrow_to_tdb[t] if isinstance(arrow_type, Exception): raise arrow_type - if arrow_type == "ascii": + if arrow_type in ["ascii", "bytes"]: return arrow_type - if arrow_type == "bytes": - return arrow_type # np.int8() return np.dtype(arrow_type) if not pa.types.is_primitive(t): @@ -105,7 +130,7 @@ def get_arrow_type_from_tiledb_dtype( return pa.large_string() else: return pa.large_binary() - elif tiledb_dtype == "ascii": + elif tiledb_dtype == "ascii" or tiledb_dtype == str: return pa.large_string() else: return pa.from_numpy_dtype(tiledb_dtype) diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py index 935316204b..2036ec52fd 100644 --- a/apis/python/tests/test_unicode.py +++ b/apis/python/tests/test_unicode.py @@ -22,7 +22,7 @@ def sample_arrow_table(): ], "ascii": ["aa", "bbb", "cccccc"], "bytes": [b"aa", b"bbb", b"ccc"], - "float32": np.array([0.0, 1.1, 2.2], np.float32), + "float32": np.array([0.0, 1.0, 2.0], np.float32), } ) @@ -40,29 +40,72 @@ def sample_arrow_table(): return 
pa.Table.from_pandas(df, schema) -# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed -# -@pytest.mark.xfail -def test_dataframe_unicode(tmp_path, sample_arrow_table): - """Verify round-trip of unicode in DataFrame attributes""" +@pytest.fixture +def sample_soma_dataframe(tmp_path, sample_arrow_table): sdf = soma.DataFrame(tmp_path.as_posix()) - sdf.create(sample_arrow_table.schema) + sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"]) sdf.write(sample_arrow_table) - assert sdf.read().concat().equals(sample_arrow_table) + assert sdf.exists() + return sdf -# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed -# -@pytest.mark.xfail -def test_dataframe_unicode_attr(tmp_path, sample_arrow_table): +def test_dataframe_unicode_columns(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame value columns""" sdf = soma.DataFrame(tmp_path.as_posix()) sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"]) sdf.write(sample_arrow_table) + + assert sample_arrow_table.schema == sdf.schema assert sdf.read().concat().equals(sample_arrow_table) -# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed +def test_dataframe_unicode_value_filter(sample_soma_dataframe): + """Verify that value_filter works correctly""" + + # filter on ascii + assert sample_soma_dataframe.read( + value_filter="ascii in ['aa', 'cccccc']" + ).concat().to_pydict() == { + "soma_joinid": [0, 2], + "unicode": [ + "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}", + "クン キン おし.える よ.む くん.ずる", + ], + "ascii": ["aa", "cccccc"], + "bytes": [b"aa", b"ccc"], + "float32": [0.0, 2.0], + } + + # filter on unicode, equality + assert sample_soma_dataframe.read( + value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'" + ).concat().to_pydict() == { + "soma_joinid": [0], + "unicode": [ + "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}", 
+ ], + "ascii": ["aa"], + "bytes": [b"aa"], + "float32": [0.0], + } + + # filter on unicode, ordering + assert sample_soma_dataframe.read( + value_filter="unicode > 'a'" + ).concat().to_pydict() == { + "soma_joinid": [1, 2], + "unicode": [ + "a \N{GREEK CAPITAL LETTER DELTA} test", + "クン キン おし.える よ.む くん.ずる", + ], + "ascii": ["bbb", "cccccc"], + "bytes": [b"bbb", b"ccc"], + "float32": [1.0, 2.0], + } + + +# TODO: Remove the `xfail` annotation when TileDB core supports Unicode +# dimensions, aka SOMA index columns. # @pytest.mark.xfail def test_dataframe_unicode_index(tmp_path, sample_arrow_table):