From 51d854d5e358890a2a6fc36d16f2f8038acc2587 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 00:47:22 +0000 Subject: [PATCH 1/6] remove xfails from unicode tests --- apis/python/tests/test_unicode.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py index 935316204b..4d253ed2ae 100644 --- a/apis/python/tests/test_unicode.py +++ b/apis/python/tests/test_unicode.py @@ -40,9 +40,6 @@ def sample_arrow_table(): return pa.Table.from_pandas(df, schema) -# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed -# -@pytest.mark.xfail def test_dataframe_unicode(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame attributes""" sdf = soma.DataFrame(tmp_path.as_posix()) @@ -51,9 +48,6 @@ def test_dataframe_unicode(tmp_path, sample_arrow_table): assert sdf.read().concat().equals(sample_arrow_table) -# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed -# -@pytest.mark.xfail def test_dataframe_unicode_attr(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame value columns""" sdf = soma.DataFrame(tmp_path.as_posix()) @@ -62,9 +56,6 @@ def test_dataframe_unicode_attr(tmp_path, sample_arrow_table): assert sdf.read().concat().equals(sample_arrow_table) -# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed -# -@pytest.mark.xfail def test_dataframe_unicode_index(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame index columns""" sdf = soma.DataFrame(tmp_path.as_posix()) From 53253cd6bc9287b58ca905de20ec5e668e577728 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 18:40:12 +0000 Subject: [PATCH 2/6] remove unicode work-arounds for non-indexed columns --- apis/python/src/tiledbsoma/dataframe.py | 4 +- apis/python/src/tiledbsoma/util_arrow.py | 54 ++++++++++++++------ apis/python/tests/test_unicode.py | 64 
+++++++++++++++++++++--- 3 files changed, 101 insertions(+), 21 deletions(-) diff --git a/apis/python/src/tiledbsoma/dataframe.py b/apis/python/src/tiledbsoma/dataframe.py index 9089ab2f87..8ea8a47ee7 100644 --- a/apis/python/src/tiledbsoma/dataframe.py +++ b/apis/python/src/tiledbsoma/dataframe.py @@ -88,7 +88,9 @@ def _create_empty( dims = [] for index_column_name in index_column_names: pa_type = schema.field(index_column_name).type - dtype = util_arrow.tiledb_type_from_arrow_type(pa_type) + dtype = util_arrow.tiledb_type_from_arrow_type( + pa_type, is_indexed_column=True + ) domain: Tuple[Any, Any] if isinstance(dtype, str): domain = None, None diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index 788c8b00bb..eaec21f28d 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -21,17 +21,25 @@ We auto-promote Arrow's string and binary to large_string and large_binary, respectively, as this is what TileDB stores -- a sequence of bytes preceded by a 64-bit (not 32-bit) length int. + +DataFrame-specific note: currently (as of 2.14), TileDB does not support +Unicode array dimensions. All Arrow string types used in a DataFrame index +columns (i.e., TileDB dimension) are coerced to ASCII. This equirement for +ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is +fully supported in SOMA DataFrame non-indexed columns. """ -ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = { - # Dict of types unsupported by to_pandas_dtype, which require overrides. +_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = { + # Dict of types unsupported by to_pandas_dtype, which require overrides for + # use in TileDB Attributes (aka DataFrame non-indexed columns). + # # If the value is an instance of Exception, it will be raised. # # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
# - pa.string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338. - pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is native. GH #338. + pa.string(): "U1", + pa.large_string(): "U1", + pa.binary(): "bytes", + pa.large_binary(): "bytes", pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", pa.timestamp("us"): "datetime64[us]", @@ -43,18 +51,37 @@ pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"), } - -def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike: +# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions +_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy() +_ARROW_TO_TDB_DIM.update( + { + pa.string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. + pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is available. + pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is available. + pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is available. + } +) + + +def tiledb_type_from_arrow_type( + t: pa.DataType, is_indexed_column: bool = False +) -> npt.DTypeLike: """ Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. Building block for Arrow-to-TileDB schema translation. + TileDB currently has different Unicode handling for dimensions and attributes. + Set the ``is_dimension`` parameter to True for dimension rules, which + currently requires all strings to be ASCII. + If type is unsupported, with raise a TypeError exception. 
Parameters ---------- t : pyarrow.DataType Arrow DataType instance, e.g., pyarrow.int8() + is_indexed_column : bool + Use TileDB dimension type conversion rules. Returns ------- @@ -62,14 +89,13 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike: The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will be raised for unsupported types. """ - if t in ARROW_TO_TDB: - arrow_type = ARROW_TO_TDB[t] + arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR + if t in arrow_to_tdb: + arrow_type = arrow_to_tdb[t] if isinstance(arrow_type, Exception): raise arrow_type - if arrow_type == "ascii": + if arrow_type in ["ascii", "bytes"]: return arrow_type - if arrow_type == "bytes": - return arrow_type # np.int8() return np.dtype(arrow_type) if not pa.types.is_primitive(t): @@ -105,7 +131,7 @@ def get_arrow_type_from_tiledb_dtype( return pa.large_string() else: return pa.large_binary() - elif tiledb_dtype == "ascii": + elif tiledb_dtype == "ascii" or tiledb_dtype == str: return pa.large_string() else: return pa.from_numpy_dtype(tiledb_dtype) diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py index 4d253ed2ae..2036ec52fd 100644 --- a/apis/python/tests/test_unicode.py +++ b/apis/python/tests/test_unicode.py @@ -22,7 +22,7 @@ def sample_arrow_table(): ], "ascii": ["aa", "bbb", "cccccc"], "bytes": [b"aa", b"bbb", b"ccc"], - "float32": np.array([0.0, 1.1, 2.2], np.float32), + "float32": np.array([0.0, 1.0, 2.0], np.float32), } ) @@ -40,22 +40,74 @@ def sample_arrow_table(): return pa.Table.from_pandas(df, schema) -def test_dataframe_unicode(tmp_path, sample_arrow_table): - """Verify round-trip of unicode in DataFrame attributes""" +@pytest.fixture +def sample_soma_dataframe(tmp_path, sample_arrow_table): sdf = soma.DataFrame(tmp_path.as_posix()) - sdf.create(sample_arrow_table.schema) + sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"]) sdf.write(sample_arrow_table) - assert 
sdf.read().concat().equals(sample_arrow_table) + assert sdf.exists() + return sdf -def test_dataframe_unicode_attr(tmp_path, sample_arrow_table): +def test_dataframe_unicode_columns(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame value columns""" sdf = soma.DataFrame(tmp_path.as_posix()) sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"]) sdf.write(sample_arrow_table) + + assert sample_arrow_table.schema == sdf.schema assert sdf.read().concat().equals(sample_arrow_table) +def test_dataframe_unicode_value_filter(sample_soma_dataframe): + """Verify that value_filter works correctly""" + + # filter on ascii + assert sample_soma_dataframe.read( + value_filter="ascii in ['aa', 'cccccc']" + ).concat().to_pydict() == { + "soma_joinid": [0, 2], + "unicode": [ + "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}", + "クン キン おし.える よ.む くん.ずる", + ], + "ascii": ["aa", "cccccc"], + "bytes": [b"aa", b"ccc"], + "float32": [0.0, 2.0], + } + + # filter on unicode, equality + assert sample_soma_dataframe.read( + value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'" + ).concat().to_pydict() == { + "soma_joinid": [0], + "unicode": [ + "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}", + ], + "ascii": ["aa"], + "bytes": [b"aa"], + "float32": [0.0], + } + + # filter on unicode, ordering + assert sample_soma_dataframe.read( + value_filter="unicode > 'a'" + ).concat().to_pydict() == { + "soma_joinid": [1, 2], + "unicode": [ + "a \N{GREEK CAPITAL LETTER DELTA} test", + "クン キン おし.える よ.む くん.ずる", + ], + "ascii": ["bbb", "cccccc"], + "bytes": [b"bbb", b"ccc"], + "float32": [1.0, 2.0], + } + + +# TODO: Remove the `xfail` annotation when TileDB core supports Unicode +# dimensions, aka SOMA index columns. 
+# +@pytest.mark.xfail def test_dataframe_unicode_index(tmp_path, sample_arrow_table): """Verify round-trip of unicode in DataFrame index columns""" sdf = soma.DataFrame(tmp_path.as_posix()) From 4830225c4eee665c25c6a04bd8b69cb368f9284d Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 18:45:25 +0000 Subject: [PATCH 3/6] lint --- apis/python/src/tiledbsoma/util_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index eaec21f28d..accefed3b9 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -22,9 +22,9 @@ respectively, as this is what TileDB stores -- a sequence of bytes preceded by a 64-bit (not 32-bit) length int. -DataFrame-specific note: currently (as of 2.14), TileDB does not support +DataFrame-specific note: currently (as of 2.14), TileDB does not support Unicode array dimensions. All Arrow string types used in a DataFrame index -columns (i.e., TileDB dimension) are coerced to ASCII. This equirement for +columns (i.e., TileDB dimension) are coerced to ASCII. This requirement for ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is fully supported in SOMA DataFrame non-indexed columns. """ From 9a72e495bcfa2f7fafa0ec2cc674070af19076f2 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 18:54:12 +0000 Subject: [PATCH 4/6] remove incorrect code --- apis/python/src/tiledbsoma/util_arrow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index accefed3b9..251331364f 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -57,8 +57,6 @@ { pa.string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is available.
- pa.binary(): "bytes", # TODO: temporary work-around until UTF8 support is available. - pa.large_binary(): "bytes", # TODO: temporary work-around until UTF8 support is available. } ) From 2120c48d0593922f34176767b35e447ca61d0a09 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 18:56:07 +0000 Subject: [PATCH 5/6] additional comments --- apis/python/src/tiledbsoma/util_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index 251331364f..d972b35f68 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -51,12 +51,13 @@ pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"), } -# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions +# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions. +# Any type system differences from the base-case Attr should be added here. _ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy() _ARROW_TO_TDB_DIM.update( { pa.string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. - pa.large_string(): "ascii", # TODO: temporary work-around until UTF8 support is available. + pa.large_string(): "ascii", # TODO: temporary work-around until Dimension UTF8 support is available. 
} ) From 793d97d275780a5ddb7cdddc738f1b50d44e9c57 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 20 Jan 2023 19:16:12 +0000 Subject: [PATCH 6/6] PR feedback --- apis/python/src/tiledbsoma/util_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index d972b35f68..f1bb47cc8c 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -66,12 +66,12 @@ def tiledb_type_from_arrow_type( t: pa.DataType, is_indexed_column: bool = False ) -> npt.DTypeLike: """ - Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. + Given an Arrow type, return the corresponding TileDB type as a NumPy dtype. Building block for Arrow-to-TileDB schema translation. TileDB currently has different Unicode handling for dimensions and attributes. - Set the ``is_dimension`` parameter to True for dimension rules, which - currently requires all strings to be ASCII. + Set the ``is_indexed_column`` parameter to True for indexed-column (AKA dimension) + rules, which currently require all strings to be ASCII. If type is unsupported, with raise a TypeError exception.