From 096f716c0cb7c65422d1b6c77c7de2f9b515c44c Mon Sep 17 00:00:00 2001
From: Bruce Martin <bruce@chanzuckerberg.com>
Date: Sun, 22 Jan 2023 13:24:25 -0800
Subject: [PATCH] [python] Enable Unicode non-indexed columns (#777)

* remove xfails from unicode tests

* remove unicode work-arounds for non-indexed columns

* lint

* remove incorrect code

* additional comments

* PR feedback
---
 apis/python/src/tiledbsoma/dataframe.py  |  4 +-
 apis/python/src/tiledbsoma/util_arrow.py | 53 +++++++++++++-----
 apis/python/tests/test_unicode.py        | 69 +++++++++++++++++++-----
 3 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/apis/python/src/tiledbsoma/dataframe.py b/apis/python/src/tiledbsoma/dataframe.py
index 9089ab2f87..8ea8a47ee7 100644
--- a/apis/python/src/tiledbsoma/dataframe.py
+++ b/apis/python/src/tiledbsoma/dataframe.py
@@ -88,7 +88,9 @@ def _create_empty(
         dims = []
         for index_column_name in index_column_names:
             pa_type = schema.field(index_column_name).type
-            dtype = util_arrow.tiledb_type_from_arrow_type(pa_type)
+            dtype = util_arrow.tiledb_type_from_arrow_type(
+                pa_type, is_indexed_column=True
+            )
             domain: Tuple[Any, Any]
             if isinstance(dtype, str):
                 domain = None, None
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index 788c8b00bb..f1bb47cc8c 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -21,17 +21,25 @@
 We auto-promote Arrow's string and binary to large_string and large_binary,
 respectively, as this is what TileDB stores -- a sequence of bytes preceded
 by a 64-bit (not 32-bit) length int.
+
+DataFrame-specific note: currently (as of 2.14), TileDB does not support
+Unicode array dimensions. All Arrow string types used in DataFrame index
+columns (i.e., TileDB dimensions) are coerced to ASCII. This requirement for
+ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
+fully supported in SOMA DataFrame non-indexed columns.
 """
-ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = {
-    # Dict of types unsupported by to_pandas_dtype, which require overrides.
+_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
+    # Dict of types unsupported by to_pandas_dtype, which require overrides for
+    # use in TileDB Attributes (aka DataFrame non-indexed columns).
+    #
     # If the value is an instance of Exception, it will be raised.
     #
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
-    pa.string(): "ascii",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.large_string(): "ascii",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.binary(): "bytes",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.large_binary(): "bytes",  # TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.string(): "U1",
+    pa.large_string(): "U1",
+    pa.binary(): "bytes",
+    pa.large_binary(): "bytes",
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
     pa.timestamp("us"): "datetime64[us]",
@@ -43,18 +51,36 @@
     pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
 }
 
+# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
+# Any type system differences from the base-case Attr should be added here.
+_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
+_ARROW_TO_TDB_DIM.update(
+    {
+        pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
+        pa.large_string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
+    }
+)
 
-def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:
+
+def tiledb_type_from_arrow_type(
+    t: pa.DataType, is_indexed_column: bool = False
+) -> npt.DTypeLike:
     """
-    Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
+    Given an Arrow type, return the corresponding TileDB type as a NumPy dtype.
     Building block for Arrow-to-TileDB schema translation.
 
+    TileDB currently has different Unicode handling for dimensions and attributes.
+    Set the ``is_indexed_column`` parameter to True for indexed-column (AKA dimension)
+    rules, which currently require all strings to be ASCII.
+
     If type is unsupported, with raise a TypeError exception.
 
     Parameters
     ----------
     t : pyarrow.DataType
         Arrow DataType instance, e.g., pyarrow.int8()
+    is_indexed_column : bool
+        Use TileDB dimension type conversion rules.
 
     Returns
     -------
@@ -62,14 +88,13 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:
         The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will
         be raised for unsupported types.
     """
-    if t in ARROW_TO_TDB:
-        arrow_type = ARROW_TO_TDB[t]
+    arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
+    if t in arrow_to_tdb:
+        arrow_type = arrow_to_tdb[t]
         if isinstance(arrow_type, Exception):
             raise arrow_type
-        if arrow_type == "ascii":
+        if arrow_type in ["ascii", "bytes"]:
             return arrow_type
-        if arrow_type == "bytes":
-            return arrow_type  # np.int8()
         return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
@@ -105,7 +130,7 @@ def get_arrow_type_from_tiledb_dtype(
             return pa.large_string()
         else:
             return pa.large_binary()
-    elif tiledb_dtype == "ascii":
+    elif tiledb_dtype == "ascii" or tiledb_dtype == str:
         return pa.large_string()
     else:
         return pa.from_numpy_dtype(tiledb_dtype)
diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py
index 935316204b..2036ec52fd 100644
--- a/apis/python/tests/test_unicode.py
+++ b/apis/python/tests/test_unicode.py
@@ -22,7 +22,7 @@ def sample_arrow_table():
             ],
             "ascii": ["aa", "bbb", "cccccc"],
             "bytes": [b"aa", b"bbb", b"ccc"],
-            "float32": np.array([0.0, 1.1, 2.2], np.float32),
+            "float32": np.array([0.0, 1.0, 2.0], np.float32),
         }
     )
 
@@ -40,29 +40,72 @@ def sample_arrow_table():
     return pa.Table.from_pandas(df, schema)
 
 
-# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
-#
-@pytest.mark.xfail
-def test_dataframe_unicode(tmp_path, sample_arrow_table):
-    """Verify round-trip of unicode in DataFrame attributes"""
+@pytest.fixture
+def sample_soma_dataframe(tmp_path, sample_arrow_table):
     sdf = soma.DataFrame(tmp_path.as_posix())
-    sdf.create(sample_arrow_table.schema)
+    sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
     sdf.write(sample_arrow_table)
-    assert sdf.read().concat().equals(sample_arrow_table)
+    assert sdf.exists()
+    return sdf
 
 
-# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
-#
-@pytest.mark.xfail
-def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
+def test_dataframe_unicode_columns(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame value columns"""
     sdf = soma.DataFrame(tmp_path.as_posix())
     sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
     sdf.write(sample_arrow_table)
+
+    assert sample_arrow_table.schema == sdf.schema
     assert sdf.read().concat().equals(sample_arrow_table)
 
 
-# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed
+def test_dataframe_unicode_value_filter(sample_soma_dataframe):
+    """Verify that value_filter works correctly"""
+
+    # filter on ascii
+    assert sample_soma_dataframe.read(
+        value_filter="ascii in ['aa', 'cccccc']"
+    ).concat().to_pydict() == {
+        "soma_joinid": [0, 2],
+        "unicode": [
+            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
+            "クン キン おし.える よ.む くん.ずる",
+        ],
+        "ascii": ["aa", "cccccc"],
+        "bytes": [b"aa", b"ccc"],
+        "float32": [0.0, 2.0],
+    }
+
+    # filter on unicode, equality
+    assert sample_soma_dataframe.read(
+        value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'"
+    ).concat().to_pydict() == {
+        "soma_joinid": [0],
+        "unicode": [
+            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
+        ],
+        "ascii": ["aa"],
+        "bytes": [b"aa"],
+        "float32": [0.0],
+    }
+
+    # filter on unicode, ordering
+    assert sample_soma_dataframe.read(
+        value_filter="unicode > 'a'"
+    ).concat().to_pydict() == {
+        "soma_joinid": [1, 2],
+        "unicode": [
+            "a \N{GREEK CAPITAL LETTER DELTA} test",
+            "クン キン おし.える よ.む くん.ずる",
+        ],
+        "ascii": ["bbb", "cccccc"],
+        "bytes": [b"bbb", b"ccc"],
+        "float32": [1.0, 2.0],
+    }
+
+
+# TODO: Remove the `xfail` annotation when TileDB core supports Unicode
+# dimensions, aka SOMA index columns.
 #
 @pytest.mark.xfail
 def test_dataframe_unicode_index(tmp_path, sample_arrow_table):