From 51d854d5e358890a2a6fc36d16f2f8038acc2587 Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 00:47:22 +0000
Subject: [PATCH 1/6] remove xfails from unicode tests

---
 apis/python/tests/test_unicode.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py
index 935316204b..4d253ed2ae 100644
--- a/apis/python/tests/test_unicode.py
+++ b/apis/python/tests/test_unicode.py
@@ -40,9 +40,6 @@ def sample_arrow_table():
     return pa.Table.from_pandas(df, schema)
 
 
-# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
-#
-@pytest.mark.xfail
 def test_dataframe_unicode(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame attributes"""
     sdf = soma.DataFrame(tmp_path.as_posix())
@@ -51,9 +48,6 @@ def test_dataframe_unicode(tmp_path, sample_arrow_table):
     assert sdf.read().concat().equals(sample_arrow_table)
 
 
-# TODO: Remove the `xfail` annotation when issue TileDB-SOMA#415 is fixed
-#
-@pytest.mark.xfail
 def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame value columns"""
     sdf = soma.DataFrame(tmp_path.as_posix())
@@ -62,9 +56,6 @@ def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
     assert sdf.read().concat().equals(sample_arrow_table)
 
 
-# TODO: Remove the `xfail` annotation when issues TileDB-SOMA#415 and TileDB-SOMA#418 are fixed
-#
-@pytest.mark.xfail
 def test_dataframe_unicode_index(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame index columns"""
     sdf = soma.DataFrame(tmp_path.as_posix())

From 53253cd6bc9287b58ca905de20ec5e668e577728 Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 18:40:12 +0000
Subject: [PATCH 2/6] remove unicode work-arounds for non-indexed columns

---
 apis/python/src/tiledbsoma/dataframe.py  |  4 +-
 apis/python/src/tiledbsoma/util_arrow.py | 54 ++++++++++++++------
 apis/python/tests/test_unicode.py        | 64 +++++++++++++++++++++---
 3 files changed, 101 insertions(+), 21 deletions(-)

diff --git a/apis/python/src/tiledbsoma/dataframe.py b/apis/python/src/tiledbsoma/dataframe.py
index 9089ab2f87..8ea8a47ee7 100644
--- a/apis/python/src/tiledbsoma/dataframe.py
+++ b/apis/python/src/tiledbsoma/dataframe.py
@@ -88,7 +88,9 @@ def _create_empty(
         dims = []
         for index_column_name in index_column_names:
             pa_type = schema.field(index_column_name).type
-            dtype = util_arrow.tiledb_type_from_arrow_type(pa_type)
+            dtype = util_arrow.tiledb_type_from_arrow_type(
+                pa_type, is_indexed_column=True
+            )
             domain: Tuple[Any, Any]
             if isinstance(dtype, str):
                 domain = None, None
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index 788c8b00bb..eaec21f28d 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -21,17 +21,25 @@
 We auto-promote Arrow's string and binary to large_string and large_binary,
 respectively, as this is what TileDB stores -- a sequence of bytes preceded
 by a 64-bit (not 32-bit) length int.
+
+DataFrame-specific note: currently (as of 2.14), TileDB does not support 
+Unicode array dimensions. All Arrow string types used in a DataFrame index
+columns (i.e., TileDB dimension) are coerced to ASCII. This equirement for 
+ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
+fully supported in SOMA DataFrame non-indexed columns.
 """
-ARROW_TO_TDB: Dict[Any, Union[str, TypeError]] = {
-    # Dict of types unsupported by to_pandas_dtype, which require overrides.
+_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
+    # Dict of types unsupported by to_pandas_dtype, which require overrides for
+    # use in TileDB Attributes (aka DataFrame non-indexed columns).
+    #
     # If the value is an instance of Exception, it will be raised.
     #
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
-    pa.string(): "ascii",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.large_string(): "ascii",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.binary(): "bytes",  # TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.large_binary(): "bytes",  # TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.string(): "U1",
+    pa.large_string(): "U1",
+    pa.binary(): "bytes",
+    pa.large_binary(): "bytes",
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
     pa.timestamp("us"): "datetime64[us]",
@@ -43,18 +51,37 @@
     pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
 }
 
-
-def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:
+# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions
+_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
+_ARROW_TO_TDB_DIM.update(
+    {
+        pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
+        pa.large_string(): "ascii",  # TODO: temporary work-around until UTF8 support is available.
+        pa.binary(): "bytes",  # TODO: temporary work-around until UTF8 support is available.
+        pa.large_binary(): "bytes",  # TODO: temporary work-around until UTF8 support is available.
+    }
+)
+
+
+def tiledb_type_from_arrow_type(
+    t: pa.DataType, is_indexed_column: bool = False
+) -> npt.DTypeLike:
     """
     Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
     Building block for Arrow-to-TileDB schema translation.
 
+    TileDB currently has different Unicode handling for dimensions and attributes.
+    Set the ``is_dimension`` parameter to True for dimension rules, which
+    currently requires all strings to be ASCII.
+
     If type is unsupported, with raise a TypeError exception.
 
     Parameters
     ----------
     t : pyarrow.DataType
         Arrow DataType instance, e.g., pyarrow.int8()
+    is_indexed_column : bool
+        Use TileDB dimension type conversion rules.
 
     Returns
     -------
@@ -62,14 +89,13 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> npt.DTypeLike:
         The numpy dtype corresponding to the ``t`` parameter. ``TypeError`` will
         be raised for unsupported types.
     """
-    if t in ARROW_TO_TDB:
-        arrow_type = ARROW_TO_TDB[t]
+    arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
+    if t in arrow_to_tdb:
+        arrow_type = arrow_to_tdb[t]
         if isinstance(arrow_type, Exception):
             raise arrow_type
-        if arrow_type == "ascii":
+        if arrow_type in ["ascii", "bytes"]:
             return arrow_type
-        if arrow_type == "bytes":
-            return arrow_type  # np.int8()
         return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
@@ -105,7 +131,7 @@ def get_arrow_type_from_tiledb_dtype(
             return pa.large_string()
         else:
             return pa.large_binary()
-    elif tiledb_dtype == "ascii":
+    elif tiledb_dtype == "ascii" or tiledb_dtype == str:
         return pa.large_string()
     else:
         return pa.from_numpy_dtype(tiledb_dtype)
diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py
index 4d253ed2ae..2036ec52fd 100644
--- a/apis/python/tests/test_unicode.py
+++ b/apis/python/tests/test_unicode.py
@@ -22,7 +22,7 @@ def sample_arrow_table():
             ],
             "ascii": ["aa", "bbb", "cccccc"],
             "bytes": [b"aa", b"bbb", b"ccc"],
-            "float32": np.array([0.0, 1.1, 2.2], np.float32),
+            "float32": np.array([0.0, 1.0, 2.0], np.float32),
         }
     )
 
@@ -40,22 +40,74 @@ def sample_arrow_table():
     return pa.Table.from_pandas(df, schema)
 
 
-def test_dataframe_unicode(tmp_path, sample_arrow_table):
-    """Verify round-trip of unicode in DataFrame attributes"""
+@pytest.fixture
+def sample_soma_dataframe(tmp_path, sample_arrow_table):
     sdf = soma.DataFrame(tmp_path.as_posix())
-    sdf.create(sample_arrow_table.schema)
+    sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
     sdf.write(sample_arrow_table)
-    assert sdf.read().concat().equals(sample_arrow_table)
+    assert sdf.exists()
+    return sdf
 
 
-def test_dataframe_unicode_attr(tmp_path, sample_arrow_table):
+def test_dataframe_unicode_columns(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame value columns"""
     sdf = soma.DataFrame(tmp_path.as_posix())
     sdf.create(sample_arrow_table.schema, index_column_names=["soma_joinid"])
     sdf.write(sample_arrow_table)
+
+    assert sample_arrow_table.schema == sdf.schema
     assert sdf.read().concat().equals(sample_arrow_table)
 
 
+def test_dataframe_unicode_value_filter(sample_soma_dataframe):
+    """Verify that value_filter works correctly"""
+
+    # filter on ascii
+    assert sample_soma_dataframe.read(
+        value_filter="ascii in ['aa', 'cccccc']"
+    ).concat().to_pydict() == {
+        "soma_joinid": [0, 2],
+        "unicode": [
+            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
+            "クン キン おし.える よ.む くん.ずる",
+        ],
+        "ascii": ["aa", "cccccc"],
+        "bytes": [b"aa", b"ccc"],
+        "float32": [0.0, 2.0],
+    }
+
+    # filter on unicode, equality
+    assert sample_soma_dataframe.read(
+        value_filter="unicode == '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'"
+    ).concat().to_pydict() == {
+        "soma_joinid": [0],
+        "unicode": [
+            "\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}",
+        ],
+        "ascii": ["aa"],
+        "bytes": [b"aa"],
+        "float32": [0.0],
+    }
+
+    # filter on unicode, ordering
+    assert sample_soma_dataframe.read(
+        value_filter="unicode > 'a'"
+    ).concat().to_pydict() == {
+        "soma_joinid": [1, 2],
+        "unicode": [
+            "a \N{GREEK CAPITAL LETTER DELTA} test",
+            "クン キン おし.える よ.む くん.ずる",
+        ],
+        "ascii": ["bbb", "cccccc"],
+        "bytes": [b"bbb", b"ccc"],
+        "float32": [1.0, 2.0],
+    }
+
+
+# TODO: Remove the `xfail` annotation when TileDB core supports Unicode
+# dimensions, aka SOMA index columns.
+#
+@pytest.mark.xfail
 def test_dataframe_unicode_index(tmp_path, sample_arrow_table):
     """Verify round-trip of unicode in DataFrame index columns"""
     sdf = soma.DataFrame(tmp_path.as_posix())

From 4830225c4eee665c25c6a04bd8b69cb368f9284d Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 18:45:25 +0000
Subject: [PATCH 3/6] lint

---
 apis/python/src/tiledbsoma/util_arrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index eaec21f28d..accefed3b9 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -22,9 +22,9 @@
 respectively, as this is what TileDB stores -- a sequence of bytes preceded
 by a 64-bit (not 32-bit) length int.
 
-DataFrame-specific note: currently (as of 2.14), TileDB does not support 
+DataFrame-specific note: currently (as of 2.14), TileDB does not support
 Unicode array dimensions. All Arrow string types used in a DataFrame index
-columns (i.e., TileDB dimension) are coerced to ASCII. This equirement for 
+columns (i.e., TileDB dimension) are coerced to ASCII. This requirement for
 ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
 fully supported in SOMA DataFrame non-indexed columns.
 """

From 9a72e495bcfa2f7fafa0ec2cc674070af19076f2 Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 18:54:12 +0000
Subject: [PATCH 4/6] remove incorrect code

---
 apis/python/src/tiledbsoma/util_arrow.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index accefed3b9..251331364f 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -57,8 +57,6 @@
     {
         pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
         pa.large_string(): "ascii",  # TODO: temporary work-around until UTF8 support is available.
-        pa.binary(): "bytes",  # TODO: temporary work-around until UTF8 support is available.
-        pa.large_binary(): "bytes",  # TODO: temporary work-around until UTF8 support is available.
     }
 )
 

From 2120c48d0593922f34176767b35e447ca61d0a09 Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 18:56:07 +0000
Subject: [PATCH 5/6] additional comments

---
 apis/python/src/tiledbsoma/util_arrow.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index 251331364f..d972b35f68 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -51,12 +51,13 @@
     pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
 }
 
-# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions
+# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
+# Any type system differences from the base-case Attr should be added here.
 _ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
 _ARROW_TO_TDB_DIM.update(
     {
         pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
-        pa.large_string(): "ascii",  # TODO: temporary work-around until UTF8 support is available.
+        pa.large_string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
     }
 )
 

From 793d97d275780a5ddb7cdddc738f1b50d44e9c57 Mon Sep 17 00:00:00 2001
From: bkmartinjr <bruce@chanzuckerberg.com>
Date: Fri, 20 Jan 2023 19:16:12 +0000
Subject: [PATCH 6/6] PR feedback

---
 apis/python/src/tiledbsoma/util_arrow.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index d972b35f68..f1bb47cc8c 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -66,12 +66,12 @@ def tiledb_type_from_arrow_type(
     t: pa.DataType, is_indexed_column: bool = False
 ) -> npt.DTypeLike:
     """
-    Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
+    Given an Arrow type, return the corresponding TileDB type as a NumPy dtype.
     Building block for Arrow-to-TileDB schema translation.
 
     TileDB currently has different Unicode handling for dimensions and attributes.
-    Set the ``is_dimension`` parameter to True for dimension rules, which
-    currently requires all strings to be ASCII.
+    Set the ``is_indexed_column`` parameter to True for indexed-column (AKA
+    dimension) rules, which currently require all strings to be ASCII.
 
     If type is unsupported, with raise a TypeError exception.