[c++] Fix display of Arrow schema for enum of bytes datatype (#2305) (

#2325) * [c++] Fix display of Arrow schema for enum of bytes * bugfix per @nguyenv * more of same * try 3 with help of @nguyenv * python unit-test case * extend the unit-test case a bit * implement @nguyenv advice * Partial reversal of schema format assignment * test --------- Co-authored-by: John Kerl <[email protected]> Co-authored-by: Dirk Eddelbuettel <[email protected]>
single-cell-data · Mar 26, 2024 · aa508fa · aa508fa
1 parent 4361a3c
commit aa508fa
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 6 deletions.
diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml
@@ -19,7 +19,10 @@ on:
 jobs:
   build:
     strategy:
-      fail-fast: true
+      # TEMP: PLEASE DO NOT COMMIT
+      # fail-fast: true
+      fail-fast: false
+
       matrix:
         os: [ubuntu-22.04, macos-12]
         python-version: ['3.8', '3.11']

diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
@@ -1371,3 +1371,60 @@ def test_enum_extend_past_numerical_limit(tmp_path):
 
 def test_write_str_empty_ned(tmp_path):
     tmp_path.as_posix()
+
+
+def test_enum_schema_report(tmp_path):
+    uri = tmp_path.as_posix()
+
+    pandas_df = pd.DataFrame(
+        {
+            "soma_joinid": pd.Series([0, 1, 2, 3, 4, 5], dtype=np.int64),
+            "int_cat": pd.Series([10, 20, 10, 20, 20, 20], dtype="category"),
+            "int": pd.Series([10, 20, 10, 20, 20, 20]),
+            "str_cat": pd.Series(["A", "B", "A", "B", "B", "B"], dtype="category"),
+            "str": pd.Series(["A", "B", "A", "B", "B", "B"]),
+            "byte_cat": pd.Series(
+                [b"A", b"B", b"A", b"B", b"B", b"B"], dtype="category"
+            ),
+            "byte": pd.Series([b"A", b"B", b"A", b"B", b"B", b"B"]),
+        },
+    )
+
+    arrow_schema = pa.Schema.from_pandas(pandas_df, preserve_index=False)
+
+    with soma.DataFrame.create(uri, schema=arrow_schema) as sdf:
+        arrow_table = pa.Table.from_pandas(pandas_df, preserve_index=False)
+        sdf.write(arrow_table)
+
+    # Double-check against TileDB-Py reporting
+    with tiledb.open(uri) as A:
+        for i in range(A.schema.nattr):
+            attr = A.schema.attr(i)
+            try:
+                index_type = attr.dtype
+                value_type = A.enum(attr.name).dtype
+            except tiledb.cc.TileDBError:
+                pass  # not an enum attr
+            if attr.name == "int_cat":
+                assert index_type.name == "int8"
+                assert value_type.name == "int64"
+            elif attr.name == "str_cat":
+                assert index_type.name == "int8"
+                assert value_type.name == "str32"
+            elif attr.name == "byte_cat":
+                assert index_type.name == "int8"
+                assert value_type.name == "bytes"
+
+    # Verify SOMA Arrow schema
+    with soma.open(uri) as sdf:
+        f = sdf.schema.field("int_cat")
+        assert f.type.index_type == pa.int8()
+        assert f.type.value_type == pa.int64()
+
+        f = sdf.schema.field("str_cat")
+        assert f.type.index_type == pa.int8()
+        assert f.type.value_type == pa.string()
+
+        f = sdf.schema.field("byte_cat")
+        assert f.type.index_type == pa.int8()
+        assert f.type.value_type == pa.binary()
diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc
@@ -145,8 +145,13 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
             auto enmr = ArrayExperimental::get_enumeration(
                 *ctx, *tiledb_array, attr.name());
             auto dict = new ArrowSchema;
-            dict->format = strdup(
-                ArrowAdapter::to_arrow_format(enmr.type(), false).data());
+            if (enmr.type() == TILEDB_STRING_ASCII or
+                enmr.type() == TILEDB_CHAR) {
+                dict->format = strdup("z");
+            } else {
+                dict->format = strdup(
+                    ArrowAdapter::to_arrow_format(enmr.type(), false).data());
+            }
             dict->name = strdup(enmr.name().c_str());
             dict->metadata = nullptr;
             dict->flags = 0;
@@ -240,6 +245,14 @@ std::pair<const void*, std::size_t> ArrowAdapter::_get_data_and_length(
     }
 }
 
+bool ArrowAdapter::_isstr(const char* format) {
+    if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
+        (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
+        return true;
+    }
+    return false;
+}
+
 std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>
 ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
     std::unique_ptr<ArrowSchema> schema = std::make_unique<ArrowSchema>();
@@ -329,7 +342,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
         dict_sch->release = &release_schema;
         dict_sch->private_data = nullptr;
 
-        const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2;
+        const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2;
         dict_arr->null_count = 0;
         dict_arr->offset = 0;
         dict_arr->n_buffers = n_buf;
@@ -353,7 +366,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
         // returns std::optional where std::nullopt indicates the
         // column does not contain enumerated values.
         if (enmr->type() == TILEDB_STRING_ASCII or
-            enmr->type() == TILEDB_STRING_UTF8) {
+            enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) {
             auto dict_vec = enmr->as_vector<std::string>();
             column->convert_enumeration();
             dict_arr->buffers[1] = column->enum_offsets().data();

diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h
@@ -37,6 +37,8 @@ class ArrowAdapter {
     static void release_schema(struct ArrowSchema* schema);
     static void release_array(struct ArrowArray* array);
 
+    static bool _isstr(const char* format);
+
     /**
      * @brief Convert ColumnBuffer to an Arrow array.
      *
@@ -72,4 +74,4 @@ class ArrowAdapter {
 };
 };  // namespace tiledbsoma
 
-#endif
+#endif