Skip to content

Commit

Permalink
[c++] Fix display of Arrow schema for enum of bytes datatype (#2305)
Browse files Browse the repository at this point in the history
* [c++] Fix display of Arrow schema for enum of bytes

* bugfix per @nguyenv

* more of same

* try 3 with help of @nguyenv

* python unit-test case

* extend the unit-test case a bit

* implement @nguyenv advice

* Partial reversal of schema format assignment

* test

---------

Co-authored-by: Dirk Eddelbuettel <[email protected]>
  • Loading branch information
2 people authored and github-actions[bot] committed Mar 26, 2024
1 parent 4361a3c commit 6287fd9
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 6 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/python-ci-minimal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ on:
jobs:
build:
strategy:
fail-fast: true
# TEMP: PLEASE DO NOT COMMIT
# fail-fast: true
fail-fast: false

matrix:
os: [ubuntu-22.04, macos-12]
python-version: ['3.8', '3.11']
Expand Down
57 changes: 57 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,3 +1371,60 @@ def test_enum_extend_past_numerical_limit(tmp_path):

def test_write_str_empty_ned(tmp_path):
tmp_path.as_posix()


def test_enum_schema_report(tmp_path):
uri = tmp_path.as_posix()

pandas_df = pd.DataFrame(
{
"soma_joinid": pd.Series([0, 1, 2, 3, 4, 5], dtype=np.int64),
"int_cat": pd.Series([10, 20, 10, 20, 20, 20], dtype="category"),
"int": pd.Series([10, 20, 10, 20, 20, 20]),
"str_cat": pd.Series(["A", "B", "A", "B", "B", "B"], dtype="category"),
"str": pd.Series(["A", "B", "A", "B", "B", "B"]),
"byte_cat": pd.Series(
[b"A", b"B", b"A", b"B", b"B", b"B"], dtype="category"
),
"byte": pd.Series([b"A", b"B", b"A", b"B", b"B", b"B"]),
},
)

arrow_schema = pa.Schema.from_pandas(pandas_df, preserve_index=False)

with soma.DataFrame.create(uri, schema=arrow_schema) as sdf:
arrow_table = pa.Table.from_pandas(pandas_df, preserve_index=False)
sdf.write(arrow_table)

# Double-check against TileDB-Py reporting
with tiledb.open(uri) as A:
for i in range(A.schema.nattr):
attr = A.schema.attr(i)
try:
index_type = attr.dtype
value_type = A.enum(attr.name).dtype
except tiledb.cc.TileDBError:
pass # not an enum attr
if attr.name == "int_cat":
assert index_type.name == "int8"
assert value_type.name == "int64"
elif attr.name == "str_cat":
assert index_type.name == "int8"
assert value_type.name == "str32"
elif attr.name == "byte_cat":
assert index_type.name == "int8"
assert value_type.name == "bytes"

# Verify SOMA Arrow schema
with soma.open(uri) as sdf:
f = sdf.schema.field("int_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.int64()

f = sdf.schema.field("str_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.string()

f = sdf.schema.field("byte_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.binary()
21 changes: 17 additions & 4 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,13 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
auto enmr = ArrayExperimental::get_enumeration(
*ctx, *tiledb_array, attr.name());
auto dict = new ArrowSchema;
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());
if (enmr.type() == TILEDB_STRING_ASCII or
enmr.type() == TILEDB_CHAR) {
dict->format = strdup("z");
} else {
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());
}
dict->name = strdup(enmr.name().c_str());
dict->metadata = nullptr;
dict->flags = 0;
Expand Down Expand Up @@ -240,6 +245,14 @@ std::pair<const void*, std::size_t> ArrowAdapter::_get_data_and_length(
}
}

bool ArrowAdapter::_isstr(const char* format) {
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;
}
return false;
}

std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>
ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
std::unique_ptr<ArrowSchema> schema = std::make_unique<ArrowSchema>();
Expand Down Expand Up @@ -329,7 +342,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
dict_sch->release = &release_schema;
dict_sch->private_data = nullptr;

const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2;
const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2;
dict_arr->null_count = 0;
dict_arr->offset = 0;
dict_arr->n_buffers = n_buf;
Expand All @@ -353,7 +366,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
// returns std::optional where std::nullopt indicates the
// column does not contain enumerated values.
if (enmr->type() == TILEDB_STRING_ASCII or
enmr->type() == TILEDB_STRING_UTF8) {
enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) {
auto dict_vec = enmr->as_vector<std::string>();
column->convert_enumeration();
dict_arr->buffers[1] = column->enum_offsets().data();
Expand Down
4 changes: 3 additions & 1 deletion libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class ArrowAdapter {
static void release_schema(struct ArrowSchema* schema);
static void release_array(struct ArrowArray* array);

static bool _isstr(const char* format);

/**
* @brief Convert ColumnBuffer to an Arrow array.
*
Expand Down Expand Up @@ -72,4 +74,4 @@ class ArrowAdapter {
};
}; // namespace tiledbsoma

#endif
#endif

0 comments on commit 6287fd9

Please sign in to comment.