Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.8] [c++] Fix display of Arrow schema for enum of bytes datatype #2325

Merged
merged 1 commit into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/python-ci-minimal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ on:
jobs:
build:
strategy:
fail-fast: true
# TEMP: PLEASE DO NOT COMMIT
# fail-fast: true
fail-fast: false

matrix:
os: [ubuntu-22.04, macos-12]
python-version: ['3.8', '3.11']
Expand Down
57 changes: 57 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,3 +1371,60 @@ def test_enum_extend_past_numerical_limit(tmp_path):

def test_write_str_empty_ned(tmp_path):
tmp_path.as_posix()


def test_enum_schema_report(tmp_path):
uri = tmp_path.as_posix()

pandas_df = pd.DataFrame(
{
"soma_joinid": pd.Series([0, 1, 2, 3, 4, 5], dtype=np.int64),
"int_cat": pd.Series([10, 20, 10, 20, 20, 20], dtype="category"),
"int": pd.Series([10, 20, 10, 20, 20, 20]),
"str_cat": pd.Series(["A", "B", "A", "B", "B", "B"], dtype="category"),
"str": pd.Series(["A", "B", "A", "B", "B", "B"]),
"byte_cat": pd.Series(
[b"A", b"B", b"A", b"B", b"B", b"B"], dtype="category"
),
"byte": pd.Series([b"A", b"B", b"A", b"B", b"B", b"B"]),
},
)

arrow_schema = pa.Schema.from_pandas(pandas_df, preserve_index=False)

with soma.DataFrame.create(uri, schema=arrow_schema) as sdf:
arrow_table = pa.Table.from_pandas(pandas_df, preserve_index=False)
sdf.write(arrow_table)

# Double-check against TileDB-Py reporting
with tiledb.open(uri) as A:
for i in range(A.schema.nattr):
attr = A.schema.attr(i)
try:
index_type = attr.dtype
value_type = A.enum(attr.name).dtype
except tiledb.cc.TileDBError:
pass # not an enum attr
if attr.name == "int_cat":
assert index_type.name == "int8"
assert value_type.name == "int64"
elif attr.name == "str_cat":
assert index_type.name == "int8"
assert value_type.name == "str32"
elif attr.name == "byte_cat":
assert index_type.name == "int8"
assert value_type.name == "bytes"

# Verify SOMA Arrow schema
with soma.open(uri) as sdf:
f = sdf.schema.field("int_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.int64()

f = sdf.schema.field("str_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.string()

f = sdf.schema.field("byte_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.binary()
21 changes: 17 additions & 4 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,13 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
auto enmr = ArrayExperimental::get_enumeration(
*ctx, *tiledb_array, attr.name());
auto dict = new ArrowSchema;
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());
if (enmr.type() == TILEDB_STRING_ASCII or
enmr.type() == TILEDB_CHAR) {
dict->format = strdup("z");
} else {
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());
}
dict->name = strdup(enmr.name().c_str());
dict->metadata = nullptr;
dict->flags = 0;
Expand Down Expand Up @@ -240,6 +245,14 @@ std::pair<const void*, std::size_t> ArrowAdapter::_get_data_and_length(
}
}

bool ArrowAdapter::_isstr(const char* format) {
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;
}
return false;
}

std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>
ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
std::unique_ptr<ArrowSchema> schema = std::make_unique<ArrowSchema>();
Expand Down Expand Up @@ -329,7 +342,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
dict_sch->release = &release_schema;
dict_sch->private_data = nullptr;

const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2;
const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2;
dict_arr->null_count = 0;
dict_arr->offset = 0;
dict_arr->n_buffers = n_buf;
Expand All @@ -353,7 +366,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
// returns std::optional where std::nullopt indicates the
// column does not contain enumerated values.
if (enmr->type() == TILEDB_STRING_ASCII or
enmr->type() == TILEDB_STRING_UTF8) {
enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) {
auto dict_vec = enmr->as_vector<std::string>();
column->convert_enumeration();
dict_arr->buffers[1] = column->enum_offsets().data();
Expand Down
4 changes: 3 additions & 1 deletion libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class ArrowAdapter {
static void release_schema(struct ArrowSchema* schema);
static void release_array(struct ArrowArray* array);

static bool _isstr(const char* format);

/**
* @brief Convert ColumnBuffer to an Arrow array.
*
Expand Down Expand Up @@ -72,4 +74,4 @@ class ArrowAdapter {
};
}; // namespace tiledbsoma

#endif
#endif
Loading