Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Fix display of Arrow schema for enum of bytes datatype #2305

Merged
merged 10 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/python-ci-minimal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ on:
jobs:
build:
strategy:
fail-fast: true
# TEMP: PLEASE DO NOT COMMIT
# fail-fast: true
fail-fast: false

matrix:
os: [ubuntu-22.04, macos-12]
python-version: ['3.8', '3.11']
Expand Down
57 changes: 57 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,3 +1371,60 @@ def test_enum_extend_past_numerical_limit(tmp_path):

def test_write_str_empty_ned(tmp_path):
tmp_path.as_posix()


def test_enum_schema_report(tmp_path):
uri = tmp_path.as_posix()

pandas_df = pd.DataFrame(
{
"soma_joinid": pd.Series([0, 1, 2, 3, 4, 5], dtype=np.int64),
"int_cat": pd.Series([10, 20, 10, 20, 20, 20], dtype="category"),
"int": pd.Series([10, 20, 10, 20, 20, 20]),
"str_cat": pd.Series(["A", "B", "A", "B", "B", "B"], dtype="category"),
"str": pd.Series(["A", "B", "A", "B", "B", "B"]),
"byte_cat": pd.Series(
[b"A", b"B", b"A", b"B", b"B", b"B"], dtype="category"
),
"byte": pd.Series([b"A", b"B", b"A", b"B", b"B", b"B"]),
},
)

arrow_schema = pa.Schema.from_pandas(pandas_df, preserve_index=False)

with soma.DataFrame.create(uri, schema=arrow_schema) as sdf:
arrow_table = pa.Table.from_pandas(pandas_df, preserve_index=False)
sdf.write(arrow_table)

# Double-check against TileDB-Py reporting
with tiledb.open(uri) as A:
for i in range(A.schema.nattr):
attr = A.schema.attr(i)
try:
index_type = attr.dtype
value_type = A.enum(attr.name).dtype
except tiledb.cc.TileDBError:
pass # not an enum attr
if attr.name == "int_cat":
assert index_type.name == "int8"
assert value_type.name == "int64"
elif attr.name == "str_cat":
assert index_type.name == "int8"
assert value_type.name == "str32"
elif attr.name == "byte_cat":
assert index_type.name == "int8"
assert value_type.name == "bytes"

# Verify SOMA Arrow schema
with soma.open(uri) as sdf:
f = sdf.schema.field("int_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.int64()

f = sdf.schema.field("str_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.string()

f = sdf.schema.field("byte_cat")
assert f.type.index_type == pa.int8()
assert f.type.value_type == pa.binary()
21 changes: 17 additions & 4 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,13 @@
auto enmr = ArrayExperimental::get_enumeration(
*ctx, *tiledb_array, attr.name());
auto dict = new ArrowSchema;
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());
if (enmr.type() == TILEDB_STRING_ASCII or
enmr.type() == TILEDB_CHAR) {
dict->format = strdup("z");
} else {
dict->format = strdup(
ArrowAdapter::to_arrow_format(enmr.type(), false).data());

Check warning on line 153 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L149-L153

Added lines #L149 - L153 were not covered by tests
}
dict->name = strdup(enmr.name().c_str());
dict->metadata = nullptr;
dict->flags = 0;
Expand Down Expand Up @@ -240,6 +245,14 @@
}
}

bool ArrowAdapter::_isstr(const char* format) {

Check warning on line 248 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L248

Added line #L248 was not covered by tests
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;

Check warning on line 251 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L250-L251

Added lines #L250 - L251 were not covered by tests
}
return false;

Check warning on line 253 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L253

Added line #L253 was not covered by tests
}

std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>
ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
std::unique_ptr<ArrowSchema> schema = std::make_unique<ArrowSchema>();
Expand Down Expand Up @@ -329,7 +342,7 @@
dict_sch->release = &release_schema;
dict_sch->private_data = nullptr;

const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2;
const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2;

Check warning on line 345 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L345

Added line #L345 was not covered by tests
dict_arr->null_count = 0;
dict_arr->offset = 0;
dict_arr->n_buffers = n_buf;
Expand All @@ -353,7 +366,7 @@
// returns std::optional where std::nullopt indicates the
// column does not contain enumerated values.
if (enmr->type() == TILEDB_STRING_ASCII or
enmr->type() == TILEDB_STRING_UTF8) {
enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) {

Check warning on line 369 in libtiledbsoma/src/utils/arrow_adapter.cc

View check run for this annotation

Codecov / codecov/patch

libtiledbsoma/src/utils/arrow_adapter.cc#L369

Added line #L369 was not covered by tests
auto dict_vec = enmr->as_vector<std::string>();
column->convert_enumeration();
dict_arr->buffers[1] = column->enum_offsets().data();
Expand Down
4 changes: 3 additions & 1 deletion libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class ArrowAdapter {
static void release_schema(struct ArrowSchema* schema);
static void release_array(struct ArrowArray* array);

static bool _isstr(const char* format);

/**
* @brief Convert ColumnBuffer to an Arrow array.
*
Expand Down Expand Up @@ -72,4 +74,4 @@ class ArrowAdapter {
};
}; // namespace tiledbsoma

#endif
#endif
Loading