Skip to content

Commit

Permalink
[c++/python] Check arrow_is_string_type in _update_dataframe (#3111
Browse files Browse the repository at this point in the history
…) (#3225)

Co-authored-by: nguyenv <[email protected]>
  • Loading branch information
github-actions[bot] and nguyenv authored Oct 23, 2024
1 parent e3ec4de commit bc4067e
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 12 deletions.
5 changes: 5 additions & 0 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ PYBIND11_MODULE(pytiledbsoma, m) {
*ctx->tiledb_ctx(),
attr_name,
ArrowAdapter::to_tiledb_format(attr_type));

if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

FilterList filter_list(*ctx->tiledb_ctx());
filter_list.add_filter(
Filter(*ctx->tiledb_ctx(), TILEDB_FILTER_ZSTD));
Expand Down
34 changes: 34 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1722,3 +1722,37 @@ def test_only_evolve_schema_when_enmr_is_extended(tmp_path):
# subtract 1 for the __schema/__enumerations directory;
# only looking at fragment files
assert len(vfs.ls(os.path.join(uri, "__schema"))) - 1 == 3


def test_fix_update_dataframe_with_var_strings(tmp_path):
uri = tmp_path.as_posix()

tbl = pa.table(
{
"soma_joinid": pa.array([0, 1, 2, 3], pa.int64()),
"mystring": pa.array(["a", "bb", "ccc", "dddd"], pa.large_utf8()),
"myint": pa.array([33, 44, 55, 66], pa.int32()),
"myfloat": pa.array([4.5, 5.5, 6.5, 7.5], pa.float32()),
}
)

with soma.DataFrame.create(uri, schema=tbl.schema) as sdf:
sdf.write(tbl)

with soma.DataFrame.open(uri, "r") as sdf:
updated_sdf = sdf.read().concat().to_pandas()
updated_sdf["newattr"] = np.array(["a", "b", "c", "d"])

with soma.DataFrame.open(uri, "w") as sdf:
soma.io.ingest._update_dataframe(
sdf,
updated_sdf,
"testing",
platform_config=None,
context=None,
default_index_name="mystring",
)

with soma.DataFrame.open(uri, "r") as sdf:
results = sdf.read().concat().to_pandas()
assert results.equals(updated_sdf)
20 changes: 10 additions & 10 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
auto schild = index_column_schema->children[i];
auto col_name = schild->name;
if (strcmp(child->name, col_name) == 0) {
if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
type = TILEDB_STRING_ASCII;
}

Expand Down Expand Up @@ -802,7 +802,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
attr.set_nullable(true);
}

if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand All @@ -813,7 +813,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
*ctx,
child->name,
enmr_type,
ArrowAdapter::_isvar(enmr_format) ? TILEDB_VAR_NUM : 1,
ArrowAdapter::arrow_is_string_type(enmr_format) ?
TILEDB_VAR_NUM :
1,
child->flags & ARROW_FLAG_DICTIONARY_ORDERED);
ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr);
AttributeExperimental::set_enumeration_name(
Expand Down Expand Up @@ -860,7 +862,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
continue;
}

if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
// In the core API:
//
// * domain for strings must be set as (nullptr, nullptr)
Expand Down Expand Up @@ -1149,12 +1151,10 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
return std::pair(std::move(array), std::move(schema));
}

bool ArrowAdapter::_isvar(const char* format) {
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;
}
return false;
bool ArrowAdapter::arrow_is_string_type(const char* format) {
return (
(strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0));
}

std::string_view ArrowAdapter::to_arrow_format(
Expand Down
11 changes: 9 additions & 2 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,15 @@ class ArrowAdapter {
static std::string_view to_arrow_format(
tiledb_datatype_t tiledb_dtype, bool use_large = true);

/**
* @brief Keystroke saver to determine whether Arrow type is of string,
* large string, binary, or large binary type.
*
* @param const char* Arrow data format
* @return bool Whether the Arrow type represents a string type
*/
static bool arrow_is_string_type(const char* format);

/**
* @brief Get TileDB datatype from Arrow format string.
*
Expand Down Expand Up @@ -363,8 +372,6 @@ class ArrowAdapter {
return Dimension::create<T>(*ctx, name, {b[0], b[1]}, b[2]);
}

static bool _isvar(const char* format);

static FilterList _create_filter_list(
std::string filters, std::shared_ptr<Context> ctx);

Expand Down

0 comments on commit bc4067e

Please sign in to comment.