Skip to content

Commit

Permalink
[r] Write string attrs as UTF-8 (Python compatibility) (#1843)
Browse files Browse the repository at this point in the history
* [r] Write string attrs as UTF-8 (Python compatibility)

* remove old to-do comment

* merge
  • Loading branch information
johnkerl authored Nov 2, 2023
1 parent 9c18c49 commit 2926e4d
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 19 deletions.
6 changes: 3 additions & 3 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ SOMADataFrame <- R6::R6Class(
name = field_name,
# Numeric index types must be positive values for indexing
domain = arrow_type_unsigned_range(field$type), tile = tile_extent,
type = tiledb_type_from_arrow_type(field$type),
type = tiledb_type_from_arrow_type(field$type, is_dim=TRUE),
filter_list = tiledb::tiledb_filter_list(tdb_opts)
)
}
Expand All @@ -87,7 +87,7 @@ SOMADataFrame <- R6::R6Class(

for (field_name in attr_column_names) {
field <- schema$GetFieldByName(field_name)
field_type <- tiledb_type_from_arrow_type(field$type)
field_type <- tiledb_type_from_arrow_type(field$type, is_dim=FALSE)

## # Check if the field is ordered and mark it as such
## if (!is.null(x = levels[[field_name]]) && isTRUE(field$type$ordered)) {
Expand All @@ -98,7 +98,7 @@ SOMADataFrame <- R6::R6Class(
name = field_name,
type = field_type,
nullable = field$nullable,
ncells = if (field_type == "ASCII") NA_integer_ else 1L,
ncells = if (field_type == "ASCII" || field_type == "UTF8" ) NA_integer_ else 1L,
filter_list = tiledb::tiledb_filter_list(tiledb_create_options$attr_filters(field_name))
)
}
Expand Down
2 changes: 1 addition & 1 deletion apis/r/R/SOMANDArrayBase.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ SOMANDArrayBase <- R6::R6Class(
# create array attribute
tdb_attr <- tiledb::tiledb_attr(
name = "soma_data",
type = tiledb_type_from_arrow_type(type),
type = tiledb_type_from_arrow_type(type, is_dim=FALSE),
filter_list = tdb_attr_filters
)

Expand Down
23 changes: 13 additions & 10 deletions apis/r/R/utils-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ is_arrow_dictionary <- function(x) {

#' Convert Arrow types to supported TileDB type
#' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121
#' Note: TileDB attrs may be UTF-8; TileDB dims may not.
#'
#' List of all arrow types: https://github.com/apache/arrow/blob/90aac16761b7dbf5fe931bc8837cad5116939270/r/R/type.R#L700
#' @noRd

tiledb_type_from_arrow_type <- function(x) {
tiledb_type_from_arrow_type <- function(x, is_dim) {
stopifnot(is_arrow_data_type(x))
switch(x$name,
retval <- switch(x$name,
int8 = "INT8",
int16 = "INT16",
int32 = "INT32",
Expand All @@ -62,11 +63,9 @@ tiledb_type_from_arrow_type <- function(x) {
# binary = "binary",
# large_binary = "large_binary",
# fixed_size_binary = "fixed_size_binary",
# tiledb::r_to_tiledb_type() returns UTF8 for characters but they are
# not yet queryable so we use ASCII for now
utf8 = "ASCII",
string = "ASCII",
large_utf8 = "ASCII",
utf8 = "UTF8",
string = "UTF8",
large_utf8 = "UTF8",
# date32 = "date32",
# date64 = "date64",
# time32 = "time32",
Expand All @@ -84,9 +83,13 @@ tiledb_type_from_arrow_type <- function(x) {
# fixed_size_list = "fixed_size_list",
# map_of = "map",
# duration = "duration",
dictionary = tiledb_type_from_arrow_type(x$index_type),
dictionary = tiledb_type_from_arrow_type(x$index_type, is_dim=is_dim),
stop("Unsupported Arrow data type: ", x$name, call. = FALSE)
)
if (is_dim && retval == "UTF8") {
retval <- "ASCII"
}
retval
}

arrow_type_from_tiledb_type <- function(x) {
Expand Down Expand Up @@ -209,12 +212,12 @@ tiledb_attr_from_arrow_field <- function(field, tiledb_create_options) {
COMPRESSION_LEVEL = tiledb_create_options$dataframe_dim_zstd_level()
)

field_type <- tiledb_type_from_arrow_type(field$type)
field_type <- tiledb_type_from_arrow_type(field$type, is_dim=FALSE)
tiledb::tiledb_attr(
name = field$name,
type = field_type,
nullable = field$nullable,
ncells = if (field_type == "ASCII") NA_integer_ else 1L,
ncells = if (field_type == "ASCII" || field_type == "UTF8") NA_integer_ else 1L,
filter_list = tiledb::tiledb_filter_list(
tiledb_create_options$attr_filters(
attr_name = field$name,
Expand Down
10 changes: 5 additions & 5 deletions apis/r/tests/testthat/test-Arrow-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ test_that("TileDB classes can be converted to Arrow equivalents", {
expect_true(is_arrow_field(dim0_field))
expect_equal(dim0_field$name, tiledb::name(dim0))
expect_equal(
tiledb_type_from_arrow_type(dim0_field$type),
tiledb_type_from_arrow_type(dim0_field$type, is_dim=TRUE),
tiledb::datatype(dim0)
)

Expand All @@ -32,14 +32,14 @@ test_that("TileDB classes can be converted to Arrow equivalents", {
expect_true(is_arrow_field(dim1_field))
expect_equal(dim1_field$name, tiledb::name(dim1))
expect_equal(
tiledb_type_from_arrow_type(dim1_field$type),
tiledb_type_from_arrow_type(dim1_field$type, is_dim=TRUE),
tiledb::datatype(dim1)
)

# Attribute to Arrow field
attr0 <- tiledb::tiledb_attr(
name = "attr0",
type = "ASCII"
type = "UTF8"
)

attr1 <- tiledb::tiledb_attr(
Expand All @@ -55,7 +55,7 @@ test_that("TileDB classes can be converted to Arrow equivalents", {
expect_true(is_arrow_field(attr0_field))
expect_equal(attr0_field$name, tiledb::name(attr0))
expect_equal(
tiledb_type_from_arrow_type(attr0_field$type),
tiledb_type_from_arrow_type(attr0_field$type, is_dim=FALSE),
tiledb::datatype(attr0)
)

Expand All @@ -64,7 +64,7 @@ test_that("TileDB classes can be converted to Arrow equivalents", {
expect_true(is_arrow_field(attr1_field))
expect_equal(attr1_field$name, tiledb::name(attr1))
expect_equal(
tiledb_type_from_arrow_type(attr1_field$type),
tiledb_type_from_arrow_type(attr1_field$type, is_dim=FALSE),
tiledb::datatype(attr1)
)

Expand Down

0 comments on commit 2926e4d

Please sign in to comment.