diff --git a/DESCRIPTION b/DESCRIPTION index 92cedb7668..b319eb9c3e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,30 +1,27 @@ Package: tiledb Type: Package -Version: 0.25.0.3 -Title: Modern Database Engine for Multi-Modal Data via Sparse and Dense Multidimensional Arrays +Version: 0.25.0.4 +Title: Modern Database Engine for Complex Data Based on Multi-Dimensional Arrays Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")), - person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) + person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) Description: The modern database 'TileDB' introduces a powerful on-disk - format for multi-modal data based on dimensional arrays. It supports - dense and sparse arrays, dataframes and key-values stores, cloud - storage ('S3', 'GCS', 'Azure'), chunked arrays, multiple compression, - encryption and checksum filters, uses a fully multi-threaded - implementation, supports parallel I/O, data versioning ('time - travel'), metadata and groups. It is implemented as an embeddable - cross-platform C++ library with APIs from several languages, and - integrations. + format for storing and accessing any complex data based on multi-dimensional + arrays. It supports dense and sparse arrays, dataframes and key-values stores, + cloud storage ('S3', 'GCS', 'Azure'), chunked arrays, multiple compression, + encryption and checksum filters, uses a fully multi-threaded implementation, + supports parallel I/O, data versioning ('time travel'), metadata and groups. + It is implemented as an embeddable cross-platform C++ library with APIs from + several languages, and integrations. This package provides the R support. Copyright: TileDB, Inc. 
License: MIT + file LICENSE URL: https://github.com/TileDB-Inc/TileDB-R, https://tiledb-inc.github.io/TileDB-R/ BugReports: https://github.com/TileDB-Inc/TileDB-R/issues -SystemRequirements: A C++17 compiler is required, and for macOS - compilation version 11.0 or later is required. Optionally cmake (only - when TileDB source build selected), curl (only when TileDB source - build selected)), and git (only when TileDB source build selected); - on x86_64 and M1 platforms pre-built TileDB Embedded libraries are - available at GitHub and are used if no TileDB installation is - detected, and no other option to build or download was specified by - the user. +SystemRequirements: A C++17 compiler is required; on macOS compilation version 11.0 + or later is required. Optionally cmake (only when TileDB source build selected), + curl (only when TileDB source build selected), and git (only when TileDB source + build selected); on x86_64 and M1 platforms pre-built TileDB Embedded libraries + are available at GitHub and are used if no TileDB installation is detected, and + no other option to build or download was specified by the user. 
Imports: methods, Rcpp (>= 1.0.8), nanotime, spdl, nanoarrow LinkingTo: Rcpp, RcppInt64, nanoarrow Suggests: tinytest, simplermarkdown, curl, bit64, Matrix, palmerpenguins, nycflights13, data.table, tibble, arrow diff --git a/NEWS.md b/NEWS.md index fc79b368ad..c72591bf6d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * The display of a `filter_list` not labels is correctly as a filter list (@cgiachalis in #681) +* The Arrow integration has been simplified using [nanoarrow](https://github.com/apache/arrow-nanoarrow) returning a single `nanoarrow` object; an unexported helper function `nanoarrow2list()` is provided to match the previous interface (#682) + ## Build and Test Systems * The `configure` and `Makevars.in` received a minor update correcting small issues (#680) diff --git a/R/ArrowIO.R b/R/ArrowIO.R index 562a072634..f041621d84 100644 --- a/R/ArrowIO.R +++ b/R/ArrowIO.R @@ -27,12 +27,12 @@ ##' @param query A TileDB Query object ##' @param name A character variable identifying the buffer ##' @param ctx tiledb_ctx object (optional) -##' @return A two-element vector where the two elements are -##' external pointers to the Arrow array and schema +##' @return A \code{nanoarrow} object (which is an external pointer to an Arrow Array +##' with the Arrow Schema stored as the external pointer tag) classed as an S3 object ##' @export tiledb_query_export_buffer <- function(query, name, ctx = tiledb_get_context()) { - stopifnot(`The 'query' argument must be a tiledb query` = is(query, "tiledb_query"), - `The 'name' argument must be character` = is.character(name)) + stopifnot("The 'query' argument must be a tiledb query" = is(query, "tiledb_query"), + "The 'name' argument must be character" = is.character(name)) res <- libtiledb_query_export_buffer(ctx@ptr, query@ptr, name) res } @@ -43,16 +43,17 @@ tiledb_query_export_buffer <- function(query, name, ctx = tiledb_get_context()) ##' from two Arrow exerternal pointers. 
##' @param query A TileDB Query object ##' @param name A character variable identifying the buffer -##' @param arrowpointers A two-element list vector with two external pointers -##' to an Arrow Array and Schema, respectively +##' @param nanoarrowptr A \code{nanoarrow} object (which is an external pointer to an Arrow Array +##' with the Arrow Schema stored as the external pointer tag) classed as an S3 object ##' @param ctx tiledb_ctx object (optional) ##' @return The update Query external pointer is returned ##' @export -tiledb_query_import_buffer <- function(query, name, arrowpointers, ctx = tiledb_get_context()) { - stopifnot(`The 'query' argument must be a tiledb query` = is(query, "tiledb_query"), - `The 'name' argument must be character` = is.character(name), - `The 'arrowpointers' argument must be list of length two` = is.list(arrowpointers) && length(arrowpointers)==2) - query@ptr <- libtiledb_query_import_buffer(ctx@ptr, query@ptr, name, arrowpointers) +tiledb_query_import_buffer <- function(query, name, nanoarrowptr, ctx = tiledb_get_context()) { + stopifnot("The 'query' argument must be a tiledb query" = is(query, "tiledb_query"), + "The 'name' argument must be character" = is.character(name), + "The 'nanoarrowptr' argument must be an 'nanoarrow' array object" = + inherits(nanoarrowptr, "nanoarrow_array")) + query@ptr <- libtiledb_query_import_buffer(ctx@ptr, query@ptr, name, nanoarrowptr) query } diff --git a/R/RcppExports.R b/R/RcppExports.R index 4803fe302c..e39d12f295 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,44 +1,18 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -.allocate_arrow_array_as_xptr <- function() { - .Call(`_tiledb_allocate_arrow_array_as_xptr`) -} - -.allocate_arrow_schema_as_xptr <- function() { - .Call(`_tiledb_allocate_arrow_schema_as_xptr`) -} - -.delete_arrow_array_from_xptr <- function(sxp) { - 
invisible(.Call(`_tiledb_delete_arrow_array_from_xptr`, sxp)) -} - -.delete_arrow_schema_from_xptr <- function(sxp) { - invisible(.Call(`_tiledb_delete_arrow_schema_from_xptr`, sxp)) -} - libtiledb_query_export_buffer <- function(ctx, query, name) { .Call(`_tiledb_libtiledb_query_export_buffer`, ctx, query, name) } -libtiledb_query_import_buffer <- function(ctx, query, name, arrowpointers) { - .Call(`_tiledb_libtiledb_query_import_buffer`, ctx, query, name, arrowpointers) +libtiledb_query_import_buffer <- function(ctx, query, name, naptr) { + .Call(`_tiledb_libtiledb_query_import_buffer`, ctx, query, name, naptr) } libtiledb_query_export_arrow_table <- function(ctx, query, names) { .Call(`_tiledb_libtiledb_query_export_arrow_table`, ctx, query, names) } -#' @noRd -check_arrow_schema_tag <- function(xp) { - .Call(`_tiledb_check_arrow_schema_tag`, xp) -} - -#' @noRd -check_arrow_array_tag <- function(xp) { - .Call(`_tiledb_check_arrow_array_tag`, xp) -} - libtiledb_to_arrow <- function(ab, qry, dicts) { .Call(`_tiledb_libtiledb_to_arrow`, ab, qry, dicts) } @@ -47,6 +21,10 @@ libtiledb_allocate_column_buffers <- function(ctx, qry, uri, names, memory_budge .Call(`_tiledb_libtiledb_allocate_column_buffers`, ctx, qry, uri, names, memory_budget) } +nanoarrow2list <- function(naarrptr) { + .Call(`_tiledb_nanoarrow2list`, naarrptr) +} + makeQueryWrapper <- function(qp) { .Call(`_tiledb_makeQueryWrapper`, qp) } diff --git a/inst/include/tiledb.h b/inst/include/tiledb.h index 200fe4564f..cfd21e0e0d 100644 --- a/inst/include/tiledb.h +++ b/inst/include/tiledb.h @@ -69,8 +69,8 @@ typedef struct query_buffer query_buf_t; // map from buffer names to shared_ptr to column_buffer typedef std::unordered_map> map_to_col_buf_t; -// some lipstick on the pig that is a SEXP -- allow the nanoarrow ArrowArray XPtr be typedef'ed -typedef SEXP nanoarrowXPtr; +// some lipstick on the pig that is a SEXP -- but we stick with the S3 SEXP nanoarrow creates +typedef SEXP nanoarrowS3; // C++ 
compiler complains about missing delete functionality when we use tiledb_vfs_fh_t directly struct vfs_fh { diff --git a/inst/tinytest/test_arrowio.R b/inst/tinytest/test_arrowio.R index 17b19cdaeb..446af7cb06 100644 --- a/inst/tinytest/test_arrowio.R +++ b/inst/tinytest/test_arrowio.R @@ -19,10 +19,9 @@ batch <- record_batch(df) expect_true(is(batch, "RecordBatch")) expect_true(is(as.data.frame(batch), "data.frame")) - ## allocate two structures (and release at end) -aa <- tiledb_arrow_array_ptr() -as <- tiledb_arrow_schema_ptr() +aa <- nanoarrow::nanoarrow_allocate_array() +as <- nanoarrow::nanoarrow_allocate_schema() arrow:::ExportRecordBatch(batch, aa, as) newrb <- arrow:::ImportRecordBatch(aa, as) @@ -30,9 +29,6 @@ expect_true(is(newrb, "RecordBatch")) expect_true(is(as.data.frame(newrb), "data.frame")) expect_equal(batch, newrb) -tiledb_arrow_schema_del(as) -tiledb_arrow_array_del(aa) - ## round-turn test 1: write tiledb first, create arrow object via zero-copy suppressMessages(library(bit64)) @@ -74,7 +70,6 @@ tiledb_query_finalize(qry) #arr <- tiledb_array(tmp, return_as="data.frame") #print(arr[]) - arr <- tiledb_array(tmp) qry <- tiledb_query(arr, "READ") dimptr <- tiledb_query_buffer_alloc_ptr(qry, "INT32", n) @@ -90,18 +85,20 @@ tiledb_query_submit(qry) tiledb_query_finalize(qry) res <- tiledb_query_export_buffer(qry, "rows") -v <- Array$create(arrow:::ImportArray(res[[1]], res[[2]])) -tiledb_arrow_array_del(res[[1]]) -tiledb_arrow_schema_del(res[[2]]) +#v <- Array$create(arrow:::ImportArray(res[[1]], res[[2]])) +v <- Array$create(res) +#tiledb_arrow_array_del(res[[1]]) +#tiledb_arrow_schema_del(res[[2]]) expect_equal(v$as_vector(), 4:7) for (col in c("int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64", "float64")) { qry <- tiledb_query_set_buffer_ptr(qry, col, attrlst[[col]]) res <- tiledb_query_export_buffer(qry, col) - v <- Array$create(arrow:::ImportArray(res[[1]], res[[2]])) - tiledb_arrow_array_del(res[[1]]) - 
tiledb_arrow_schema_del(res[[2]]) + v <- Array$create(res) + #v <- Array$create(arrow:::ImportArray(res[[1]], res[[2]])) + #tiledb_arrow_array_del(res[[1]]) + #tiledb_arrow_schema_del(res[[2]]) expect_equal(v$as_vector(), 4:7) } @@ -112,6 +109,8 @@ dir.create(tmp <- tempfile()) n <- 10L ## create a schema but don't fill it yet +#spdl::log("debug") + dim <- tiledb_dim("rows", domain=c(1L,n), type="INT32", tile=1L) dom <- tiledb_domain(dim) sch <- tiledb_array_schema(dom, @@ -127,6 +126,7 @@ sch <- tiledb_array_schema(dom, sparse = TRUE) tiledb_array_create(tmp, sch) +#exit_file("aa") ## create an arrow 'record batch' with a number of (correcsponding) columns rb <- record_batch("rows" = Array$create(1:n, int32()), "int8" = Array$create(1:n, int8()), @@ -144,17 +144,21 @@ rb <- record_batch("rows" = Array$create(1:n, int32()), arr <- tiledb_array(tmp) qry <- tiledb_query(arr, "WRITE") +#spdl::log("debug") nms <- rb$names() lst <- list() for (nam in nms) { vec <- rb[[nam]] # can access by name - aa <- tiledb_arrow_array_ptr() - as <- tiledb_arrow_schema_ptr() - arrow:::ExportArray(vec, aa, as) + na <- nanoarrow::as_nanoarrow_array(vec) + #print(na) + #print(class(na)) + #aa <- tiledb_arrow_array_ptr() + #as <- tiledb_arrow_schema_ptr() + #arrow:::ExportArray(vec, aa, as) - qry <- tiledb_query_import_buffer(qry, nam, list(aa, as)) + qry <- tiledb_query_import_buffer(qry, nam, na) - lst[[nam]] <- list(aa=aa, as=as) + #lst[[nam]] <- list(aa=aa, as=as) } tiledb_query_set_layout(qry, "UNORDERED") tiledb_query_submit(qry) @@ -162,12 +166,14 @@ tiledb_query_finalize(qry) arr <- tiledb_array(tmp, return_as="data.frame") df <- arr[] - -for (i in 1:10) { - l <- lst[[i]] - tiledb_arrow_array_del(l[[1]]) - tiledb_arrow_schema_del(l[[2]]) -} +#print(df) +#q() + +#for (i in 1:10) { +# l <- lst[[i]] +# tiledb_arrow_array_del(l[[1]]) +# tiledb_arrow_schema_del(l[[2]]) +#} ## n=15 expect_true(is(df, "data.frame")) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 
a75c318b32..96783ca823 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -12,75 +12,35 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif -// allocate_arrow_array_as_xptr -SEXP allocate_arrow_array_as_xptr(); -RcppExport SEXP _tiledb_allocate_arrow_array_as_xptr() { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = Rcpp::wrap(allocate_arrow_array_as_xptr()); - return rcpp_result_gen; -END_RCPP -} -// allocate_arrow_schema_as_xptr -SEXP allocate_arrow_schema_as_xptr(); -RcppExport SEXP _tiledb_allocate_arrow_schema_as_xptr() { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = Rcpp::wrap(allocate_arrow_schema_as_xptr()); - return rcpp_result_gen; -END_RCPP -} -// delete_arrow_array_from_xptr -void delete_arrow_array_from_xptr(SEXP sxp); -RcppExport SEXP _tiledb_delete_arrow_array_from_xptr(SEXP sxpSEXP) { -BEGIN_RCPP - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type sxp(sxpSEXP); - delete_arrow_array_from_xptr(sxp); - return R_NilValue; -END_RCPP -} -// delete_arrow_schema_from_xptr -void delete_arrow_schema_from_xptr(SEXP sxp); -RcppExport SEXP _tiledb_delete_arrow_schema_from_xptr(SEXP sxpSEXP) { -BEGIN_RCPP - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type sxp(sxpSEXP); - delete_arrow_schema_from_xptr(sxp); - return R_NilValue; -END_RCPP -} // libtiledb_query_export_buffer -Rcpp::List libtiledb_query_export_buffer(XPtr ctx, XPtr query, std::string name); +nanoarrowS3 libtiledb_query_export_buffer(XPtr ctx, XPtr query, std::string& name); RcppExport SEXP _tiledb_libtiledb_query_export_buffer(SEXP ctxSEXP, SEXP querySEXP, SEXP nameSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< XPtr >::type ctx(ctxSEXP); Rcpp::traits::input_parameter< XPtr >::type 
query(querySEXP); - Rcpp::traits::input_parameter< std::string >::type name(nameSEXP); + Rcpp::traits::input_parameter< std::string& >::type name(nameSEXP); rcpp_result_gen = Rcpp::wrap(libtiledb_query_export_buffer(ctx, query, name)); return rcpp_result_gen; END_RCPP } // libtiledb_query_import_buffer -XPtr libtiledb_query_import_buffer(XPtr ctx, XPtr query, std::string name, Rcpp::List arrowpointers); -RcppExport SEXP _tiledb_libtiledb_query_import_buffer(SEXP ctxSEXP, SEXP querySEXP, SEXP nameSEXP, SEXP arrowpointersSEXP) { +XPtr libtiledb_query_import_buffer(XPtr ctx, XPtr query, std::string& name, nanoarrowS3 naptr); +RcppExport SEXP _tiledb_libtiledb_query_import_buffer(SEXP ctxSEXP, SEXP querySEXP, SEXP nameSEXP, SEXP naptrSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< XPtr >::type ctx(ctxSEXP); Rcpp::traits::input_parameter< XPtr >::type query(querySEXP); - Rcpp::traits::input_parameter< std::string >::type name(nameSEXP); - Rcpp::traits::input_parameter< Rcpp::List >::type arrowpointers(arrowpointersSEXP); - rcpp_result_gen = Rcpp::wrap(libtiledb_query_import_buffer(ctx, query, name, arrowpointers)); + Rcpp::traits::input_parameter< std::string& >::type name(nameSEXP); + Rcpp::traits::input_parameter< nanoarrowS3 >::type naptr(naptrSEXP); + rcpp_result_gen = Rcpp::wrap(libtiledb_query_import_buffer(ctx, query, name, naptr)); return rcpp_result_gen; END_RCPP } // libtiledb_query_export_arrow_table -Rcpp::List libtiledb_query_export_arrow_table(XPtr ctx, XPtr query, std::vector names); +nanoarrowS3 libtiledb_query_export_arrow_table(XPtr ctx, XPtr query, std::vector names); RcppExport SEXP _tiledb_libtiledb_query_export_arrow_table(SEXP ctxSEXP, SEXP querySEXP, SEXP namesSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -92,30 +52,8 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// check_arrow_schema_tag -bool check_arrow_schema_tag(Rcpp::XPtr xp); -RcppExport SEXP 
_tiledb_check_arrow_schema_tag(SEXP xpSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::XPtr >::type xp(xpSEXP); - rcpp_result_gen = Rcpp::wrap(check_arrow_schema_tag(xp)); - return rcpp_result_gen; -END_RCPP -} -// check_arrow_array_tag -bool check_arrow_array_tag(Rcpp::XPtr xp); -RcppExport SEXP _tiledb_check_arrow_array_tag(SEXP xpSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::XPtr >::type xp(xpSEXP); - rcpp_result_gen = Rcpp::wrap(check_arrow_array_tag(xp)); - return rcpp_result_gen; -END_RCPP -} // libtiledb_to_arrow -nanoarrowXPtr libtiledb_to_arrow(Rcpp::XPtr ab, Rcpp::XPtr qry, Rcpp::List dicts); +nanoarrowS3 libtiledb_to_arrow(Rcpp::XPtr ab, Rcpp::XPtr qry, Rcpp::List dicts); RcppExport SEXP _tiledb_libtiledb_to_arrow(SEXP abSEXP, SEXP qrySEXP, SEXP dictsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -142,6 +80,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// nanoarrow2list +Rcpp::List nanoarrow2list(nanoarrowS3 naarrptr); +RcppExport SEXP _tiledb_nanoarrow2list(SEXP naarrptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< nanoarrowS3 >::type naarrptr(naarrptrSEXP); + rcpp_result_gen = Rcpp::wrap(nanoarrow2list(naarrptr)); + return rcpp_result_gen; +END_RCPP +} // makeQueryWrapper SEXP makeQueryWrapper(SEXP qp); RcppExport SEXP _tiledb_makeQueryWrapper(SEXP qpSEXP) { @@ -3596,17 +3545,12 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_tiledb_allocate_arrow_array_as_xptr", (DL_FUNC) &_tiledb_allocate_arrow_array_as_xptr, 0}, - {"_tiledb_allocate_arrow_schema_as_xptr", (DL_FUNC) &_tiledb_allocate_arrow_schema_as_xptr, 0}, - {"_tiledb_delete_arrow_array_from_xptr", (DL_FUNC) &_tiledb_delete_arrow_array_from_xptr, 1}, - {"_tiledb_delete_arrow_schema_from_xptr", (DL_FUNC) 
&_tiledb_delete_arrow_schema_from_xptr, 1}, {"_tiledb_libtiledb_query_export_buffer", (DL_FUNC) &_tiledb_libtiledb_query_export_buffer, 3}, {"_tiledb_libtiledb_query_import_buffer", (DL_FUNC) &_tiledb_libtiledb_query_import_buffer, 4}, {"_tiledb_libtiledb_query_export_arrow_table", (DL_FUNC) &_tiledb_libtiledb_query_export_arrow_table, 3}, - {"_tiledb_check_arrow_schema_tag", (DL_FUNC) &_tiledb_check_arrow_schema_tag, 1}, - {"_tiledb_check_arrow_array_tag", (DL_FUNC) &_tiledb_check_arrow_array_tag, 1}, {"_tiledb_libtiledb_to_arrow", (DL_FUNC) &_tiledb_libtiledb_to_arrow, 3}, {"_tiledb_libtiledb_allocate_column_buffers", (DL_FUNC) &_tiledb_libtiledb_allocate_column_buffers, 5}, + {"_tiledb_nanoarrow2list", (DL_FUNC) &_tiledb_nanoarrow2list, 1}, {"_tiledb_makeQueryWrapper", (DL_FUNC) &_tiledb_makeQueryWrapper, 1}, {"_tiledb_libtiledb_query_add_range_with_type", (DL_FUNC) &_tiledb_libtiledb_query_add_range_with_type, 6}, {"_tiledb_libtiledb_query_add_range", (DL_FUNC) &_tiledb_libtiledb_query_add_range, 5}, diff --git a/src/arrowio.cpp b/src/arrowio.cpp index 208ceaea2b..f96adaa577 100644 --- a/src/arrowio.cpp +++ b/src/arrowio.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020-2023 TileDB Inc. +// Copyright (c) 2020-2024 TileDB Inc. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -23,336 +23,98 @@ #include "libtiledb.h" #include "tiledb_version.h" #include "nanoarrow/r.h" -//#include // for C interface to Arrow -//#include #include "tiledb_arrowio.h" #include "column_buffer.h" #include "arrow_adapter.h" -ArrowSchema* schema_owning_ptr(void) { - struct ArrowSchema* schema = new struct ArrowSchema; - if (schema == nullptr) Rcpp::stop("Failed to allocate ArrowSchema"); - schema->release = NULL; - spdl::debug("[schema_owning_ptr] created"); - return schema; -} - -ArrowArray* array_owning_ptr(void) { - struct ArrowArray* array = new struct ArrowArray; - if (array == nullptr) Rcpp::stop("Failed to allocate ArrowArray"); - array->release = NULL; - spdl::debug("[array_owning_ptr] created"); - return array; -} - - -// borrowed from arrow R package (version 7.0.0) (licensed under Apache-2.0) and slightly extended/adapted -template -struct Pointer { - Pointer() : ptr_(new T()) {} - explicit Pointer(SEXP x) { - if (TYPEOF(x) == EXTPTRSXP) { - ptr_ = (T*)R_ExternalPtrAddr(x); - - } else if (TYPEOF(x) == STRSXP && Rf_length(x) == 1) { - // User passed a character representation of the pointer address - SEXP char0 = STRING_ELT(x, 0); - if (char0 == NA_STRING) { - Rcpp::stop("Can't convert NA_character_ to pointer"); - } - - const char* input_chars = CHAR(char0); - char* endptr; - uint64_t ptr_value = strtoull(input_chars, &endptr, 0); - if (endptr != (input_chars + strlen(input_chars))) { - Rcpp::stop("Can't parse '%s' as a 64-bit integer address", input_chars); - } - ptr_ = reinterpret_cast(static_cast(ptr_value)); - - } else if (Rf_inherits(x, "integer64") && Rf_length(x) == 1) { - // User passed an integer64(1) of the pointer address an integer64 is a REALSXP - // under the hood, with the bytes of each double reinterpreted as an int64. 
- uint64_t ptr_value; - memcpy(&ptr_value, REAL(x), sizeof(uint64_t)); - ptr_ = reinterpret_cast(static_cast(ptr_value)); - - } else if (TYPEOF(x) == RAWSXP && Rf_length(x) == sizeof(T*)) { - // User passed a raw() with the literal bytes of the pointer. - memcpy(&ptr_, RAW(x), sizeof(T*)); - - } else if (TYPEOF(x) == REALSXP && Rf_length(x) == 1) { - // User passed a double(1) of the static-casted pointer address. - ptr_ = reinterpret_cast(static_cast(REAL(x)[0])); - - } else { - Rcpp::stop("Can't convert input object to pointer: %d", TYPEOF(x)); - } - } - - inline operator SEXP() const { return R_MakeExternalPtr(ptr_, R_NilValue, R_NilValue); } - - inline operator T*() const { return ptr_; } - - inline void finalize() { delete ptr_; } - - inline T* get() const { return ptr_; } - - T* ptr_; -}; - - -// these functions are local to this compilation unit as is the defintion of Pointer -Pointer allocate_arrow_array() { return {}; } -Pointer allocate_arrow_schema() { return {}; } -void delete_arrow_array(Pointer ptr) { ptr.finalize(); } -void delete_arrow_schema(Pointer ptr) { ptr.finalize(); } - -// [[Rcpp::export(.allocate_arrow_array_as_xptr)]] -SEXP allocate_arrow_array_as_xptr() { - return allocate_arrow_array(); -} - -// [[Rcpp::export(.allocate_arrow_schema_as_xptr)]] -SEXP allocate_arrow_schema_as_xptr() { - return allocate_arrow_schema(); -} - -// [[Rcpp::export(.delete_arrow_array_from_xptr)]] -void delete_arrow_array_from_xptr(SEXP sxp) { - Pointer ptr(sxp); - delete_arrow_array(ptr); -} - -// [[Rcpp::export(.delete_arrow_schema_from_xptr)]] -void delete_arrow_schema_from_xptr(SEXP sxp) { - Pointer ptr(sxp); - delete_arrow_schema(ptr); -} +void _array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr); // forward declaration, see below +SEXP _array_xptr_get_schema(SEXP array_xptr); +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg); +inline void* _getPtr(SEXP p) { return R_ExternalPtrAddr(p); } // [[Rcpp::export]] -Rcpp::List 
libtiledb_query_export_buffer(XPtr ctx, - XPtr query, - std::string name) { +nanoarrowS3 libtiledb_query_export_buffer(XPtr ctx, + XPtr query, + std::string& name) { + check_xptr_tag(ctx); + check_xptr_tag(query); + tiledb::arrow::ArrowAdapter adapter(ctx, query); - //auto arrptr = allocate_arrow_array(); // external pointer object - //auto schptr = allocate_arrow_schema(); - auto schptr = schema_owning_ptr(); - auto arrptr = array_owning_ptr(); - adapter.export_buffer(name.c_str(), - static_cast(arrptr), - static_cast(schptr)); + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + + adapter.export_buffer(name.c_str(), arr, sch); spdl::debug(tfm::format("[libtiledb_query_export_buffer] name '%s'", name.c_str())); - SEXP xpschema = R_MakeExternalPtr((void*) schptr, R_NilValue, R_NilValue); - SEXP xparray = R_MakeExternalPtr((void*) arrptr, R_NilValue, R_NilValue); - return Rcpp::List::create(xparray, xpschema); + + // Nanoarrow special: stick schema into xptr tag to return single SEXP + _array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } // [[Rcpp::export]] XPtr libtiledb_query_import_buffer(XPtr ctx, XPtr query, - std::string name, - Rcpp::List arrowpointers) { + std::string& name, + nanoarrowS3 naptr) { + check_xptr_tag(ctx); + check_xptr_tag(query); tiledb::arrow::ArrowAdapter adapter(ctx, query); + // get schema xptr out of array xptr tag + auto schptr = _array_xptr_get_schema(naptr); + adapter.import_buffer(name.c_str(), - R_ExternalPtrAddr(arrowpointers[0]), - R_ExternalPtrAddr(arrowpointers[1])); + (struct ArrowArray*) _getPtr(naptr), + (struct ArrowSchema*) _getPtr(schptr)); + return(query); } -Rcpp::XPtr schema_owning_xptr(void); -Rcpp::XPtr array_owning_xptr(void); -Rcpp::XPtr schema_setup_struct(Rcpp::XPtr schxp, int64_t n_children); -Rcpp::XPtr 
array_setup_struct(Rcpp::XPtr arrxp, int64_t n_children); - // [[Rcpp::export]] -Rcpp::List libtiledb_query_export_arrow_table(XPtr ctx, - XPtr query, - std::vector names) { +nanoarrowS3 libtiledb_query_export_arrow_table(XPtr ctx, + XPtr query, + std::vector names) { + check_xptr_tag(ctx); + check_xptr_tag(query); size_t ncol = names.size(); tiledb::arrow::ArrowAdapter adapter(ctx, query); - Rcpp::XPtr schemap = schema_owning_xptr(); - Rcpp::XPtr arrayp = array_owning_xptr(); - schemap = schema_setup_struct(schemap, ncol); - arrayp = array_setup_struct(arrayp, ncol); + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); - arrayp->length = 0; + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + arr->length = 0; for (size_t i=0; ichildren[i] = chldschemap; - arrayp->children[i] = chldarrayp; + adapter.export_buffer(names[i].c_str(), arr->children[i], sch->children[i]); - if (chldarrayp->length > arrayp->length) { - spdl::info(tfm::format("[libtiledb_query_export_arrow_table] Setting array length to %d", chldarrayp->length)); - arrayp->length = chldarrayp->length; + if (arr->children[i]->length > arr->length) { + spdl::info(tfm::format("[libtiledb_query_export_arrow_table] Setting array length to %d", arr->children[i]->length)); + arr->length = arr->children[i]->length; } spdl::info(tfm::format("[libtiledb_query_export_arrow_table] Seeing %s (%s) at length %d null_count %d buffers %d", - names[i], chldschemap->format, chldarrayp->length, chldarrayp->null_count, 
chldarrayp->n_buffers)); - + names[i], sch->children[i]->format, arr->children[i]->length, arr->children[i]->null_count, arr->children[i]->n_buffers)); } - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayp, - Rcpp::Named("schema") = schemap); - return as; -} - -//' @noRd -// [[Rcpp::export]] -bool check_arrow_schema_tag(Rcpp::XPtr xp) { - check_xptr_tag(xp); // throws if mismatched - return true; -} - -//' @noRd -// [[Rcpp::export]] -bool check_arrow_array_tag(Rcpp::XPtr xp) { - check_xptr_tag(xp); // throws if mismatched - return true; -} - - -// (Adapted) helper functions from nanoarrow -// -// Create an external pointer with the proper class and that will release any -// non-null, non-released pointer when garbage collected. We use a tagged XPtr, -// but do not set an XPtr finalizer -Rcpp::XPtr schema_owning_xptr(void) { - struct ArrowSchema* schema = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); - spdl::trace(tfm::format("[schema_owning_xptr] Allocating %d bytes", sizeof(struct ArrowSchema))); - if (schema == NULL) Rcpp::stop("Failed to allocate ArrowSchema"); - schema->release = NULL; - Rcpp::XPtr schema_xptr = make_xptr(schema, false); - return schema_xptr; -} -// Create an external pointer with the proper class and that will release any -// non-null, non-released pointer when garbage collected. 
We use a tagged XPtr, -// but do not set an XPtr finalizer -Rcpp::XPtr array_owning_xptr(void) { - struct ArrowArray* array = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); - spdl::trace(tfm::format("[array_owning_xptr] Allocating %d bytes", sizeof(struct ArrowArray))); - if (array == NULL) Rcpp::stop("Failed to allocate ArrowArray"); - array->release = NULL; - Rcpp::XPtr array_xptr = make_xptr(array, false); - return array_xptr; -} - -// Helper function to register a finalizer -- eg for debugging purposes -inline void registerXptrFinalizer(SEXP s, R_CFinalizer_t f, bool onexit = true) { - R_RegisterCFinalizerEx(s, f, onexit ? TRUE : FALSE); -} -extern "C" { - void ArrowArrayReleaseInternal(struct ArrowArray *array); // made non-static in nanoarrow.c - ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, // ditto - enum ArrowType storage_type); - ArrowErrorCode localArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); + // Nanoarrow special: stick schema into xptr tag to return single SEXP + _array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } -Rcpp::XPtr schema_setup_struct(Rcpp::XPtr schxp, int64_t n_children) { - ArrowSchema* schema = schxp.get(); - auto type = NANOARROW_TYPE_STRUCT; - - ArrowSchemaInit(schema); // modified from ArrowSchemaInitFromType() - int result = localArrowSchemaSetType(schema, type); // modified to call func with XPtr - if (result != NANOARROW_OK) { - schema->release(schema); - Rcpp::stop("Error setting struct schema"); - } - - // now adapted from ArrowSchemaAllocateChildren - if (schema->children != NULL) Rcpp::stop("Error allocation as children not null"); - - if (n_children > 0) { - auto ptr = (struct ArrowSchema**) ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); - Rcpp::XPtr schema_ptrxp = make_xptr(ptr, false); - schema->children = schema_ptrxp.get(); - if (schema->children == NULL) Rcpp::stop("Failed to allocate ArrowSchema*"); - - schema->n_children 
= n_children; - memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); - - for (int64_t i = 0; i < n_children; i++) { - schema->children[i] = schema_owning_xptr(); - if (schema->children[i] == NULL) Rcpp::stop("Error allocation schema child %ld", i); - schema->children[i]->release = NULL; - } - } - return schxp; -} - -Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t n_children) { - ArrowArray* array = arrxp.get(); - auto storage_type = NANOARROW_TYPE_STRUCT; - - array->length = 0; - array->null_count = 0; - array->offset = 0; - array->n_buffers = 0; - array->n_children = 0; - array->buffers = NULL; - array->children = NULL; - array->dictionary = NULL; - array->release = &ArrowArrayReleaseInternal; - array->private_data = NULL; - - auto private_data = (struct ArrowArrayPrivateData*) ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); - if (private_data == NULL) { - array->release = NULL; - Rcpp::stop("Error allocating array private data"); - } - ArrowBitmapInit(&private_data->bitmap); - ArrowBufferInit(&private_data->buffers[0]); - ArrowBufferInit(&private_data->buffers[1]); - private_data->buffer_data[0] = NULL; - private_data->buffer_data[1] = NULL; - private_data->buffer_data[2] = NULL; - array->private_data = private_data; - array->buffers = (const void**)(&private_data->buffer_data); - int result = ArrowArraySetStorageType(array, storage_type); - if (result != NANOARROW_OK) { - array->release(array); - Rcpp::stop("Error setting array storage type"); - } - - ArrowLayoutInit(&private_data->layout, storage_type); - // We can only know this not to be true when initializing based on a schema so assume this to be true. 
- private_data->union_type_id_is_child_index = 1; - - - // remainder from ArrowArrayAllocateChildren() - if (array->children != NULL) Rcpp::stop("Error allocating array children as pointer not null"); - - if (n_children == 0) { - return arrxp; - } - - auto ptr = (struct ArrowArray**) ArrowMalloc(n_children * sizeof(struct ArrowArray*)); - Rcpp::XPtr array_ptrxp = make_xptr(ptr, false); - array->children = array_ptrxp.get(); - if (array->children == NULL) Rcpp::stop("Failed to allocated ArrayArray*"); - - memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); - - for (int64_t i = 0; i < n_children; i++) { - array->children[i] = array_owning_xptr(); - if (array->children[i] == NULL) Rcpp::stop("Error allocation array child %ld", i); - array->children[i]->release = NULL; - } - array->n_children = n_children; - return arrxp; -} inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { if (ec != NANOARROW_OK) Rcpp::stop(msg); @@ -360,15 +122,19 @@ inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { // Attaches a schema to an array external pointer. The nanoarrow R package // attempts to do this whenever possible to avoid misinterpreting arrays. 
-void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) { +void _array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) { R_SetExternalPtrTag(array_xptr, schema_xptr); } +// Reverse: peel the schema out of the array via the XPtr tag +SEXP _array_xptr_get_schema(SEXP array_xptr) { + return R_ExternalPtrTag(array_xptr); +} // was: Rcpp::List // [[Rcpp::export]] -nanoarrowXPtr libtiledb_to_arrow(Rcpp::XPtr ab, - Rcpp::XPtr qry, - Rcpp::List dicts) { +nanoarrowS3 libtiledb_to_arrow(Rcpp::XPtr ab, + Rcpp::XPtr qry, + Rcpp::List dicts) { check_xptr_tag(ab); check_xptr_tag(qry); std::vector names = ab->names(); @@ -451,7 +217,7 @@ nanoarrowXPtr libtiledb_to_arrow(Rcpp::XPtr ab, spdl::info("[libtiledb_to_arrow] ArrowArrayFinishBuildingDefault"); // Nanoarrow special: stick schema into xptr tag to return single SEXP - array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + _array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array spdl::trace("[libtiledb_to_arrow] returning from libtiledb_to_arrow"); return arrayxp; @@ -482,45 +248,8 @@ Rcpp::XPtr libtiledb_allocate_column_buffers(Rcpp::XPtr(abp); } -// added two local copies to inject xptr for format -extern "C" { - const char* ArrowSchemaFormatTemplate(enum ArrowType type); // remove static in nanoarrow - int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, enum ArrowType type); // ditto -} -ArrowErrorCode localArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { - if (schema->format != NULL) { - ArrowFree((void*)schema->format); - } - - if (format != NULL) { - size_t format_size = strlen(format) + 1; - schema->format = (const char*)ArrowMalloc(format_size); - if (schema->format == NULL) { - return ENOMEM; - } - Rcpp::XPtr schema_fmt_xp = make_xptr(schema->format, false); - - memcpy((void*)schema->format, format, format_size); - } else { - schema->format = NULL; - } - - return NANOARROW_OK; -} -ArrowErrorCode localArrowSchemaSetType(struct ArrowSchema* schema, 
enum ArrowType type) { - // We don't allocate the dictionary because it has to be nullptr - // for non-dictionary-encoded arrays. - - // Set the format to a valid format string for type - const char* template_format = ArrowSchemaFormatTemplate(type); - - // If type isn't recognized and not explicitly unset - if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(localArrowSchemaSetFormat(schema, template_format)); - - // For types with an umabiguous child structure, allocate children - return ArrowSchemaInitChildrenIfNeeded(schema, type); +// [[Rcpp::export]] +Rcpp::List nanoarrow2list(nanoarrowS3 naarrptr) { + auto schptr = _array_xptr_get_schema(naarrptr); + return Rcpp::List::create(naarrptr, schptr); }