From a3c9a2a816ae7989fd336c24b4af75d15e440310 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 17 Sep 2024 17:23:47 -0400 Subject: [PATCH] arrow-helpers for string-valued columns [skip ci] --- libtiledbsoma/src/utils/arrow_adapter.h | 69 +++++++++++++++++++++---- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 51d36691d9..ec7087392b 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -293,11 +293,24 @@ class ArrowAdapter { arrow_array->length = n; arrow_array->null_count = 0; arrow_array->offset = 0; + + // Two-buffer model for non-string data: + // * Slot 0 is the Arrow validity buffer which we leave null + // * Slot 1 is data, void* but will be dereferenced as T* + // * There is no offset information arrow_array->n_buffers = 2; - arrow_array->release = &ArrowAdapter::release_array; arrow_array->buffers = new const void*[2]; arrow_array->n_children = 0; // leaf/child node + // The nominal use of these methods as of this writing is for + // low-volume data such as schema information -- less than a + // kilobyte total. It's simplest and safest to do data copies, + // for-loop-wise. If we were to extend usage of these methods + // to bulk data in the megabyte/gigabyte range, we'd want to + // look at zero-copy for buffers, with variable approaches + // to memory management. + arrow_array->release = &ArrowAdapter::release_array; + arrow_array->buffers[0] = nullptr; // Use malloc here, not new, to match ArrowAdapter::release_array T* dest = (T*)malloc(n * sizeof(T)); @@ -309,6 +322,15 @@ class ArrowAdapter { return arrow_array; } + // A nominal use of this method is for reporting core domain, current + // domain, and non-empty domain back to Python/R. Meanwhile core string + // dims must always have domain of (nullptr, nullptr); and they have current + // domain which must _not_ be nullptr pairs.
+ // + // For the former do we give back a column of length 2 with nulls in it, + // using Arrow's validity buffers? Or do we use ("", "") as TileDB-Py does? + // + // We choose the latter. static ArrowArray* make_arrow_array_child( const std::vector& v) { // Use new here, not malloc, to match ArrowAdapter::release_array @@ -316,19 +338,46 @@ class ArrowAdapter { size_t n = v.size(); - arrow_array->length = n; + arrow_array->length = n; // Number of strings, not number of bytes arrow_array->null_count = 0; arrow_array->offset = 0; - arrow_array->n_buffers = 2; - arrow_array->release = &ArrowAdapter::release_array; - arrow_array->buffers = new const void*[2]; + + // Three-buffer model for string data: + // * Slot 0 is the Arrow uint8_t* validity buffer + // * Slot 1 is the Arrow offsets buffer: uint32_t* for Arrow string + // or uint64_t* for Arrow large_string + // * Slot 2 is data, void* but will be dereferenced as T* + arrow_array->n_buffers = 3; + arrow_array->buffers = new const void*[3]; arrow_array->n_children = 0; // leaf/child node - // For core domain, these are always nullptr for strings and cannot be - // anything else. More general use of this class is WIP on - // https://github.com/single-cell-data/TileDB-SOMA/issues/2407 - arrow_array->buffers[0] = nullptr; - arrow_array->buffers[1] = nullptr; + arrow_array->release = &ArrowAdapter::release_array; + + size_t nbytes = 0; + for (auto e : v) { + nbytes += e.length(); + } + + // This function produces arrow large_string, which has 64-bit offsets.
+ uint64_t* offsets = (uint64_t*)malloc((n + 1) * sizeof(uint64_t)); + + // Data + char* data = (char*)malloc(nbytes * sizeof(char)); + uint64_t dest_start = 0; + + offsets[0] = dest_start; + for (size_t i = 0; i < n; i++) { + const std::string& elem = v[i]; + size_t elem_len = elem.size(); + + memcpy(&data[dest_start], elem.c_str(), elem_len); + dest_start += elem_len; + offsets[i + 1] = dest_start; + } + + arrow_array->buffers[0] = nullptr; // validity + arrow_array->buffers[1] = offsets; + arrow_array->buffers[2] = data; return arrow_array; }