Skip to content

Commit

Permalink
arrow-helpers for string-valued columns [skip ci]
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 17, 2024
1 parent bb4a436 commit a3c9a2a
Showing 1 changed file with 59 additions and 10 deletions.
69 changes: 59 additions & 10 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,24 @@ class ArrowAdapter {
arrow_array->length = n;
arrow_array->null_count = 0;
arrow_array->offset = 0;

// Two-buffer model for non-string data:
// * Slot 0 is the Arrow validity buffer which we leave null
// * Slot 1 is data, void* but will be dereferenced as T*
// * There is no offset information
arrow_array->n_buffers = 2;
arrow_array->release = &ArrowAdapter::release_array;
arrow_array->buffers = new const void*[2];
arrow_array->n_children = 0; // leaf/child node

// The nominal use of these methods as of this writing is for
// low-volume data such as schema information -- less than a
// kilobyte total. It's simplest and safest to do data copies,
// for-loop-wise. If we were to extend usage of these methods
// to bulk data in the megabyte/gigabyte range, we'd want to
// look at zero-copy for buffers, with variable approaches
// to memory management.
arrow_array->release = &ArrowAdapter::release_array;

arrow_array->buffers[0] = nullptr;
// Use malloc here, not new, to match ArrowAdapter::release_array
T* dest = (T*)malloc(n * sizeof(T));
Expand All @@ -309,26 +322,62 @@ class ArrowAdapter {
return arrow_array;
}

// Builds a leaf ArrowArray that holds a copy of the strings in `v`, laid out
// as an Arrow large_string column (64-bit offsets).
//
// A nominal use of this method is for reporting core domain, current
// domain, and non-empty domain back to Python/R. Meanwhile core string
// dims must always have domain of (nullptr, nullptr); and they have current
// domain which must _not_ be nullptr pairs.
//
// For the former do we give back a column of length 2 with nulls in it,
// using Arrow's validity buffers? Or do we use ("", "") as TileDB-Py does?
//
// We choose the latter.
//
// @param v The strings to copy. The array does not reference `v` after
//     this function returns.
// @return A newly allocated ArrowArray of length v.size() with no nulls,
//     whose release callback is ArrowAdapter::release_array.
static ArrowArray* make_arrow_array_child(
    const std::vector<std::string>& v) {
    // Use new here, not malloc, to match ArrowAdapter::release_array.
    // Value-initialize so that the fields not assigned below are null/zero
    // rather than indeterminate.
    auto arrow_array = new ArrowArray{};

    const size_t n = v.size();

    arrow_array->length = n;  // Number of strings, not number of bytes
    arrow_array->null_count = 0;
    arrow_array->offset = 0;

    // Three-buffer model for string data:
    // * Slot 0 is the Arrow uint8_t* validity buffer, which we leave null
    //   since null_count is 0
    // * Slot 1 is the Arrow offsets buffer: 32-bit for Arrow string
    //   or 64-bit for Arrow large_string
    // * Slot 2 is the character data: the bytes of all the strings,
    //   concatenated with no separators or terminators
    arrow_array->n_buffers = 3;
    arrow_array->buffers = new const void*[3];
    arrow_array->n_children = 0;  // leaf/child node
    arrow_array->release = &ArrowAdapter::release_array;

    // Total size of the data buffer. Iterate by reference: we only need
    // each string's length, not a copy of it.
    size_t nbytes = 0;
    for (const auto& elem : v) {
        nbytes += elem.length();
    }

    // Use malloc here, not new, to match ArrowAdapter::release_array.
    // NOTE(review): allocation failure is not handled here -- confirm
    // whether callers need it to be.

    // This function produces arrow large_string, which has 64-bit offsets.
    // There are n + 1 offsets: string i occupies bytes
    // [offsets[i], offsets[i + 1]) of the data buffer.
    uint64_t* offsets = static_cast<uint64_t*>(
        malloc((n + 1) * sizeof(uint64_t)));

    // Data. When nbytes is 0 the pointer malloc returns may be null.
    char* data = static_cast<char*>(malloc(nbytes * sizeof(char)));

    uint64_t dest_start = 0;
    offsets[0] = dest_start;
    for (size_t i = 0; i < n; i++) {
        const std::string& elem = v[i];
        const size_t elem_len = elem.size();

        // Skip empty strings: nothing to copy, and this avoids handing
        // memcpy a possibly-null destination when the buffer is empty.
        if (elem_len > 0) {
            memcpy(&data[dest_start], elem.data(), elem_len);
        }
        dest_start += elem_len;
        offsets[i + 1] = dest_start;
    }

    arrow_array->buffers[0] = nullptr;  // validity
    arrow_array->buffers[1] = offsets;
    arrow_array->buffers[2] = data;

    return arrow_array;
}
Expand Down

0 comments on commit a3c9a2a

Please sign in to comment.