arrow-helpers for string-valued columns [skip ci]

single-cell-data · Sep 17, 2024 · a3c9a2a · a3c9a2a
1 parent bb4a436
commit a3c9a2a
Showing 1 changed file with 59 additions and 10 deletions.
diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h
@@ -293,11 +293,24 @@ class ArrowAdapter {
         arrow_array->length = n;
         arrow_array->null_count = 0;
         arrow_array->offset = 0;
+
+        // Two-buffer model for non-string data:
+        // * Slot 0 is the Arrow validity buffer which we leave null
+        // * Slot 1 is data, void* but will be dereferenced as T*
+        // * There is no offset information
         arrow_array->n_buffers = 2;
-        arrow_array->release = &ArrowAdapter::release_array;
         arrow_array->buffers = new const void*[2];
         arrow_array->n_children = 0;  // leaf/child node
 
+        // The nominal use of these methods as of this writing is for
+        // low-volume data such as schema information -- less than a
+        // kilobyte total. It's simplest and safest to do data copies,
+        // for-loop-wise. If we were to extend usage of these methods
+        // to bulk data in the megabyte/gigabyte range, we'd want to
+        // look at zero-copy for buffers, with variable approaches
+        // to memory management.
+        arrow_array->release = &ArrowAdapter::release_array;
+
         arrow_array->buffers[0] = nullptr;
         // Use malloc here, not new, to match ArrowAdapter::release_array
         T* dest = (T*)malloc(n * sizeof(T));
@@ -309,26 +322,62 @@ class ArrowAdapter {
         return arrow_array;
     }
 
+    // A nominal use of this method is for reporting core domain, current
+    // domain, and non-empty domain back to Python/R.  Meanwhile core string
+    // dims must always have domain of (nullptr, nullptr); and they have current
+    // domain which must _not_ be nullptr pairs.
+    //
+    // For the former do we give back a column of length 2 with nulls in it,
+    // using Arrow's validity buffers?  Or do we use ("", "") as TileDB-Py does?
+    //
+    // We choose the latter.
     static ArrowArray* make_arrow_array_child(
         const std::vector<std::string>& v) {
         // Use new here, not malloc, to match ArrowAdapter::release_array
         auto arrow_array = new ArrowArray;
 
         size_t n = v.size();
 
-        arrow_array->length = n;
+        arrow_array->length = n;  // Number of strings, not number of bytes
         arrow_array->null_count = 0;
         arrow_array->offset = 0;
-        arrow_array->n_buffers = 2;
-        arrow_array->release = &ArrowAdapter::release_array;
-        arrow_array->buffers = new const void*[2];
+
+        // Three-buffer model for string data:
+        // * Slot 0 is the Arrow uint8_t* validity buffer
+        // * Slot 1 is the Arrow offsets buffer: uint32_t* for Arrow string
+        //   or uint64_t* for Arrow large_string
+        // * Slot 2 is data, void* but will be dereferenced as char*
+        arrow_array->n_buffers = 3;
+        arrow_array->buffers = new const void*[3];
         arrow_array->n_children = 0;  // leaf/child node
 
-        // For core domain, these are always nullptr for strings and cannot be
-        // anything else. More general use of this class is WIP on
-        // https://github.com/single-cell-data/TileDB-SOMA/issues/2407
-        arrow_array->buffers[0] = nullptr;
-        arrow_array->buffers[1] = nullptr;
+        arrow_array->release = &ArrowAdapter::release_array;
+
+        size_t nbytes = 0;
+        for (auto e : v) {
+            nbytes += e.length();
+        }
+
+        // This function produces arrow large_string, which has 64-bit offsets.
+        uint64_t* offsets = (uint64_t*)malloc((n + 1) * sizeof(uint64_t));
+
+        // Data
+        char* data = (char*)malloc(nbytes * sizeof(char));
+        uint64_t dest_start = 0;
+
+        offsets[0] = dest_start;
+        for (size_t i = 0; i < n; i++) {
+            const std::string& elem = v[i];
+            size_t elem_len = elem.size();
+
+            memcpy(&data[dest_start], elem.c_str(), elem_len);
+            dest_start += elem_len;
+            offsets[i + 1] = dest_start;
+        }
+
+        arrow_array->buffers[0] = nullptr;  // validity
+        arrow_array->buffers[1] = offsets;
+        arrow_array->buffers[2] = data;
 
         return arrow_array;
     }