diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc
index 17bda5582f..6d459a5507 100644
--- a/libtiledbsoma/src/utils/arrow_adapter.cc
+++ b/libtiledbsoma/src/utils/arrow_adapter.cc
@@ -30,8 +30,8 @@
  * This file defines the ArrowAdapter class.
  */
 
-#include "arrow_adapter.h"
 #include "../soma/column_buffer.h"
+#include "arrow_adapter.h"
 #include "logger.h"
 
 namespace tiledbsoma {
@@ -1309,4 +1309,253 @@ std::unique_ptr ArrowAdapter::make_arrow_array_parent(
     return arrow_array;
 }
 
+// For reference, the Arrow C Data Interface structs read by the accessors
+// below:
+//
+// struct ArrowSchema {
+//   const char* format;
+//   const char* name;
+//   const char* metadata;
+//   int64_t flags;
+//   int64_t n_children;
+//   struct ArrowSchema** children;
+//   struct ArrowSchema* dictionary;
+//
+//   void (*release)(struct ArrowSchema*);
+//   void* private_data;
+// };
+
+// struct ArrowArray {
+//   int64_t length;
+//   int64_t null_count;
+//   int64_t offset;
+//   int64_t n_buffers;
+//   int64_t n_children;
+//   const void** buffers;
+//   struct ArrowArray** children;
+//   struct ArrowArray* dictionary;
+//
+//   void (*release)(struct ArrowArray*);
+//   void* private_data;
+// };
+
+// Throws std::runtime_error unless the array and the schema have the same
+// number of children at every level of nesting.
+void ArrowAdapter::_check_shapes(
+    ArrowArray* arrow_array, ArrowSchema* arrow_schema) {
+    if (arrow_array->n_children != arrow_schema->n_children) {
+        throw std::runtime_error(
+            "ArrowAdapter::_check_shapes: internal coding error: data/schema "
+            "mismatch");
+    }
+    for (int64_t i = 0; i < arrow_array->n_children; i++) {
+        _check_shapes(arrow_array->children[i], arrow_schema->children[i]);
+    }
+}
+
+// Returns the index of the top-level column named column_name. Throws
+// std::runtime_error if the table has no columns or no column has that name.
+int64_t ArrowAdapter::_get_column_index_from_name(
+    ArrowArray* arrow_array,
+    ArrowSchema* arrow_schema,
+    std::string column_name) {
+    // Make sure the child-count is the same
+    _check_shapes(arrow_array, arrow_schema);
+
+    if (arrow_schema->n_children == 0) {
+        throw std::runtime_error(
+            "ArrowAdapter::_get_column_index_from_name: internal coding "
+            "error: childless table");
+    }
+
+    for (int64_t i = 0; i < arrow_schema->n_children; i++) {
+        // Arrow permits a null field name; such a column can never match.
+        const char* name = arrow_schema->children[i]->name;
+        if (name != nullptr && column_name == name) {
+            return i;
+        }
+    }
+
+    throw std::runtime_error(fmt::format(
+        "ArrowAdapter::_get_column_index_from_name: column {} not found",
+        column_name));
+}
+
+// Copies the fixed-width values of the column at column_index into a
+// std::vector<T>. T must match the column's physical element type; this is
+// not checked here. Throws std::runtime_error if T is std::string, if the
+// index is out of bounds, or if the column has a validity buffer or no data
+// buffer.
+template <typename T>
+std::vector<T> ArrowAdapter::get_table_column_by_index(
+    ArrowArray* arrow_array, ArrowSchema* arrow_schema, int64_t column_index) {
+    // Make sure the child-count is the same
+    _check_shapes(arrow_array, arrow_schema);
+
+    if (std::is_same_v<T, std::string>) {
+        throw std::runtime_error(
+            "ArrowAdapter::get_table_column_by_index: template-specialization "
+            "failure.");
+    }
+
+    ArrowArray* child = _get_and_check_column(arrow_array, column_index, 2);
+
+    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
+    // validity buffers. If this class needs to be extended someday to support
+    // arrow-nulls, we can do that.
+    if (child->buffers[0] != nullptr) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_column_by_index: column index {} "
+            "has validity buffer unsupported here",
+            column_index));
+    }
+
+    const void* vdata = child->buffers[1];
+    if (vdata == nullptr) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_column_by_index: column index {} "
+            "has null data buffer",
+            column_index));
+    }
+
+    // Build from an iterator pair: (pointer, length) does not select any
+    // std::vector constructor. The logical values start child->offset
+    // elements into the physical buffer.
+    const T* data = static_cast<const T*>(vdata) + child->offset;
+    return std::vector<T>(data, data + child->length);
+}
+
+// Copies the values of the large-string (Arrow format "U") column at
+// column_index into a std::vector<std::string>. Throws std::runtime_error if
+// the index is out of bounds, the column has another format, has a validity
+// buffer, or lacks an offsets or data buffer.
+std::vector<std::string> ArrowAdapter::get_table_string_column_by_index(
+    ArrowArray* arrow_array, ArrowSchema* arrow_schema, int64_t column_index) {
+    // Make sure the child-count is the same
+    _check_shapes(arrow_array, arrow_schema);
+
+    ArrowArray* child = _get_and_check_column(arrow_array, column_index, 3);
+
+    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
+    // validity buffers. If this class needs to be extended someday to support
+    // arrow-nulls, we can do that.
+    if (child->buffers[0] != nullptr) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_string_column_by_index: column index {} "
+            "has validity buffer unsupported here",
+            column_index));
+    }
+
+    const char* data = static_cast<const char*>(child->buffers[2]);
+    if (data == nullptr) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_string_column_by_index: column index {} "
+            "has null data buffer",
+            column_index));
+    }
+
+    if (strcmp(arrow_schema->children[column_index]->format, "U") != 0) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_string_column_by_index: column index {} "
+            "has format {}; expected \"U\"",
+            column_index,
+            arrow_schema->children[column_index]->format));
+    }
+
+    // Format "U" means 64-bit offsets; skip the array's logical offset.
+    const uint64_t* offsets = static_cast<const uint64_t*>(child->buffers[1]);
+    if (offsets == nullptr) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::get_table_string_column_by_index: column index {} "
+            "has null offsets buffer",
+            column_index));
+    }
+    offsets += child->offset;
+
+    // Reserve rather than pre-size: pre-sizing and then appending would
+    // return num_cells empty strings ahead of the real values.
+    const int64_t num_cells = child->length;
+    std::vector<std::string> retval;
+    retval.reserve(static_cast<size_t>(num_cells));
+    for (int64_t j = 0; j < num_cells; j++) {
+        retval.emplace_back(&data[offsets[j]], &data[offsets[j + 1]]);
+    }
+
+    return retval;
+}
+
+// Like get_table_column_by_index, but selects the column by name.
+template <typename T>
+std::vector<T> ArrowAdapter::get_table_column_by_name(
+    ArrowArray* arrow_array,
+    ArrowSchema* arrow_schema,
+    std::string column_name) {
+    int64_t index = _get_column_index_from_name(
+        arrow_array, arrow_schema, column_name);
+    return get_table_column_by_index<T>(arrow_array, arrow_schema, index);
+}
+
+// Like get_table_string_column_by_index, but selects the column by name.
+std::vector<std::string> ArrowAdapter::get_table_string_column_by_name(
+    ArrowArray* arrow_array,
+    ArrowSchema* arrow_schema,
+    std::string column_name) {
+    int64_t index = _get_column_index_from_name(
+        arrow_array, arrow_schema, column_name);
+    return get_table_string_column_by_index(arrow_array, arrow_schema, index);
+}
+
+// Returns the column at column_index after checking that the index is in
+// bounds, that the column has no children, and that it has exactly
+// expected_n_buffers buffers (2 for non-string data, 3 for string data).
+// Throws std::runtime_error otherwise.
+ArrowArray* ArrowAdapter::_get_and_check_column(
+    ArrowArray* arrow_array, int64_t column_index, int64_t expected_n_buffers) {
+    if (column_index < 0 || column_index >= arrow_array->n_children) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::_get_and_check_column: column index {} out of "
+            "bounds {}..{}",
+            column_index,
+            0,
+            arrow_array->n_children - 1));
+    }
+
+    ArrowArray* child = arrow_array->children[column_index];
+
+    if (child->n_children != 0) {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::_get_and_check_column: column index {} is "
+            "non-terminal",
+            column_index));
+    }
+
+    if (expected_n_buffers == 2) {
+        if (child->n_buffers != 2) {
+            throw std::runtime_error(fmt::format(
+                "ArrowAdapter::_get_and_check_column: column index {} "
+                "has buffer count {}; expected 2 for non-string data",
+                column_index,
+                child->n_buffers));
+        }
+
+    } else if (expected_n_buffers == 3) {
+        if (child->n_buffers != 3) {
+            throw std::runtime_error(fmt::format(
+                "ArrowAdapter::_get_and_check_column: column index {} "
+                "has buffer count {}; expected 3 for string data",
+                column_index,
+                child->n_buffers));
+        }
+
+    } else {
+        throw std::runtime_error(fmt::format(
+            "ArrowAdapter::_get_and_check_column: internal coding error: "
+            "expected_n_buffers {} is "
+            "neither 2 nor 3.",
+            expected_n_buffers));
+    }
+
+    return child;
+}
+
 }  // namespace tiledbsoma
diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h
index 9e7f46e65d..cd4fd2b1bb 100644
--- a/libtiledbsoma/src/utils/arrow_adapter.h
+++ b/libtiledbsoma/src/utils/arrow_adapter.h
@@ -397,6 +397,33 @@ class ArrowAdapter {
     // XXX TODO: column-getters by index or by name, as std::vector
     // as test access points
 
+    // Returns a copy of the fixed-width column at column_index. Throws
+    // std::runtime_error on a bad index, a validity buffer, or no data buffer.
+    template <typename T>
+    static std::vector<T> get_table_column_by_index(
+        ArrowArray* arrow_array,
+        ArrowSchema* arrow_schema,
+        int64_t column_index);
+
+    // Returns a copy of the large-string (format "U") column at column_index.
+    static std::vector<std::string> get_table_string_column_by_index(
+        ArrowArray* arrow_array,
+        ArrowSchema* arrow_schema,
+        int64_t column_index);
+
+    // As get_table_column_by_index, selecting the column by name.
+    template <typename T>
+    static std::vector<T> get_table_column_by_name(
+        ArrowArray* arrow_array,
+        ArrowSchema* arrow_schema,
+        std::string column_name);
+
+    // As get_table_string_column_by_index, selecting the column by name.
+    static std::vector<std::string> get_table_string_column_by_name(
+        ArrowArray* arrow_array,
+        ArrowSchema* arrow_schema,
+        std::string column_name);
+
    private:
     static std::pair _get_data_and_length(
         Enumeration& enmr, const void* dst);
@@ -459,6 +486,24 @@ class ArrowAdapter {
 
     static tiledb_layout_t _get_order(std::string order);
 
+    // Throws if the array and the schema don't have the same
+    // recursive child-counts.
+    static void _check_shapes(
+        ArrowArray* arrow_array, ArrowSchema* arrow_schema);
+
+    // Throws if the table doesn't have the column name.
+    static int64_t _get_column_index_from_name(
+        ArrowArray* arrow_array,
+        ArrowSchema* arrow_schema,
+        std::string column_name);
+
+    // Returns the column at column_index, throwing unless it is a leaf with
+    // exactly expected_n_buffers (2 or 3) buffers.
+    static ArrowArray* _get_and_check_column(
+        ArrowArray* arrow_array,
+        int64_t column_index,
+        int64_t expected_n_buffers);
+
 };  // class ArrowAdapter
 };  // namespace tiledbsoma
 