Skip to content

Commit

Permalink
temp [skip ci]
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 18, 2024
1 parent 038092e commit dfed646
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 1 deletion.
213 changes: 212 additions & 1 deletion libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
* This file defines the ArrowAdapter class.
*/

#include "arrow_adapter.h"
#include "../soma/column_buffer.h"
#include "arrow_adapter.h"
#include "logger.h"

namespace tiledbsoma {
Expand Down Expand Up @@ -1309,4 +1309,215 @@ std::unique_ptr<ArrowArray> ArrowAdapter::make_arrow_array_parent(
return arrow_array;
}

// struct ArrowSchema {
// const char* format;
// const char* name;
// const char* metadata;
// int64_t flags;
// int64_t n_children;
// struct ArrowSchema** children;
// struct ArrowSchema* dictionary;
//
// void (*release)(struct ArrowSchema*);
// void* private_data;
// };

// struct ArrowArray {
// int64_t length;
// int64_t null_count;
// int64_t offset;
// int64_t n_buffers;
// int64_t n_children;
// const void** buffers;
// struct ArrowArray** children;
// struct ArrowArray* dictionary;
//
// void (*release)(struct ArrowArray*);
// void* private_data;
// };

/**
 * Recursively verifies that an Arrow array node and its paired schema node
 * have the same number of children at every level of nesting.
 *
 * @param arrow_array Array node to inspect.
 * @param arrow_schema Schema node paired with `arrow_array`.
 * @throws std::runtime_error if the child counts differ at any level.
 */
void ArrowAdapter::_check_shapes(
    ArrowArray* arrow_array, ArrowSchema* arrow_schema) {
    const int64_t num_children = arrow_array->n_children;

    if (num_children != arrow_schema->n_children) {
        throw std::runtime_error(
            "ArrowAdapter::_check_shapes: internal coding error: data/schema "
            "mismatch");
    }

    // Descend pairwise into each (array child, schema child).
    int64_t child_index = 0;
    while (child_index < num_children) {
        _check_shapes(
            arrow_array->children[child_index],
            arrow_schema->children[child_index]);
        ++child_index;
    }
}

/**
 * Finds the zero-based index of the first top-level column whose schema name
 * equals `column_name`.
 *
 * @param arrow_array Table-level Arrow array; only used for the shape check.
 * @param arrow_schema Table-level Arrow schema whose children are searched.
 * @param column_name Name of the column to look up.
 * @return Index of the matching child within `arrow_schema->children`.
 * @throws std::runtime_error if the array and schema shapes disagree, if the
 * schema has no children, or if no child has the requested name.
 */
int64_t ArrowAdapter::_get_column_index_from_name(
    ArrowArray* arrow_array,
    ArrowSchema* arrow_schema,
    std::string column_name) {
    // Make sure the child-count is the same
    _check_shapes(arrow_array, arrow_schema);

    if (arrow_schema->n_children == 0) {
        throw std::runtime_error(
            "ArrowAdapter::_get_column_index_from_name: internal coding "
            "error: childless table");
    }

    for (int64_t i = 0; i < arrow_schema->n_children; i++) {
        // The Arrow C data interface allows a schema's name to be NULL;
        // such a child can never match, and strcmp on it would be undefined.
        const char* child_name = arrow_schema->children[i]->name;
        if (child_name != nullptr &&
            strcmp(child_name, column_name.c_str()) == 0) {
            return i;
        }
    }

    throw std::runtime_error(fmt::format(
        "ArrowAdapter::_get_column_index_from_name: column {} not found",
        column_name));
}

/**
 * Copies the cells of one fixed-width (non-string) top-level column of an
 * Arrow table into a std::vector<T>.
 *
 * The caller chooses T; the bytes of the column's data buffer are
 * reinterpreted as T without consulting the schema's format string.
 *
 * @tparam T Element type of the column. Must not be std::string; use
 * get_table_string_column_by_index for string columns.
 * @param arrow_array Table-level Arrow array.
 * @param arrow_schema Table-level Arrow schema paired with `arrow_array`.
 * @param column_index Zero-based index of the column to read.
 * @return A vector holding `length` elements copied from the data buffer.
 * @throws std::runtime_error if T is std::string, the array/schema shapes
 * disagree, the index is out of bounds, the column has children, the column
 * does not have exactly two buffers, a validity buffer is present, or the
 * data buffer is null.
 */
template <typename T>
std::vector<T> ArrowAdapter::get_table_column_by_index(
    ArrowArray* arrow_array, ArrowSchema* arrow_schema, int64_t column_index) {
    // Make sure the child-count is the same
    _check_shapes(arrow_array, arrow_schema);

    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_column_by_index: template-specialization "
            "failure.");
    }

    ArrowArray* child = _get_and_check_column(arrow_array, column_index, 2);

    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
    // validity buffers. If this class needs to be extended someday to support
    // arrow-nulls, we can do that.
    if (child->buffers[0] != nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_column_by_index: column index {} "
            "has a validity buffer, which is unsupported here",
            column_index));
    }

    const void* vdata = child->buffers[1];
    if (vdata == nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_column_by_index: column index {} "
            "has null data buffer",
            column_index));
    }

    // NOTE(review): child->offset is not applied here; this assumes the
    // column starts at the beginning of its data buffer -- confirm with
    // callers.
    const T* data = static_cast<const T*>(vdata);

    // Iterator-pair construction copies exactly `length` elements. The
    // element type must be spelled out: (pointer, count) matches no
    // std::vector constructor or deduction guide.
    return std::vector<T>(data, data + child->length);
}

/**
 * Copies the cells of one string column of an Arrow table into a
 * std::vector<std::string>, one entry per cell.
 *
 * Only columns whose schema format is "U" are accepted; offsets are read as
 * 64-bit values from buffer 1 and character data from buffer 2.
 *
 * @param arrow_array Table-level Arrow array.
 * @param arrow_schema Table-level Arrow schema paired with `arrow_array`.
 * @param column_index Zero-based index of the column to read.
 * @return A vector of `length` strings, in cell order.
 * @throws std::runtime_error if the array/schema shapes disagree, the index
 * is out of bounds, the column has children, the column does not have exactly
 * three buffers, a validity buffer is present, the data or offsets buffer is
 * null, or the schema format is not "U".
 */
std::vector<std::string> ArrowAdapter::get_table_string_column_by_index(
    ArrowArray* arrow_array, ArrowSchema* arrow_schema, int64_t column_index) {
    // Make sure the child-count is the same
    _check_shapes(arrow_array, arrow_schema);

    ArrowArray* child = _get_and_check_column(arrow_array, column_index, 3);

    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
    // validity buffers. If this class needs to be extended someday to support
    // arrow-nulls, we can do that.
    if (child->buffers[0] != nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_string_column_by_index: column index {} "
            "has a validity buffer, which is unsupported here",
            column_index));
    }

    const char* data = static_cast<const char*>(child->buffers[2]);
    if (data == nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_string_column_by_index: column index {} "
            "has null data buffer",
            column_index));
    }

    // _check_shapes plus the bounds check in _get_and_check_column guarantee
    // that column_index is also a valid index into the schema's children.
    const char* format = arrow_schema->children[column_index]->format;
    if (strcmp(format, "U") != 0) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_string_column_by_index: column index {} "
            "has format {}; expected \"U\"",
            column_index,
            format));
    }

    const uint64_t* offsets = static_cast<const uint64_t*>(child->buffers[1]);
    if (offsets == nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::get_table_string_column_by_index: column index {} "
            "has null offsets buffer",
            column_index));
    }

    // NOTE(review): child->offset is not applied here; this assumes the
    // column starts at the beginning of its buffers -- confirm with callers.
    const int64_t num_cells = child->length;

    // Reserve rather than size-construct: appending to a pre-sized vector
    // would leave num_cells empty strings ahead of the real values.
    std::vector<std::string> retval;
    retval.reserve(static_cast<size_t>(num_cells));
    for (int64_t j = 0; j < num_cells; j++) {
        // Cell j occupies the half-open byte range [offsets[j], offsets[j+1]).
        retval.emplace_back(data + offsets[j], data + offsets[j + 1]);
    }

    return retval;
}

/**
 * Name-based convenience wrapper: resolves `column_name` to a column index
 * and returns that column via get_table_column_by_index<T>.
 *
 * @tparam T Element type of the column.
 * @param arrow_array Table-level Arrow array.
 * @param arrow_schema Table-level Arrow schema paired with `arrow_array`.
 * @param column_name Name of the column to read.
 * @return The column's cells as a std::vector<T>.
 * @throws std::runtime_error on any failure raised by the name lookup or by
 * the index-based getter.
 */
template <typename T>
std::vector<T> ArrowAdapter::get_table_column_by_name(
    ArrowArray* arrow_array,
    ArrowSchema* arrow_schema,
    std::string column_name) {
    return get_table_column_by_index<T>(
        arrow_array,
        arrow_schema,
        _get_column_index_from_name(arrow_array, arrow_schema, column_name));
}

/**
 * Name-based convenience wrapper: resolves `column_name` to a column index
 * and returns that column via get_table_string_column_by_index.
 *
 * @param arrow_array Table-level Arrow array.
 * @param arrow_schema Table-level Arrow schema paired with `arrow_array`.
 * @param column_name Name of the string column to read.
 * @return The column's cells as a std::vector<std::string>.
 * @throws std::runtime_error on any failure raised by the name lookup or by
 * the index-based getter.
 */
std::vector<std::string> ArrowAdapter::get_table_string_column_by_name(
    ArrowArray* arrow_array,
    ArrowSchema* arrow_schema,
    std::string column_name) {
    return get_table_string_column_by_index(
        arrow_array,
        arrow_schema,
        _get_column_index_from_name(arrow_array, arrow_schema, column_name));
}

/**
 * Fetches one top-level column of an Arrow table after validating its layout.
 *
 * @param arrow_array Table-level Arrow array whose children are the columns.
 * @param column_index Zero-based index of the column to fetch.
 * @param expected_n_buffers Required buffer count for the column: 2 for
 * non-string data or 3 for string data. Any other value is a coding error.
 * @return Pointer to the child array at `column_index`.
 * @throws std::runtime_error if the index is out of bounds, the column itself
 * has children, `expected_n_buffers` is neither 2 nor 3, or the column's
 * buffer count differs from `expected_n_buffers`.
 */
ArrowArray* ArrowAdapter::_get_and_check_column(
    ArrowArray* arrow_array, int64_t column_index, int64_t expected_n_buffers) {
    if (column_index < 0 || column_index >= arrow_array->n_children) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} out of "
            "bounds {}..{}",
            column_index,
            0,
            arrow_array->n_children - 1));
    }

    ArrowArray* child = arrow_array->children[column_index];

    if (child->n_children != 0) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} is "
            "non-terminal",
            column_index));
    }

    // Reject unsupported expectations before looking at the column, so a
    // caller bug is always reported as such.
    if (expected_n_buffers != 2 && expected_n_buffers != 3) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: internal coding error: "
            "expected_n_buffers {} is "
            "neither 2 nor 3.",
            expected_n_buffers));
    }

    if (child->n_buffers != expected_n_buffers) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} "
            "has buffer count {}; expected {} for {} data",
            column_index,
            child->n_buffers,
            expected_n_buffers,
            expected_n_buffers == 2 ? "non-string" : "string"));
    }

    return child;
}

} // namespace tiledbsoma
38 changes: 38 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,28 @@ class ArrowAdapter {
// XXX TODO: column-getters by index or by name, as std::vector<T>
// as test access points

// Returns the cells of one non-string column of an Arrow table, selected by
// zero-based column index, as a std::vector<T>. The caller supplies T.
//
// Throws std::runtime_error if T is std::string, if the array and schema
// child-counts disagree, if column_index is out of bounds, if the column has
// children of its own, if the column does not have exactly two buffers, if
// the column has a validity buffer, or if its data buffer is null.
template <typename T>
static std::vector<T> get_table_column_by_index(
ArrowArray* arrow_array,
ArrowSchema* arrow_schema,
int64_t column_index);

// Returns the cells of one string column of an Arrow table, selected by
// zero-based column index, as a std::vector<std::string>.
//
// Throws std::runtime_error if the array and schema child-counts disagree,
// if column_index is out of bounds, if the column has children of its own,
// if the column does not have exactly three buffers, if the column has a
// validity buffer, if its data buffer is null, or if its schema format is
// not "U".
static std::vector<std::string> get_table_string_column_by_index(
ArrowArray* arrow_array,
ArrowSchema* arrow_schema,
int64_t column_index);

// Same as get_table_column_by_index, but the column is selected by name:
// the name is resolved to an index against the schema's children first.
//
// Throws std::runtime_error if no column has the given name, in addition to
// the conditions under which get_table_column_by_index throws.
template <typename T>
static std::vector<T> get_table_column_by_name(
ArrowArray* arrow_array,
ArrowSchema* arrow_schema,
std::string column_name);

// Same as get_table_string_column_by_index, but the column is selected by
// name: the name is resolved to an index against the schema's children first.
//
// Throws std::runtime_error if no column has the given name, in addition to
// the conditions under which get_table_string_column_by_index throws.
static std::vector<std::string> get_table_string_column_by_name(
ArrowArray* arrow_array,
ArrowSchema* arrow_schema,
std::string column_name);

private:
static std::pair<const void*, std::size_t> _get_data_and_length(
Enumeration& enmr, const void* dst);
Expand Down Expand Up @@ -459,6 +481,22 @@ class ArrowAdapter {

static tiledb_layout_t _get_order(std::string order);

// Throws std::runtime_error if the array and the schema don't have the same
// recursive child-counts: n_children is compared at this level, and then the
// check is repeated pairwise on each (array child, schema child).
static void _check_shapes(
ArrowArray* arrow_array, ArrowSchema* arrow_schema);

// Returns the zero-based index of the first schema child whose name equals
// column_name. Throws std::runtime_error if the table doesn't have the
// column name, if the schema has no children, or if the array and schema
// child-counts disagree.
static int64_t _get_column_index_from_name(
ArrowArray* arrow_array,
ArrowSchema* arrow_schema,
std::string column_name);

// Returns the child array at column_index after validating it. Throws
// std::runtime_error if column_index is out of bounds, if the child has
// children of its own, if the child's buffer count differs from
// expected_n_buffers, or if expected_n_buffers is neither 2 (non-string
// data) nor 3 (string data).
static ArrowArray* _get_and_check_column(
ArrowArray* arrow_array,
int64_t column_index,
int64_t expected_n_buffers);

}; // class ArrowAdapter

}; // namespace tiledbsoma
Expand Down

0 comments on commit dfed646

Please sign in to comment.