From 6613db061ba2f6aac0d5f98f1db63eaeb5f162fc Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 2 Nov 2023 18:46:26 -0500 Subject: [PATCH] Keep logger.h internal; do not install --- apis/python/setup.py | 8 +- apis/python/src/tiledbsoma/pytiledbsoma.cc | 7 +- libtiledbsoma/src/CMakeLists.txt | 3 +- libtiledbsoma/src/cli/cli.cc | 2 + libtiledbsoma/src/soma/array_buffers.cc | 58 +++++ libtiledbsoma/src/soma/array_buffers.h | 19 +- libtiledbsoma/src/soma/column_buffer.cc | 5 + libtiledbsoma/src/soma/column_buffer.h | 5 +- libtiledbsoma/src/soma/managed_query.cc | 11 +- libtiledbsoma/src/soma/managed_query.h | 9 +- libtiledbsoma/src/soma/soma_array.cc | 2 +- libtiledbsoma/src/soma/soma_array.h | 41 ++-- libtiledbsoma/src/utils/arrow_adapter.cc | 264 +++++++++++++++++++++ libtiledbsoma/src/utils/arrow_adapter.h | 233 +----------------- libtiledbsoma/test/unit_soma_array.cc | 50 ++-- libtiledbsoma/test/unit_soma_group.cc | 9 +- 16 files changed, 406 insertions(+), 320 deletions(-) create mode 100644 libtiledbsoma/src/soma/array_buffers.cc create mode 100644 libtiledbsoma/src/utils/arrow_adapter.cc diff --git a/apis/python/setup.py b/apis/python/setup.py index 38ce2474a4..ceabdbc2cd 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -184,13 +184,9 @@ def run(self): INC_DIRS = [ "dist_links/libtiledbsoma/include", - "dist_links/libtiledbsoma/external/include", - "../../build/externals/install/include", str(libtiledbsoma_dir / "include"), - str( - "./src/tiledbsoma" - ), # since pytiledbsoma.cc does #include of query_condition.cc - str(libtiledbsoma_dir.parent / "build/externals/install/include"), + # since pytiledbsoma.cc does #include of query_condition.cc + str("./src/tiledbsoma"), str(tiledb_dir / "include"), ] diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index 3bc48fb3ca..a489d2e964 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -474,10 +474,9 @@ PYBIND11_MODULE(pytiledbsoma, m) { reader.set_dim_points( dim, coords.cast>()); } else { - throw TileDBSOMAError(fmt::format( - "[pytiledbsoma] set_dim_points: type={} not " - "supported", - arrow_schema.format)); + throw TileDBSOMAError( + "[pytiledbsoma] set_dim_points: type=" + std::string(arrow_schema.format) + " not " + "supported"); } // Release arrow schema diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 90d7b41be8..3521070473 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -52,7 +52,9 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/logger.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.cc @@ -201,7 +203,6 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/common.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/logger.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.h diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index 53cc754913..5d16c753fe 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -33,6 +33,8 @@ #include "soma/enums.h" #include "soma/soma_array.h" #include "utils/arrow_adapter.h" +#include "utils/carrow.h" +#include "utils/logger.h" using namespace tiledbsoma; diff --git a/libtiledbsoma/src/soma/array_buffers.cc b/libtiledbsoma/src/soma/array_buffers.cc new file mode 100644 index 0000000000..a55d107c59 --- /dev/null +++ b/libtiledbsoma/src/soma/array_buffers.cc @@ -0,0 +1,58 @@ +/** + * @file array_buffers.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2022 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the ArrayBuffers class. + */ + +#include "array_buffers.h" +#include "../utils/logger.h" + +namespace tiledbsoma { + +using namespace tiledb; + +std::shared_ptr ArrayBuffers::at(const std::string& name) { + if (!contains(name)) { + throw TileDBSOMAError( + fmt::format("[ArrayBuffers] column '{}' does not exist", name)); + } + return buffers_[name]; +} + +void ArrayBuffers::emplace( + const std::string& name, std::shared_ptr buffer) { + if (contains(name)) { + throw TileDBSOMAError( + fmt::format("[ArrayBuffers] column '{}' already exists", name)); + } + names_.push_back(name); + buffers_.emplace(name, buffer); +} + +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/array_buffers.h b/libtiledbsoma/src/soma/array_buffers.h index 0e2fc43c55..202c306061 100644 --- a/libtiledbsoma/src/soma/array_buffers.h +++ b/libtiledbsoma/src/soma/array_buffers.h @@ -38,7 +38,6 @@ #include #include "../utils/common.h" -#include "../utils/logger.h" #include "column_buffer.h" namespace tiledbsoma { @@ -58,13 +57,7 @@ class ArrayBuffers { * @param name Column name * @return std::shared_ptr Column buffer */ - std::shared_ptr at(const std::string& name) { - if (!contains(name)) { - throw TileDBSOMAError( - fmt::format("[ArrayBuffers] column '{}' does not exist", name)); - } - return buffers_[name]; - } + std::shared_ptr at(const std::string& name); /** * @brief Return true if a buffer with the given name exists. @@ -83,15 +76,7 @@ class ArrayBuffers { * @param name Column name * @param buffer Column buffer */ - void emplace( - const std::string& name, std::shared_ptr buffer) { - if (contains(name)) { - throw TileDBSOMAError( - fmt::format("[ArrayBuffers] column '{}' already exists", name)); - } - names_.push_back(name); - buffers_.emplace(name, buffer); - } + void emplace(const std::string& name, std::shared_ptr buffer); /** * @brief Returns the ordered vector of names. diff --git a/libtiledbsoma/src/soma/column_buffer.cc b/libtiledbsoma/src/soma/column_buffer.cc index 034b12c80a..00e5e49af3 100644 --- a/libtiledbsoma/src/soma/column_buffer.cc +++ b/libtiledbsoma/src/soma/column_buffer.cc @@ -31,6 +31,7 @@ */ #include "column_buffer.h" +#include "../utils/logger.h" namespace tiledbsoma { @@ -152,6 +153,10 @@ ColumnBuffer::ColumnBuffer( } } +ColumnBuffer::~ColumnBuffer() { + LOG_TRACE(fmt::format("[ColumnBuffer] release '{}'", name_)); +} + void ColumnBuffer::attach(Query& query) { // We cannot use: // `set_data_buffer(const std::string& name, std::vector& buf)` diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 45f687cee1..500e90b79a 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -39,7 +39,6 @@ #include #include "../utils/common.h" -#include "../utils/logger.h" #include "span/span.hpp" namespace tiledbsoma { @@ -128,9 +127,7 @@ class ColumnBuffer { ColumnBuffer(const ColumnBuffer&) = delete; ColumnBuffer(ColumnBuffer&&) = default; - ~ColumnBuffer() { - LOG_TRACE(fmt::format("[ColumnBuffer] release '{}'", name_)); - } + ~ColumnBuffer(); /** * @brief Attach this ColumnBuffer to a TileDB query. diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 6f9e3e29dd..bf687b259f 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -33,7 +33,7 @@ #include "managed_query.h" #include #include -#include "logger_public.h" +#include "../utils/logger.h" #include "utils/common.h" namespace tiledbsoma { @@ -220,4 +220,13 @@ std::shared_ptr ManagedQuery::submit_read() { return buffers_; } +void ManagedQuery::check_column_name(const std::string& name) { + if (!buffers_->contains(name)) { + throw TileDBSOMAError(fmt::format( + "[ManagedQuery] Column '{}' is not available in the query " + "results.", + name)); + } +} + }; // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index cf7e3415d4..af603f8b2c 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -429,14 +429,7 @@ class ManagedQuery { * * @param name Column name */ - void check_column_name(const std::string& name) { - if (!buffers_->contains(name)) { - throw TileDBSOMAError(fmt::format( - "[ManagedQuery] Column '{}' is not available in the query " - "results.", - name)); - } - } + void check_column_name(const std::string& name); // TileDB array being queried. std::shared_ptr array_; diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index d5152ab4cb..34c865bd4a 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -32,8 +32,8 @@ #include "soma_array.h" #include +#include "../utils/logger.h" #include "../utils/util.h" -#include "logger_public.h" namespace tiledbsoma { using namespace tiledb; diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index a18c9fdbe4..0845cd694b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -40,6 +40,7 @@ #include #include #include "enums.h" +#include "logger_public.h" #include "managed_query.h" namespace tiledbsoma { @@ -251,12 +252,13 @@ class SOMAArray { int partition_count) { // Validate partition inputs if (partition_index >= partition_count) { - throw TileDBSOMAError(fmt::format( - "[SOMAArray] partition_index ({}) must be < " - "partition_count " - "({})", - partition_index, - partition_count)); + // TODO this use to be formatted with fmt::format which is part of + // internal header spd/log/fmt/fmt.h and should not be used. + // In C++20, this can be replaced with std::format. + std::ostringstream err; + err << "[SOMAArray] partition_index (" << partition_index + << ") must be < partition_count (" << partition_count; + throw TileDBSOMAError(err.str()); } if (partition_count > 1) { @@ -268,19 +270,17 @@ class SOMAArray { partition_size = points.size() - start; } - LOG_DEBUG(fmt::format( - "[SOMAArray] set_dim_points partitioning: sizeof(T)={} " - "dim={} " - "index={} " - "count={} " - "range=[{}, {}] of {} points", - sizeof(T), - dim, - partition_index, - partition_count, - start, - start + partition_size - 1, - points.size())); + // TODO this use to be formatted with fmt::format which is part of + // internal header spd/log/fmt/fmt.h and should not be used. + // In C++20, this can be replaced with std::format. + std::ostringstream log_dbg; + log_dbg << "[SOMAArray] set_dim_points partitioning:" + << " sizeof(T)=" << sizeof(T) << " dim=" << dim + << " index=" << partition_index + << " count=" << partition_count << " range =[" << start + << ", " << start + partition_size - 1 << "] of " + << points.size() << "points"; + LOG_DEBUG(log_dbg.str()); mq_->select_points( dim, tcb::span{&points[start], partition_size}); @@ -301,7 +301,8 @@ class SOMAArray { template void set_dim_points(const std::string& dim, const std::vector& points) { LOG_DEBUG( - fmt::format("[SOMAArray] set_dim_points: sizeof(T)={}", sizeof(T))); + "[SOMAArray] set_dim_points: sizeof(T)=" + + std::to_string(sizeof(T))); mq_->select_points(dim, points); } diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc new file mode 100644 index 0000000000..dae67ec7b3 --- /dev/null +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -0,0 +1,264 @@ +/** + * @file arrow_adapter.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2022 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the ArrowAdapter class. + */ + +#include "arrow_adapter.h" +#include "../soma/column_buffer.h" +#include "../utils/logger.h" + +namespace tiledbsoma { + +using namespace tiledb; + +void ArrowAdapter::release_schema(struct ArrowSchema* schema) { + schema->release = nullptr; + + struct ArrowSchema* dict = schema->dictionary; + if (dict != nullptr) { + if (dict->format != nullptr) { + free((void*)dict->format); + dict->format = nullptr; + } + if (dict->release != nullptr) { + delete dict; + dict = nullptr; + } + } + + LOG_TRACE("[ArrowAdapter] release_schema"); +} + +void ArrowAdapter::release_array(struct ArrowArray* array) { + auto arrow_buffer = static_cast(array->private_data); + + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); + + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; + + if (array->buffers != nullptr) { + free(array->buffers); + } + + struct ArrowArray* dict = array->dictionary; + if (dict != nullptr) { + if (dict->buffers != nullptr) { + free(dict->buffers); + dict->buffers = nullptr; + } + if (dict->release != nullptr) { + delete dict; + dict = nullptr; + } + } + + array->release = nullptr; +} + +std::pair, std::unique_ptr> +ArrowAdapter::to_arrow(std::shared_ptr column, bool use_enum) { + std::unique_ptr schema = std::make_unique(); + std::unique_ptr array = std::make_unique(); + + schema->format = to_arrow_format(column->type()).data(); // mandatory + schema->name = column->name().data(); // optional + schema->metadata = nullptr; // optional + schema->flags = 0; // optional + schema->n_children = 0; // mandatory + schema->children = nullptr; // optional + schema->dictionary = nullptr; // optional + schema->release = &release_schema; // mandatory + schema->private_data = nullptr; // optional + + int n_buffers = column->is_var() ? 3 : 2; + + // Create an ArrowBuffer to manage the lifetime of `column`. + // - `arrow_buffer` holds a shared_ptr to `column`, which increments + // the use count and keeps the ColumnBuffer data alive. + // - When the arrow array is released, `array->release()` is called with + // `arrow_buffer` in `private_data`. `arrow_buffer` is deleted, which + // decrements the the `column` use count. When the `column` use count + // reaches 0, the ColumnBuffer data will be deleted. + auto arrow_buffer = new ArrowBuffer(column); + + array->length = column->size(); // mandatory + array->null_count = 0; // mandatory + array->offset = 0; // mandatory + array->n_buffers = n_buffers; // mandatory + array->n_children = 0; // mandatory + array->buffers = nullptr; // mandatory + array->children = nullptr; // optional + array->dictionary = nullptr; // optional + array->release = &release_array; // mandatory + array->private_data = (void*)arrow_buffer; // mandatory + + LOG_TRACE(fmt::format( + "[ArrowAdapter] create array name='{}' use_count={}", + column->name(), + column.use_count())); + + array->buffers = (const void**)malloc(sizeof(void*) * n_buffers); + assert(array->buffers != nullptr); + array->buffers[0] = nullptr; // validity + array->buffers[n_buffers - 1] = column->data().data(); // data + if (n_buffers == 3) { + array->buffers[1] = column->offsets().data(); // offsets + } + + if (column->is_nullable()) { + schema->flags |= ARROW_FLAG_NULLABLE; + + // Count nulls + for (auto v : column->validity()) { + array->null_count += v == 0; + } + + // Convert validity bytemap to a bitmap in place + column->validity_to_bitmap(); + array->buffers[0] = column->validity().data(); + } + if (column->is_ordered()) { + schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; + } + + /* Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean. */ + if (column->type() == TILEDB_BOOL) { + column->data_to_bitmap(); + } + + // If we have an enumeration, fill a dictionary. + // The Python callpath handles this separately. The R callpath needs us + // to do this. TODO: uniformize this at the callsites. + if (column->has_enumeration() && use_enum) { + auto enumvec = column->get_enumeration(); + + ArrowSchema* dict_sch = new ArrowSchema; + ArrowArray* dict_arr = new ArrowArray; + + dict_sch->format = (const char*)malloc( + sizeof(char) * 2); // mandatory, 'u' as 32bit indexing + strcpy((char*)dict_sch->format, "u"); + dict_sch->name = nullptr; // optional in dictionary + dict_sch->metadata = nullptr; // optional + dict_sch->flags = 0; // optional + dict_sch->n_children = 0; // mandatory + dict_sch->children = nullptr; // optional + dict_sch->dictionary = nullptr; // optional + dict_sch->release = &release_schema; // mandatory + dict_sch->private_data = nullptr; // optional + + const int n_buf = 3; // always variable here + + const int64_t n_vec = enumvec.size(); + dict_arr->length = n_vec; // mandatory + dict_arr->null_count = 0; // mandatory + dict_arr->offset = 0; // mandatory + dict_arr->n_buffers = n_buf; // mandatory + dict_arr->n_children = 0; // mandatory + dict_arr->buffers = nullptr; // mandatory + dict_arr->children = nullptr; // optional + dict_arr->dictionary = nullptr; // optional + dict_arr->release = &release_array; // release from parent + dict_arr->private_data = nullptr; // optional here + + column->convert_enumeration(); + dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); + dict_arr->buffers[0] = nullptr; // validity: none here + dict_arr->buffers[1] = column->enum_offsets().data(); + dict_arr->buffers[2] = column->enum_string().data(); + + schema->dictionary = dict_sch; + array->dictionary = dict_arr; + } + + return std::pair(std::move(array), std::move(schema)); +} + +std::string_view ArrowAdapter::to_arrow_format(tiledb_datatype_t datatype) { + switch (datatype) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + return "U"; // large because TileDB uses 64bit offsets + case TILEDB_CHAR: + case TILEDB_BLOB: + return "Z"; // large because TileDB uses 64bit offsets + case TILEDB_BOOL: + return "b"; + case TILEDB_INT32: + return "i"; + case TILEDB_INT64: + return "l"; + case TILEDB_FLOAT32: + return "f"; + case TILEDB_FLOAT64: + return "g"; + case TILEDB_INT8: + return "c"; + case TILEDB_UINT8: + return "C"; + case TILEDB_INT16: + return "s"; + case TILEDB_UINT16: + return "S"; + case TILEDB_UINT32: + return "I"; + case TILEDB_UINT64: + return "L"; + case TILEDB_TIME_SEC: + return "tts"; + case TILEDB_TIME_MS: + return "ttm"; + case TILEDB_TIME_US: + return "ttu"; + case TILEDB_TIME_NS: + return "ttn"; + case TILEDB_DATETIME_SEC: + return "tss:"; + case TILEDB_DATETIME_MS: + return "tsm:"; + case TILEDB_DATETIME_US: + return "tsu:"; + case TILEDB_DATETIME_NS: + return "tsn:"; + default: + break; + } + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype: {} ", + tiledb::impl::type_to_str(datatype))); +} + +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 2588bd8600..efd6b4035b 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -2,18 +2,16 @@ #define ARROW_ADAPTER_H #include -#include "../soma/column_buffer.h" -#include "../utils/logger.h" -#ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED -#include "carrow.h" -#endif // https://arrow.apache.org/docs/format/CDataInterface.html // https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout // https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-simple-int32-array +#include "carrow.h" namespace tiledbsoma { +class ColumnBuffer; + /** * @brief The ArrowBuffer holds a shared pointer to a ColumnBuffer, which * manages the lifetime of a ColumnBuffer used to back an Arrow array. @@ -31,179 +29,16 @@ struct ArrowBuffer { class ArrowAdapter { public: - static void release_schema(struct ArrowSchema* schema) { - schema->release = nullptr; - - struct ArrowSchema* dict = schema->dictionary; - if (dict != nullptr) { - if (dict->format != nullptr) { - free((void*)dict->format); - dict->format = nullptr; - } - if (dict->release != nullptr) { - delete dict; - dict = nullptr; - } - } - - LOG_TRACE("[ArrowAdapter] release_schema"); - } - - static void release_array(struct ArrowArray* array) { - auto arrow_buffer = static_cast(array->private_data); - - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; - - if (array->buffers != nullptr) { - free(array->buffers); - } - - struct ArrowArray* dict = array->dictionary; - if (dict != nullptr) { - if (dict->buffers != nullptr) { - free(dict->buffers); - dict->buffers = nullptr; - } - if (dict->release != nullptr) { - delete dict; - dict = nullptr; - } - } - - array->release = nullptr; - } + static void release_schema(struct ArrowSchema* schema); + static void release_array(struct ArrowArray* array); /** * @brief Convert ColumnBuffer to an Arrow array. * * @return auto [Arrow array, Arrow schema] */ - static auto to_arrow( - std::shared_ptr column, bool use_enum = false) { - std::unique_ptr schema = std::make_unique(); - std::unique_ptr array = std::make_unique(); - - schema->format = to_arrow_format(column->type()).data(); // mandatory - schema->name = column->name().data(); // optional - schema->metadata = nullptr; // optional - schema->flags = 0; // optional - schema->n_children = 0; // mandatory - schema->children = nullptr; // optional - schema->dictionary = nullptr; // optional - schema->release = &release_schema; // mandatory - schema->private_data = nullptr; // optional - - int n_buffers = column->is_var() ? 3 : 2; - - // Create an ArrowBuffer to manage the lifetime of `column`. - // - `arrow_buffer` holds a shared_ptr to `column`, which increments - // the use count and keeps the ColumnBuffer data alive. - // - When the arrow array is released, `array->release()` is called with - // `arrow_buffer` in `private_data`. `arrow_buffer` is deleted, which - // decrements the the `column` use count. When the `column` use count - // reaches 0, the ColumnBuffer data will be deleted. - auto arrow_buffer = new ArrowBuffer(column); - - array->length = column->size(); // mandatory - array->null_count = 0; // mandatory - array->offset = 0; // mandatory - array->n_buffers = n_buffers; // mandatory - array->n_children = 0; // mandatory - array->buffers = nullptr; // mandatory - array->children = nullptr; // optional - array->dictionary = nullptr; // optional - array->release = &release_array; // mandatory - array->private_data = (void*)arrow_buffer; // mandatory - - LOG_TRACE(fmt::format( - "[ArrowAdapter] create array name='{}' use_count={}", - column->name(), - column.use_count())); - - array->buffers = (const void**)malloc(sizeof(void*) * n_buffers); - assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity - array->buffers[n_buffers - 1] = column->data().data(); // data - if (n_buffers == 3) { - array->buffers[1] = column->offsets().data(); // offsets - } - - if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; - - // Count nulls - for (auto v : column->validity()) { - array->null_count += v == 0; - } - - // Convert validity bytemap to a bitmap in place - column->validity_to_bitmap(); - array->buffers[0] = column->validity().data(); - } - if (column->is_ordered()) { - schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; - } - - /* Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean. */ - if (column->type() == TILEDB_BOOL) { - column->data_to_bitmap(); - } - - // If we have an enumeration, fill a dictionary. - // The Python callpath handles this separately. The R callpath needs us - // to do this. TODO: uniformize this at the callsites. - if (column->has_enumeration() && use_enum) { - auto enumvec = column->get_enumeration(); - - ArrowSchema* dict_sch = new ArrowSchema; - ArrowArray* dict_arr = new ArrowArray; - - dict_sch->format = (const char*)malloc( - sizeof(char) * 2); // mandatory, 'u' as 32bit indexing - strcpy((char*)dict_sch->format, "u"); - dict_sch->name = nullptr; // optional in dictionary - dict_sch->metadata = nullptr; // optional - dict_sch->flags = 0; // optional - dict_sch->n_children = 0; // mandatory - dict_sch->children = nullptr; // optional - dict_sch->dictionary = nullptr; // optional - dict_sch->release = &release_schema; // mandatory - dict_sch->private_data = nullptr; // optional - - const int n_buf = 3; // always variable here - - const int64_t n_vec = enumvec.size(); - dict_arr->length = n_vec; // mandatory - dict_arr->null_count = 0; // mandatory - dict_arr->offset = 0; // mandatory - dict_arr->n_buffers = n_buf; // mandatory - dict_arr->n_children = 0; // mandatory - dict_arr->buffers = nullptr; // mandatory - dict_arr->children = nullptr; // optional - dict_arr->dictionary = nullptr; // optional - dict_arr->release = &release_array; // release from parent - dict_arr->private_data = nullptr; // optional here - - column->convert_enumeration(); - dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); - dict_arr->buffers[0] = nullptr; // validity: none here - dict_arr->buffers[1] = column->enum_offsets().data(); - dict_arr->buffers[2] = column->enum_string().data(); - - schema->dictionary = dict_sch; - array->dictionary = dict_arr; - } - - return std::pair(std::move(array), std::move(schema)); - } + static std::pair, std::unique_ptr> + to_arrow(std::shared_ptr column, bool use_enum = false); /** * @brief Get Arrow format string from TileDB datatype. @@ -211,59 +46,7 @@ class ArrowAdapter { * @param datatype TileDB datatype. * @return std::string_view Arrow format string. */ - static std::string_view to_arrow_format(tiledb_datatype_t datatype) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return "U"; // large because TileDB uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return "Z"; // large because TileDB uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; - } - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); - } + static std::string_view to_arrow_format(tiledb_datatype_t datatype); }; }; // namespace tiledbsoma diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 18873e9b4d..177e7755b4 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -59,20 +59,12 @@ const std::string src_path = TILEDBSOMA_SOURCE_ROOT; namespace { std::tuple create_array( - const std::string& uri_in, + const std::string& uri, std::shared_ptr ctx, int num_cells_per_fragment = 10, int num_fragments = 1, bool overlap = false, bool allow_duplicates = false) { - std::string uri = fmt::format( - "{}-{}-{}-{}-{}", - uri_in, - num_cells_per_fragment, - num_fragments, - overlap, - allow_duplicates); - auto vfs = VFS(*ctx); if (vfs.is_dir(uri)) { vfs.remove_dir(uri); @@ -190,11 +182,14 @@ TEST_CASE("SOMAArray: nnz") { int num_cells_per_fragment = 128; auto timestamp = 10; - SECTION(fmt::format( - " - fragments={}, overlap={}, allow_duplicates={}", - num_fragments, - overlap, - allow_duplicates)) { + // TODO this use to be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- fragments=" << num_fragments << ", overlap" << overlap + << ", allow_duplicates=" << allow_duplicates; + + SECTION(section.str()) { auto ctx = std::make_shared(); // Create array @@ -260,11 +255,14 @@ TEST_CASE("SOMAArray: nnz with timestamp") { auto allow_duplicates = true; int num_cells_per_fragment = 128; - SECTION(fmt::format( - " - fragments={}, overlap={}, allow_duplicates={}", - num_fragments, - overlap, - allow_duplicates)) { + // TODO this use to be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- fragments=" << num_fragments << ", overlap" << overlap + << ", allow_duplicates=" << allow_duplicates; + + SECTION(section.str()) { auto ctx = std::make_shared(); // Create array @@ -310,12 +308,14 @@ TEST_CASE("SOMAArray: nnz with consolidation") { auto vacuum = GENERATE(false, true); int num_cells_per_fragment = 128; - SECTION(fmt::format( - " - fragments={}, overlap={}, allow_duplicates={}, vacuum={}", - num_fragments, - overlap, - allow_duplicates, - vacuum)) { + // TODO this use to be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- fragments=" << num_fragments << ", overlap" << overlap + << ", allow_duplicates=" << allow_duplicates; + + SECTION(section.str()) { auto ctx = std::make_shared(); // Create array diff --git a/libtiledbsoma/test/unit_soma_group.cc b/libtiledbsoma/test/unit_soma_group.cc index c6052551f1..46bfe1bbb2 100644 --- a/libtiledbsoma/test/unit_soma_group.cc +++ b/libtiledbsoma/test/unit_soma_group.cc @@ -60,7 +60,7 @@ const std::string src_path = TILEDBSOMA_SOURCE_ROOT; namespace { std::tuple create_array( - const std::string& uri_in, + const std::string& uri, Context& ctx, int num_cells_per_fragment = 10, int num_fragments = 1, @@ -68,13 +68,6 @@ std::tuple create_array( bool allow_duplicates = false, uint64_t timestamp = 1, bool reuse_existing = false) { - std::string uri = fmt::format( - "{}-{}-{}-{}-{}", - uri_in, - num_cells_per_fragment, - num_fragments, - overlap, - allow_duplicates); // Create array, if not reusing the existing array if (!reuse_existing) { auto vfs = VFS(ctx);