Skip to content

Commit

Permalink
arrow-helpers for string-valued columns [skip ci]
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 19, 2024
1 parent bf78157 commit e2b6621
Show file tree
Hide file tree
Showing 8 changed files with 947 additions and 40 deletions.
102 changes: 100 additions & 2 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/**
* @file soma_array.cc
/** @file soma_array.cc
*
* @section LICENSE
*
Expand Down Expand Up @@ -1141,6 +1140,105 @@ std::optional<TimestampRange> SOMAArray::timestamp() {
return timestamp_;
}

/**
 * Builds an Arrow table describing one flavor of core domain for every
 * dimension of this array.
 *
 * ArrowTable is the libtiledbsoma pairing of ArrowArray and ArrowSchema from
 * nanoarrow. The schema is created from each dimension's name and TileDB
 * datatype; the array gets one child per dimension, created from the pair
 * returned by _core_domainish_slot<T> (numeric dimensions) or
 * _core_domainish_slot_string (TILEDB_STRING_ASCII / TILEDB_CHAR dimensions).
 *
 * The Domainish selector lets core domain, core current domain, and core
 * non-empty domain share this one implementation.
 *
 * @param which_kind Which of the three domain flavors to report; forwarded
 * unchanged to the per-dimension slot accessors.
 * @return The (ArrowArray, ArrowSchema) pair as an ArrowTable.
 * @throws TileDBSOMAError if no Arrow child could be created for a dimension,
 * e.g. because its datatype is not one of the cases handled below.
 */
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {
    const int array_ndim = this->ndim();
    auto dimensions = tiledb_schema()->domain().dimensions();

    // Create the schema for the info we return
    std::vector<std::string> names(array_ndim);
    std::vector<tiledb_datatype_t> tiledb_datatypes(array_ndim);

    for (int i = 0; i < array_ndim; i++) {
        const Dimension& core_dim = dimensions[i];
        names[i] = core_dim.name();
        tiledb_datatypes[i] = core_dim.type();
    }

    auto arrow_schema = ArrowAdapter::make_arrow_schema(
        names, tiledb_datatypes);

    // Create the data for the info we return
    auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim);

    for (int i = 0; i < array_ndim; i++) {
        // Bind by reference: only the name and type are read here.
        const Dimension& core_dim = dimensions[i];
        const std::string dim_name = core_dim.name();
        const auto core_type_code = core_dim.type();

        ArrowArray* child = nullptr;

        switch (core_type_code) {
            case TILEDB_INT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int64_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint64_t>(dim_name, which_kind));
                break;
            case TILEDB_INT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int32_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint32_t>(dim_name, which_kind));
                break;
            case TILEDB_INT16:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int16_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT16:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint16_t>(dim_name, which_kind));
                break;
            case TILEDB_INT8:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int8_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT8:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint8_t>(dim_name, which_kind));
                break;

            case TILEDB_FLOAT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<double>(dim_name, which_kind));
                break;
            case TILEDB_FLOAT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<float>(dim_name, which_kind));
                break;

            case TILEDB_STRING_ASCII:
            case TILEDB_CHAR:
                child = ArrowAdapter::make_arrow_array_child_string(
                    _core_domainish_slot_string(dim_name, which_kind));
                break;

            default:
                break;
        }

        // Never hand back a table with a null child: fail loudly instead of
        // storing nullptr into the parent's children array.
        if (child == nullptr) {
            throw TileDBSOMAError(
                "SOMAArray::_get_core_domainish: could not build Arrow column "
                "for dimension '" +
                dim_name + "' (TileDB datatype code " +
                std::to_string(static_cast<int>(core_type_code)) + ")");
        }
        arrow_array->children[i] = child;
    }

    return ArrowTable(std::move(arrow_array), std::move(arrow_schema));
}

uint64_t SOMAArray::nnz() {
// Verify array is sparse
if (mq_->schema()->array_type() != TILEDB_SPARSE) {
Expand Down
208 changes: 185 additions & 23 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
namespace tiledbsoma {
using namespace tiledb;

// Selector that lets a single code path serve the three flavors of domain
// lookup: core domain, core current domain, and core non-empty domain.
enum class Domainish {
    kind_core_domain,          // underlying value 0
    kind_core_current_domain,  // underlying value 1
    kind_non_empty_domain      // underlying value 2
};

class SOMAArray : public SOMAObject {
public:
//===================================================================
Expand Down Expand Up @@ -703,7 +711,7 @@ class SOMAArray : public SOMAObject {
* non-empty domains of the array fragments.
*/
template <typename T>
std::pair<T, T> non_empty_domain_slot(const std::string& name) {
std::pair<T, T> non_empty_domain_slot(const std::string& name) const {
try {
return arr_->non_empty_domain<T>(name);
} catch (const std::exception& e) {
Expand All @@ -717,7 +725,7 @@ class SOMAArray : public SOMAObject {
* Applicable only to var-sized dimensions.
*/
std::pair<std::string, std::string> non_empty_domain_slot_var(
const std::string& name) {
const std::string& name) const {
try {
return arr_->non_empty_domain_var(name);
} catch (const std::exception& e) {
Expand Down Expand Up @@ -745,6 +753,70 @@ class SOMAArray : public SOMAObject {
return !_get_current_domain().is_empty();
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
}

/**
 * Returns the core domain at the given dimension.
 *
 * How this relates to the SOMA-level notions:
 *  o Arrays with core current-domain support: soma domain is the core
 *    current domain, and soma maxdomain is the core domain.
 *  o Arrays without it: soma domain and soma maxdomain are both the core
 *    domain, and the core current domain is not accessed at the soma level.
 *
 * String-valued dimensions are not served by this template; use
 * _core_domain_slot_string for those.
 *
 * @tparam T Domain datatype
 * @param name Name of the dimension to look up in the array schema.
 * @return Pair of [lower, upper] inclusive bounds.
 * @throws std::runtime_error if instantiated with T = std::string.
 */
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
    // The type test is a compile-time constant, so resolve it at compile
    // time: the schema lookup is then never instantiated for std::string.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domain_slot: template-specialization "
            "failure.");
    } else {
        return arr_->schema().domain().dimension(name).domain<T>();
    }
}

std::pair<std::string, std::string> _core_domain_slot_string(
    const std::string&) const {
    // At the C++ level the core domain of a string dimension is always a
    // nullptr pair. Following the convention started by TileDB-Py, that is
    // reported as a pair of empty strings; the dimension name is not needed.
    const std::string no_bound = "";
    return {no_bound, no_bound};
}

/**
* Returns the SOMA domain at the given dimension.
*
Expand Down Expand Up @@ -775,6 +847,102 @@ class SOMAArray : public SOMAObject {
return _core_domain_slot<T>(name);
}

/**
 * Returns the SOMA domain in its entirety, as an Arrow table for return to
 * Python/R.
 *
 *  o Arrays with core current-domain support: the SOMA domain is the core
 *    current domain (and the SOMA maxdomain is the core domain).
 *  o Arrays without it: the SOMA domain is the core domain, and the core
 *    current domain is not accessed at the soma level.
 *
 * @return The table from _get_core_current_domain() when
 * has_current_domain() is true, otherwise the table from _get_core_domain().
 */
ArrowTable get_soma_domain() {
    return has_current_domain() ? _get_core_current_domain() :
                                  _get_core_domain();
}

/**
 * Returns the SOMA maxdomain in its entirety, as an Arrow table for return
 * to Python/R.
 *
 * The SOMA maxdomain is the core domain, both for arrays with core
 * current-domain support and for arrays without it.
 *
 * @return The table produced by _get_core_domain().
 */
ArrowTable get_soma_maxdomain() {
    auto core_domain_table = _get_core_domain();
    return core_domain_table;
}

/**
 * Returns the core non-empty domain in its entirety, as an Arrow table for
 * return to Python/R.
 *
 * @return The table produced by _get_core_domainish for
 * Domainish::kind_non_empty_domain.
 */
ArrowTable get_non_empty_domain() {
    constexpr Domainish kind = Domainish::kind_non_empty_domain;
    return _get_core_domainish(kind);
}

/**
 * Per-dimension dispatcher that enables code deduplication between core
 * domain, core current domain, and core non-empty domain.
 *
 * Forwards to _core_domain_slot<T>, _core_current_domain_slot<T>, or
 * non_empty_domain_slot<T> according to which_kind. String-valued
 * dimensions are not served here; use _core_domainish_slot_string.
 *
 * @tparam T Domain datatype
 * @param name Name of the dimension to look up.
 * @param which_kind Which of the three domain flavors to report.
 * @return The pair returned by the selected accessor.
 * @throws std::runtime_error if instantiated with T = std::string, or if
 * which_kind is not one of the Domainish enumerators.
 */
template <typename T>
std::pair<T, T> _core_domainish_slot(
    const std::string& name, enum Domainish which_kind) const {
    // The type test is a compile-time constant, so resolve it at compile
    // time rather than instantiating the numeric accessors for std::string.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domainish_slot: template-specialization "
            "failure.");
    } else {
        switch (which_kind) {
            case Domainish::kind_core_domain:
                return _core_domain_slot<T>(name);
            case Domainish::kind_core_current_domain:
                return _core_current_domain_slot<T>(name);
            case Domainish::kind_non_empty_domain:
                return non_empty_domain_slot<T>(name);
            default:
                throw std::runtime_error(
                    "internal coding error in "
                    "SOMAArray::_core_domainish_slot");
        }
    }
}

/**
 * String-dimension counterpart of _core_domainish_slot: forwards to
 * _core_domain_slot_string, _core_current_domain_slot<std::string>, or
 * non_empty_domain_slot_var according to which_kind.
 *
 * @param name Name of the dimension to look up.
 * @param which_kind Which of the three domain flavors to report.
 * @return The pair of strings returned by the selected accessor.
 * @throws std::runtime_error if which_kind is not one of the Domainish
 * enumerators.
 */
std::pair<std::string, std::string> _core_domainish_slot_string(
    const std::string& name, enum Domainish which_kind) const {
    switch (which_kind) {
        case Domainish::kind_core_domain:
            return _core_domain_slot_string(name);
        case Domainish::kind_core_current_domain:
            return _core_current_domain_slot<std::string>(name);
        case Domainish::kind_non_empty_domain:
            return non_empty_domain_slot_var(name);
        default:
            // Name this function, not its numeric sibling, so the failure
            // can be traced to the right dispatcher.
            throw std::runtime_error(
                "internal coding error in "
                "SOMAArray::_core_domainish_slot_string");
    }
}

/**
* @brief Get the total number of unique cells in the array.
*
Expand Down Expand Up @@ -896,7 +1064,14 @@ class SOMAArray : public SOMAObject {
}

/**
* Builds, as an Arrow table, one flavor of domain for the whole array.
* This enables some code deduplication between core domain, core current
* domain, and core non-empty domain.
*
* @param which_kind Selects which of those three domain flavors is reported.
* @return The resulting ArrowTable.
*/
ArrowTable _get_core_domainish(enum Domainish which_kind);

/**
* Returns the core domain in its entirety, as an Arrow table
* for return to Python/R.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
Expand All @@ -909,26 +1084,14 @@ class SOMAArray : public SOMAObject {
* @return Arrow table for the core domain, as built by _get_core_domainish.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
ArrowTable _get_core_domain() {
    // Delegate to the shared builder, selecting the core-domain flavor.
    constexpr Domainish kind = Domainish::kind_core_domain;
    return _get_core_domainish(kind);
}

/**
* Returns the core current domain at the given dimension.
* Returns the core current domain in its entirety, as an Arrow table
* for return to Python/R.
*
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
Expand All @@ -941,9 +1104,8 @@ class SOMAArray : public SOMAObject {
* @return Arrow table for the core current domain, as built by _get_core_domainish.
*/
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
return arr_->schema().domain().dimension(name).domain<T>();
ArrowTable _get_core_current_domain() {
    // Delegate to the shared builder, selecting the current-domain flavor.
    constexpr Domainish kind = Domainish::kind_core_current_domain;
    return _get_core_domainish(kind);
}

/**
Expand Down
Loading

0 comments on commit e2b6621

Please sign in to comment.