Skip to content

Commit

Permalink
[c++] Implementation and unit testing for domainish accessors (#3012)
Browse files Browse the repository at this point in the history
* [c++] implementation and unit testing for domainish accessors

* Update libtiledbsoma/src/soma/soma_array.h [skip ci]

Co-authored-by: nguyenv <[email protected]>

* code-review feedback [skip ci]

Co-authored-by: nguyenv <[email protected]>

* code-review feedback [skip ci]

Co-authored-by: nguyenv <[email protected]>

* code-review feedback [skip ci]

* code-review feedback

* code-review feedback

Co-authored-by: nguyenv <[email protected]>

---------

Co-authored-by: nguyenv <[email protected]>
  • Loading branch information
johnkerl and nguyenv authored Sep 20, 2024
1 parent 42b5695 commit 97f7084
Show file tree
Hide file tree
Showing 3 changed files with 570 additions and 50 deletions.
99 changes: 97 additions & 2 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/**
* @file soma_array.cc
/** @file soma_array.cc
*
* @section LICENSE
*
Expand Down Expand Up @@ -1141,6 +1140,102 @@ std::optional<TimestampRange> SOMAArray::timestamp() {
return timestamp_;
}

// Builds an Arrow table describing one of the array's "domainish" quantities:
// the core domain, the core current domain, or the core non-empty domain.
//
// Note that ArrowTable is simply our libtiledbsoma pairing of ArrowArray and
// ArrowSchema from nanoarrow. The table has one column per array dimension, in
// schema order, named after the dimension; each column is built from the pair
// returned by the per-dimension slot accessor for the requested kind.
//
// The Domainish enum simply lets us re-use code which is common across
// core domain, core current domain, and core non-empty domain.
//
// @param which_kind Selects which of the three domains is reported.
// @return The Arrow array paired with its Arrow schema.
// @throws TileDBSOMAError if a dimension has a datatype not handled below.
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {
    const int array_ndim = this->ndim();
    const auto dimensions = tiledb_schema()->domain().dimensions();

    // Create the schema for the info we return
    std::vector<std::string> names(array_ndim);
    std::vector<tiledb_datatype_t> tiledb_datatypes(array_ndim);

    for (int i = 0; i < array_ndim; i++) {
        const Dimension& core_dim = dimensions[i];
        names[i] = core_dim.name();
        tiledb_datatypes[i] = core_dim.type();
    }

    auto arrow_schema = ArrowAdapter::make_arrow_schema(
        names, tiledb_datatypes);

    // Create the data for the info we return
    auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim);

    for (int i = 0; i < array_ndim; i++) {
        // Bind by reference: a per-iteration copy of the dimension handle is
        // not needed just to read its name and type.
        const Dimension& core_dim = dimensions[i];
        const auto core_type_code = core_dim.type();
        const std::string dim_name = core_dim.name();

        ArrowArray* child = nullptr;

        // Dispatch on the dimension's core datatype so the slot accessor is
        // instantiated with the matching C++ type.
        switch (core_type_code) {
            case TILEDB_INT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int64_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint64_t>(dim_name, which_kind));
                break;
            case TILEDB_INT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int32_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint32_t>(dim_name, which_kind));
                break;
            case TILEDB_INT16:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int16_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT16:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint16_t>(dim_name, which_kind));
                break;
            case TILEDB_INT8:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int8_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT8:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint8_t>(dim_name, which_kind));
                break;

            case TILEDB_FLOAT64:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<double>(dim_name, which_kind));
                break;
            case TILEDB_FLOAT32:
                child = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<float>(dim_name, which_kind));
                break;

            // String-typed dimensions go through the dedicated string
            // accessor and string child builder.
            case TILEDB_STRING_ASCII:
            case TILEDB_CHAR:
                child = ArrowAdapter::make_arrow_array_child_string(
                    _core_domainish_slot_string(dim_name, which_kind));
                break;

            default:
                throw TileDBSOMAError(fmt::format(
                    "SOMAArray::_get_core_domainish:dim {} has unhandled type "
                    "{}",
                    dim_name,
                    tiledb::impl::type_to_str(core_type_code)));
        }
        arrow_array->children[i] = child;
    }

    return ArrowTable(std::move(arrow_array), std::move(arrow_schema));
}

uint64_t SOMAArray::nnz() {
// Verify array is sparse
if (mq_->schema()->array_type() != TILEDB_SPARSE) {
Expand Down
226 changes: 181 additions & 45 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
namespace tiledbsoma {
using namespace tiledb;

// Selector used by the shared "domainish" helpers. It enables some code
// deduplication between core domain, core current domain, and core non-empty
// domain: one code path serves all three, and this value picks the accessor.
enum class Domainish {
    // Report the core (schema) domain.
    kind_core_domain = 0,
    // Report the core current domain.
    kind_core_current_domain = 1,
    // Report the core non-empty domain.
    kind_non_empty_domain = 2,
};

class SOMAArray : public SOMAObject {
public:
//===================================================================
Expand Down Expand Up @@ -703,7 +711,7 @@ class SOMAArray : public SOMAObject {
* non-empty domains of the array fragments.
*/
template <typename T>
std::pair<T, T> non_empty_domain_slot(const std::string& name) {
std::pair<T, T> non_empty_domain_slot(const std::string& name) const {
try {
return arr_->non_empty_domain<T>(name);
} catch (const std::exception& e) {
Expand All @@ -717,7 +725,7 @@ class SOMAArray : public SOMAObject {
* Applicable only to var-sized dimensions.
*/
std::pair<std::string, std::string> non_empty_domain_slot_var(
const std::string& name) {
const std::string& name) const {
try {
return arr_->non_empty_domain_var(name);
} catch (const std::exception& e) {
Expand Down Expand Up @@ -745,6 +753,70 @@ class SOMAArray : public SOMAObject {
return !_get_current_domain().is_empty();
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
}

/**
 * Returns the core domain at the given dimension.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * String-typed dimensions are not served by this template; use
 * _core_domain_slot_string for those.
 *
 * @tparam T Domain datatype; must not be std::string.
 * @param name Name of the dimension to look up.
 * @return Pair of [lower, upper] inclusive bounds.
 * @throws std::runtime_error if instantiated with T = std::string.
 */
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
    // The condition depends only on T, so it is resolved at compile time.
    // With if-constexpr/else the core accessor is never instantiated for
    // std::string; the runtime behavior for every T is unchanged.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domain_slot: template-specialization "
            "failure.");
    } else {
        return arr_->schema().domain().dimension(name).domain<T>();
    }
}

/**
 * Returns the core domain for a string-typed dimension.
 *
 * Core domain for string dims is always a nullptr pair at the C++ level. We
 * follow the convention started by TileDB-Py, which is to report these as an
 * empty-string pair. The dimension name is accepted for signature parity with
 * the other slot accessors but is not consulted.
 *
 * @return A pair of two empty strings.
 */
std::pair<std::string, std::string> _core_domain_slot_string(
    const std::string&) const {
    return {std::string(), std::string()};
}

/**
* Returns the SOMA domain at the given dimension.
*
Expand Down Expand Up @@ -775,6 +847,105 @@ class SOMAArray : public SOMAObject {
return _core_domain_slot<T>(name);
}

/**
 * Returns the SOMA domain in its entirety, as an Arrow table for return to
 * Python/R.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * @return Arrow table for the core current domain when the array has one,
 * otherwise for the core domain.
 */
ArrowTable get_soma_domain() {
    // No current domain: the SOMA domain falls back to the core domain.
    if (!has_current_domain()) {
        return _get_core_domain();
    }
    return _get_core_current_domain();
}

/**
 * Returns the SOMA maxdomain in its entirety, as an Arrow table for return
 * to Python/R.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * @return Arrow table for the core domain, which is the SOMA maxdomain in
 * both of the cases above.
 */
ArrowTable get_soma_maxdomain() {
    // Same call the _get_core_domain() helper makes.
    return _get_core_domainish(Domainish::kind_core_domain);
}

/**
 * Returns the core non-empty domain in its entirety, as an Arrow
 * table for return to Python/R.
 *
 * @return Arrow table produced by the shared domainish helper for the
 * non-empty-domain kind.
 */
ArrowTable get_non_empty_domain() {
    constexpr Domainish kind = Domainish::kind_non_empty_domain;
    return _get_core_domainish(kind);
}

/**
 * Code-dedupe helper for core domain, core current domain, and core
 * non-empty domain. Declared here; the definition lives in soma_array.cc.
 *
 * @param which_kind Selects which of the three domains is reported.
 * @return The selected domain in its entirety, as an Arrow table.
 */
ArrowTable _get_core_domainish(enum Domainish which_kind);

/**
 * Returns the [lower, upper] pair for one dimension, taken from the core
 * domain, the core current domain, or the core non-empty domain depending on
 * which_kind. This enables some code deduplication between those three.
 *
 * String-typed dimensions are not served by this template; use
 * _core_domainish_slot_string for those.
 *
 * @tparam T Domain datatype; must not be std::string.
 * @param name Name of the dimension to look up.
 * @param which_kind Selects which of the three domains is consulted.
 * @return Pair of bounds as reported by the selected accessor.
 * @throws std::runtime_error if instantiated with T = std::string, or if
 * which_kind is not one of the known enumerators.
 */
template <typename T>
std::pair<T, T> _core_domainish_slot(
    const std::string& name, enum Domainish which_kind) const {
    // The condition depends only on T, so it is resolved at compile time.
    // With if-constexpr/else the numeric accessors are never instantiated
    // for std::string; the runtime behavior for every T is unchanged.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domainish_slot: template-specialization "
            "failure.");
    } else {
        switch (which_kind) {
            case Domainish::kind_core_domain:
                return _core_domain_slot<T>(name);
            case Domainish::kind_core_current_domain:
                return _core_current_domain_slot<T>(name);
            case Domainish::kind_non_empty_domain:
                return non_empty_domain_slot<T>(name);
            default:
                // Reached only if a value outside the enumerators is passed.
                throw std::runtime_error(
                    "internal coding error in SOMAArray::_core_domainish_slot: "
                    "unknown kind");
        }
    }
}

/**
 * String-dimension counterpart of _core_domainish_slot: returns the pair for
 * one dimension from the core domain, the core current domain, or the core
 * non-empty domain depending on which_kind.
 *
 * @param name Name of the dimension to look up.
 * @param which_kind Selects which of the three domains is consulted.
 * @return Pair of strings as reported by the selected accessor.
 * @throws std::runtime_error if which_kind is not one of the known
 * enumerators.
 */
std::pair<std::string, std::string> _core_domainish_slot_string(
    const std::string& name, enum Domainish which_kind) const {
    if (which_kind == Domainish::kind_core_domain) {
        return _core_domain_slot_string(name);
    }
    if (which_kind == Domainish::kind_core_current_domain) {
        return _core_current_domain_slot<std::string>(name);
    }
    if (which_kind == Domainish::kind_non_empty_domain) {
        return non_empty_domain_slot_var(name);
    }

    // Reached only if a value outside the enumerators is passed.
    throw std::runtime_error(
        "internal coding error in "
        "SOMAArray::_core_domainish_slot_string: "
        "unknown kind");
}

/**
* @brief Get the total number of unique cells in the array.
*
Expand Down Expand Up @@ -896,54 +1067,19 @@ class SOMAArray : public SOMAObject {
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
* Returns the core current domain in its entirety, as an Arrow
* table for return to Python/R.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
ArrowTable _get_core_current_domain() {
    // Thin wrapper: ask the shared helper for the core current domain.
    constexpr Domainish kind = Domainish::kind_core_current_domain;
    return _get_core_domainish(kind);
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
* Returns the core domain in its entirety, as an Arrow
* table for return to Python/R.
*/
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
return arr_->schema().domain().dimension(name).domain<T>();
ArrowTable _get_core_domain() {
    // Thin wrapper: ask the shared helper for the core domain.
    constexpr Domainish kind = Domainish::kind_core_domain;
    return _get_core_domainish(kind);
}

/**
Expand Down
Loading

0 comments on commit 97f7084

Please sign in to comment.