Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Implementation and unit testing for domainish accessors #3012

Merged
merged 8 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 97 additions & 2 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/**
* @file soma_array.cc
/** @file soma_array.cc
*
* @section LICENSE
*
Expand Down Expand Up @@ -1141,6 +1140,102 @@ std::optional<TimestampRange> SOMAArray::timestamp() {
return timestamp_;
}

// ArrowTable is libtiledbsoma's pairing of a nanoarrow ArrowArray with its
// ArrowSchema.
//
// Builds a table with one column per array dimension: the schema is made from
// the dimension names and core datatypes, and each child array is made from
// the pair that _core_domainish_slot (or _core_domainish_slot_string, for
// string dimensions) returns for that dimension. The which_kind argument picks
// between core domain, core current domain, and core non-empty domain, so all
// three accessors can share this one code path.
//
// Throws TileDBSOMAError if a dimension has a datatype not handled below.
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {
    const int num_dims = this->ndim();
    auto dims = tiledb_schema()->domain().dimensions();

    // Schema: one field per dimension, carrying its name and core datatype.
    std::vector<std::string> dim_names(num_dims);
    std::vector<tiledb_datatype_t> dim_types(num_dims);
    for (int d = 0; d < num_dims; d++) {
        const Dimension& dim = dims[d];
        dim_names[d] = dim.name();
        dim_types[d] = dim.type();
    }
    auto schema = ArrowAdapter::make_arrow_schema(dim_names, dim_types);

    // Data: one child array per dimension, attached to a common parent.
    auto parent = ArrowAdapter::make_arrow_array_parent(num_dims);

    for (int d = 0; d < num_dims; d++) {
        const Dimension& dim = dims[d];
        const auto type_code = dim.type();
        const std::string dim_name = dim.name();

        ArrowArray* column = nullptr;

        // Dispatch on the core datatype so the slot lookup is instantiated
        // with the matching C++ type.
        switch (type_code) {
            case TILEDB_INT64:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int64_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT64:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint64_t>(dim_name, which_kind));
                break;
            case TILEDB_INT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int32_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint32_t>(dim_name, which_kind));
                break;
            case TILEDB_INT16:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int16_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT16:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint16_t>(dim_name, which_kind));
                break;
            case TILEDB_INT8:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<int8_t>(dim_name, which_kind));
                break;
            case TILEDB_UINT8:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<uint8_t>(dim_name, which_kind));
                break;

            case TILEDB_FLOAT64:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<double>(dim_name, which_kind));
                break;
            case TILEDB_FLOAT32:
                column = ArrowAdapter::make_arrow_array_child(
                    _core_domainish_slot<float>(dim_name, which_kind));
                break;

            // String-typed dimensions go through the string-specific slot
            // lookup and child builder.
            case TILEDB_STRING_ASCII:
            case TILEDB_CHAR:
                column = ArrowAdapter::make_arrow_array_child_string(
                    _core_domainish_slot_string(dim_name, which_kind));
                break;

            default:
                throw TileDBSOMAError(fmt::format(
                    "SOMAArray::_get_core_domainish:dim {} has unhandled "
                    "type {}",
                    dim_name,
                    tiledb::impl::type_to_str(type_code)));
        }
        parent->children[d] = column;
    }

    return ArrowTable(std::move(parent), std::move(schema));
}

uint64_t SOMAArray::nnz() {
// Verify array is sparse
if (mq_->schema()->array_type() != TILEDB_SPARSE) {
Expand Down
226 changes: 181 additions & 45 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
namespace tiledbsoma {
using namespace tiledb;

// Selector telling the shared "domainish" helpers which flavor of
// per-dimension bounds to read, so that core domain, core current domain, and
// core non-empty domain can all go through one code path.
enum class Domainish {
    kind_core_domain,          // value 0: the core domain
    kind_core_current_domain,  // value 1: the core current domain
    kind_non_empty_domain      // value 2: the core non-empty domain
};

class SOMAArray : public SOMAObject {
public:
//===================================================================
Expand Down Expand Up @@ -703,7 +711,7 @@ class SOMAArray : public SOMAObject {
* non-empty domains of the array fragments.
*/
template <typename T>
std::pair<T, T> non_empty_domain_slot(const std::string& name) {
std::pair<T, T> non_empty_domain_slot(const std::string& name) const {
try {
return arr_->non_empty_domain<T>(name);
} catch (const std::exception& e) {
Expand All @@ -717,7 +725,7 @@ class SOMAArray : public SOMAObject {
* Applicable only to var-sized dimensions.
*/
std::pair<std::string, std::string> non_empty_domain_slot_var(
const std::string& name) {
const std::string& name) const {
try {
return arr_->non_empty_domain_var(name);
} catch (const std::exception& e) {
Expand Down Expand Up @@ -745,6 +753,70 @@ class SOMAArray : public SOMAObject {
return !_get_current_domain().is_empty();
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
}

/**
 * Returns the core domain at the given dimension.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * @tparam T Domain datatype. Must not be std::string; string dimensions are
 * served by _core_domain_slot_string instead.
 * @param name Name of the dimension to look up.
 * @return Pair of [lower, upper] inclusive bounds.
 * @throws std::runtime_error if instantiated and called with T = std::string.
 */
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
    // The type test is a compile-time constant, so it is resolved with
    // `if constexpr`: for T = std::string the typed domain<T>() lookup in the
    // else-branch is discarded rather than instantiated as dead code.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domain_slot: template-specialization "
            "failure.");
    } else {
        return arr_->schema().domain().dimension(name).domain<T>();
    }
}

std::pair<std::string, std::string> _core_domain_slot_string(
    const std::string&) const {
    // At the C++ level the core domain of a string dimension is always a
    // nullptr pair. Following the convention started by TileDB-Py, that is
    // reported as a pair of empty strings; the dimension name is not needed.
    return std::make_pair(std::string(), std::string());
}

/**
* Returns the SOMA domain at the given dimension.
*
Expand Down Expand Up @@ -775,6 +847,105 @@ class SOMAArray : public SOMAObject {
return _core_domain_slot<T>(name);
}

/**
 * Returns the SOMA domain in its entirety, as an Arrow table for return to
 * Python/R.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * @return The Arrow table from _get_core_current_domain() when
 * has_current_domain() is true, otherwise the one from _get_core_domain().
 */
ArrowTable get_soma_domain() {
    // No current domain: the SOMA domain is the core domain.
    if (!has_current_domain()) {
        return _get_core_domain();
    }
    return _get_core_current_domain();
}

/**
 * Returns the SOMA maxdomain in its entirety, as an Arrow table for return
 * to Python/R.
 *
 * o For arrays with core current-domain support:
 *   - soma domain is core current domain
 *   - soma maxdomain is core domain
 * o For arrays without core current-domain support:
 *   - soma domain is core domain
 *   - soma maxdomain is core domain
 *   - core current domain is not accessed at the soma level
 *
 * @return The Arrow table produced by _get_core_domain().
 */
ArrowTable get_soma_maxdomain() {
    // In both cases listed above, the SOMA maxdomain is the core domain.
    return this->_get_core_domain();
}

/**
 * Returns the core non-empty domain in its entirety, as an Arrow table for
 * return to Python/R.
 *
 * @return The Arrow table produced by _get_core_domainish() for
 * Domainish::kind_non_empty_domain.
 */
ArrowTable get_non_empty_domain() {
    const Domainish kind = Domainish::kind_non_empty_domain;
    return _get_core_domainish(kind);
}

/**
 * Shared implementation behind the whole-array accessors for core domain,
 * core current domain, and core non-empty domain, so the three do not
 * duplicate code.
 *
 * @param which_kind Which of the three kinds of domain to report.
 * @return The requested information as an Arrow table.
 */
ArrowTable _get_core_domainish(Domainish which_kind);

/**
 * Returns the bounds of one named dimension for the requested kind of domain.
 * This enables some code deduplication between core domain, core current
 * domain, and core non-empty domain.
 *
 * @tparam T Domain datatype. Must not be std::string; string dimensions are
 * served by _core_domainish_slot_string instead.
 * @param name Name of the dimension to look up.
 * @param which_kind Which of the three kinds of domain to read.
 * @return The pair returned by _core_domain_slot, _core_current_domain_slot,
 * or non_empty_domain_slot, according to which_kind.
 * @throws std::runtime_error if instantiated and called with T = std::string,
 * or if which_kind is not one of the known enumerators.
 */
template <typename T>
std::pair<T, T> _core_domainish_slot(
    const std::string& name, enum Domainish which_kind) const {
    // The type test is a compile-time constant, so it is resolved with
    // `if constexpr`: for T = std::string the per-kind typed lookups in the
    // else-branch are discarded rather than instantiated as dead code.
    if constexpr (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "SOMAArray::_core_domainish_slot: template-specialization "
            "failure.");
    } else {
        switch (which_kind) {
            case Domainish::kind_core_domain:
                return _core_domain_slot<T>(name);
            case Domainish::kind_core_current_domain:
                return _core_current_domain_slot<T>(name);
            case Domainish::kind_non_empty_domain:
                return non_empty_domain_slot<T>(name);
            default:
                // Reached only if which_kind holds a value outside the enum.
                throw std::runtime_error(
                    "internal coding error in "
                    "SOMAArray::_core_domainish_slot: "
                    "unknown kind");
        }
    }
}

// String-dimension counterpart of _core_domainish_slot: returns the pair for
// one named dimension, taken from the core domain, core current domain, or
// core non-empty domain according to which_kind. Throws std::runtime_error if
// which_kind is not one of the known enumerators.
std::pair<std::string, std::string> _core_domainish_slot_string(
    const std::string& name, enum Domainish which_kind) const {
    if (which_kind == Domainish::kind_core_domain) {
        return _core_domain_slot_string(name);
    }
    if (which_kind == Domainish::kind_core_current_domain) {
        return _core_current_domain_slot<std::string>(name);
    }
    if (which_kind == Domainish::kind_non_empty_domain) {
        return non_empty_domain_slot_var(name);
    }
    // Reached only if which_kind holds a value outside the enum.
    throw std::runtime_error(
        "internal coding error in SOMAArray::_core_domainish_slot_string: "
        "unknown kind");
}

/**
* @brief Get the total number of unique cells in the array.
*
Expand Down Expand Up @@ -896,54 +1067,19 @@ class SOMAArray : public SOMAObject {
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
* Returns the core current domain in its entirety, as an Arrow
* table for return to Python/R.
*/
template <typename T>
std::pair<T, T> _core_current_domain_slot(const std::string& name) const {
CurrentDomain current_domain = _get_current_domain();
if (current_domain.is_empty()) {
throw TileDBSOMAError(
"_core_current_domain_slot: internal coding error");
}
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"_core_current_domain_slot: found non-rectangle type");
}
NDRectangle ndrect = current_domain.ndrectangle();

// Convert from two-element array (core API) to pair (tiledbsoma API)
std::array<T, 2> arr = ndrect.range<T>(name);
return std::pair<T, T>(arr[0], arr[1]);
ArrowTable _get_core_current_domain() {
    // Thin wrapper: the shared helper does the work for this kind.
    const Domainish kind = Domainish::kind_core_current_domain;
    return _get_core_domainish(kind);
}

/**
* Returns the core current domain at the given dimension.
*
* o For arrays with core current-domain support:
* - soma domain is core current domain
* - soma maxdomain is core domain
* o For arrays without core current-domain support:
* - soma domain is core domain
* - soma maxdomain is core domain
* - core current domain is not accessed at the soma level
*
* @tparam T Domain datatype
* @return Pair of [lower, upper] inclusive bounds.
* Returns the core domain in its entirety, as an Arrow
* table for return to Python/R.
*/
template <typename T>
std::pair<T, T> _core_domain_slot(const std::string& name) const {
return arr_->schema().domain().dimension(name).domain<T>();
ArrowTable _get_core_domain() {
    // Thin wrapper: the shared helper does the work for this kind.
    const Domainish kind = Domainish::kind_core_domain;
    return _get_core_domainish(kind);
}

/**
Expand Down
Loading
Loading