Skip to content

Commit

Permalink
[c++] Arrow utils with current-domain option [WIP]
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Aug 18, 2024
1 parent d93cdf8 commit 3441d9e
Show file tree
Hide file tree
Showing 13 changed files with 920 additions and 483 deletions.
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/helper-test-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,6 @@ create_arrow_table <- function(nrows = 10L, factors = FALSE) {
soma_joinid = bit64::seq.integer64(from = 0L, to = nrows - 1L),
bar = seq(nrows) + 0.1,
baz = as.character(seq.int(nrows) + 1000L)
# schema = create_arrow_schema()
# schema = create_arrow_schema(false)
)
}
85 changes: 85 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1178,6 +1178,26 @@ uint64_t SOMAArray::nnz_slow() {
}

std::vector<int64_t> SOMAArray::shape() {
    // The core current domain is preferred, with the core domain as the
    // fallback. The fallback is long-term code, not a stopgap:
    //  * The new-shape feature is being developed in careful phases, so both
    //    kinds of array coexist during the transition.
    //  * Arrays created before the feature existed stay on disk after it is
    //    fully released, and those have no current domain.
    auto core_current_domain = tiledb::ArraySchemaExperimental::current_domain(
        *ctx_->tiledb_ctx(), arr_->schema());

    return core_current_domain.is_empty() ? _tiledb_domain() :
                                            _tiledb_current_domain();
}

std::vector<int64_t> SOMAArray::maxshape() {
    // maxshape is always derived from the core domain, whether or not a
    // current domain has been set on the array.
    std::vector<int64_t> core_domain_extents = _tiledb_domain();
    return core_domain_extents;
}

std::vector<int64_t> SOMAArray::_tiledb_domain() {
std::vector<int64_t> result;
auto dimensions = mq_->schema()->domain().dimensions();

Expand Down Expand Up @@ -1253,6 +1273,71 @@ std::vector<int64_t> SOMAArray::shape() {
return result;
}

/**
 * Compute the per-dimension shape from the core current domain.
 *
 * For every dimension the current domain holds an inclusive (lo, hi) pair;
 * the value reported for that dimension is hi + 1.
 *
 * @return One entry per dimension name, in the order given by
 * dimension_names().
 * @throws TileDBSOMAError if the array has no current domain, or if the
 * current domain is not of type TILEDB_NDRECTANGLE.
 */
std::vector<int64_t> SOMAArray::_tiledb_current_domain() {
    auto current_domain = tiledb::ArraySchemaExperimental::current_domain(
        *ctx_->tiledb_ctx(), arr_->schema());

    // Callers are expected to check for an empty current domain first (as
    // shape() does); reaching this point without one is a logic error.
    if (current_domain.is_empty()) {
        throw TileDBSOMAError(
            "Internal error: current domain requested for an array which does "
            "not support it");
    }

    if (current_domain.type() != TILEDB_NDRECTANGLE) {
        throw TileDBSOMAError("current_domain type is not NDRECTANGLE");
    }

    NDRectangle ndrect = current_domain.ndrectangle();

    std::vector<int64_t> result;
    // Bind by const reference so each dimension name is not copied per
    // iteration.
    for (const auto& dimension_name : dimension_names()) {
        // TODO: non-int64 types for SOMADataFrame extra dims.
        // This simply needs to be integrated with switch statements as in
        // the legacy core-domain code path.
        auto range = ndrect.range<int64_t>(dimension_name);
        // The range is doubly inclusive, so the extent is upper bound + 1.
        result.push_back(range[1] + 1);
    }
    return result;
}

/**
 * Set the array's core current domain so that each dimension spans the
 * inclusive range (0, newshape[i] - 1), then apply it to the array at uri_
 * via schema evolution.
 *
 * @param newshape Requested extent for each dimension, in domain order. Must
 * have exactly as many entries as the array has dimensions.
 * @throws TileDBSOMAError if the array is not opened in write mode, or if
 * newshape has the wrong number of entries.
 */
void SOMAArray::resize(const std::vector<int64_t>& newshape) {
    if (mq_->query_type() != TILEDB_WRITE) {
        throw TileDBSOMAError(
            "[SOMAArray::resize] array must be opened in write mode");
    }

    ArraySchema schema = arr_->schema();
    Domain domain = schema.domain();

    // Validate the argument before building any core objects. The sizes are
    // compared at full width: narrowing newshape.size() to unsigned first
    // could make an oversized vector appear to match.
    const unsigned n = domain.ndim();
    if (newshape.size() != n) {
        throw TileDBSOMAError(fmt::format(
            "[SOMAArray::resize]: newshape has dimension count {}; array has "
            "{} ",
            newshape.size(),
            n));
    }

    auto tctx = ctx_->tiledb_ctx();
    ArraySchemaEvolution schema_evolution(*tctx);
    CurrentDomain new_current_domain(*tctx);
    NDRectangle ndrect(*tctx, domain);

    // TODO: non-int64 for DataFrame when it has extra index dims.
    // This will be via a resize-helper.

    // A shape of k on a dimension maps to the doubly inclusive range
    // (0, k - 1).
    for (unsigned i = 0; i < n; i++) {
        ndrect.set_range<int64_t>(
            domain.dimension(i).name(), 0, newshape[i] - 1);
    }

    new_current_domain.set_ndrectangle(ndrect);
    schema_evolution.expand_current_domain(new_current_domain);
    schema_evolution.array_evolve(uri_);
}

uint64_t SOMAArray::ndim() const {
    // The dimension count is read from the domain of the TileDB schema.
    auto core_domain = tiledb_schema()->domain();
    return core_domain.ndim();
}
Expand Down
57 changes: 55 additions & 2 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -571,13 +571,54 @@ class SOMAArray : public SOMAObject {
}

/**
* @brief Get the current capacity of each dimension.
*
* This applies to arrays all of whose dims are of type int64_t: this
* includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed
* SOMADataFrame.
*
* At the TileDB-SOMA level we call this "shape". At the TileDB Core
* storage level this maps to "current domain". If the array has no
* current domain (it is empty), the value is computed from the core
* domain instead, i.e. the same result as maxshape().
*
* Further, we map this single n to the pair (0, n-1) since core permits a
* doubly inclusive pair (lo, hi) on each dimension slot.
*
* @return A vector with length equal to the number of dimensions; each
* value in the vector is the capacity of each dimension.
*/
std::vector<int64_t> shape();

/**
* @brief Get the maximum resizable capacity of each dimension.
*
* This applies to arrays all of whose dims are of type int64_t: this
* includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed
* SOMADataFrame.
*
* At the TileDB-SOMA level we call this "maxshape". At the TileDB Core
* storage level this maps to "domain". Unlike shape(), this is computed
* from the core domain regardless of whether a current domain is set.
*
* Further, we map this single n to the pair (0, n-1) since core permits a
* doubly inclusive pair (lo, hi) on each dimension slot.
*
* @return A vector with length equal to the number of dimensions; each
* value in the vector is the maximum capacity of each dimension.
*/
std::vector<int64_t> maxshape();

/**
* @brief Resize the shape (what core calls "current domain") up to the
* maxshape (what core calls "domain").
*
* This applies to arrays all of whose dims are of type int64_t: this
* includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed
* SOMADataFrame.
*
* @param newshape The requested capacity of each dimension; dimension i is
* given the doubly inclusive core range (0, newshape[i] - 1). Must have one
* entry per array dimension.
*
* @return Nothing. Raises an exception if the resize would be a downsize,
* which is not supported.
* NOTE(review): the implementation contains no explicit downsize check;
* presumably core's expand_current_domain rejects it -- confirm.
*
* @throws TileDBSOMAError if the array is not opened in write mode, or if
* newshape does not have one entry per array dimension.
*/
void resize(const std::vector<int64_t>& newshape);

/**
* @brief Get the number of dimensions.
*
Expand Down Expand Up @@ -762,6 +803,18 @@ class SOMAArray : public SOMAObject {

uint64_t _get_max_capacity(tiledb_datatype_t index_type);

/**
* @brief Per-dimension capacities computed from the core domain.
*
* With old shape: core domain mapped to tiledbsoma shape; core current
* domain did not exist.
*
* With new shape: core domain maps to tiledbsoma maxshape;
* core current_domain maps to tiledbsoma shape.
*
* Here we distinguish between user-side API, and core-side implementation:
* shape() and maxshape() are the user-side names, while this helper and
* _tiledb_current_domain() are named after the core concepts they read.
*/
std::vector<int64_t> _tiledb_domain();

/**
* @brief Per-dimension capacities computed from the core current domain:
* for each dimension, the upper bound of its inclusive range plus one.
*
* @throws TileDBSOMAError if the array has no current domain, or if the
* current domain is not an NDRectangle.
*/
std::vector<int64_t> _tiledb_current_domain();

bool _extend_enumeration(
ArrowSchema* value_schema,
ArrowArray* value_array,
Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/src/soma/soma_collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,4 +281,4 @@ class SOMACollection : public SOMAGroup {
};
} // namespace tiledbsoma

#endif // SOMA_COLLECTION
#endif // SOMA_COLLECTION
Loading

0 comments on commit 3441d9e

Please sign in to comment.