Skip to content

Commit

Permalink
[c++] Performant DataFrame.shape (#2916)
Browse files Browse the repository at this point in the history
* [c++] Split out a test fixture for dataframes

* fix a failure case

* [c++] Unit-test variant-indexed dataframes [WIP]

* neaten

* [c++] Performant DataFrame.shape

* Update libtiledbsoma/src/soma/soma_array.h

Co-authored-by: nguyenv <[email protected]>

* Update libtiledbsoma/src/soma/soma_array.h

Co-authored-by: nguyenv <[email protected]>

---------

Co-authored-by: nguyenv <[email protected]>
  • Loading branch information
johnkerl and nguyenv authored Sep 3, 2024
1 parent ded1e42 commit f5ae258
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 13 deletions.
24 changes: 24 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,30 @@ std::vector<int64_t> SOMAArray::maxshape() {
return _tiledb_domain();
}

std::optional<int64_t> SOMAArray::_shape_slot_if_soma_joinid_dim() {
const std::string dim_name = "soma_joinid";

if (!arr_->schema().domain().has_dimension(dim_name)) {
return std::nullopt;
}

auto current_domain = _get_current_domain();
if (current_domain.is_empty()) {
return std::nullopt;
}

auto t = current_domain.type();
if (t != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError("current_domain type is not NDRECTANGLE");
}

NDRectangle ndrect = current_domain.ndrectangle();

auto range = ndrect.range<int64_t>(dim_name);
auto max = range[1] + 1;
return std::optional<int64_t>(max);
}

std::vector<int64_t> SOMAArray::_tiledb_domain() {
std::vector<int64_t> result;
auto dimensions = mq_->schema()->domain().dimensions();
Expand Down
8 changes: 8 additions & 0 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,14 @@ class SOMAArray : public SOMAObject {
return _get_current_domain();
}

protected:
// For use nominally by SOMADataFrame. This could be moved in its entirety
// to SOMADataFrame, but it would entail moving several SOMAArray attributes
// from private to protected, which has knock-on effects on the order of
// constructor initializers, etc.: in total it's simplest to place this
// here and have SOMADataFrame invoke it.
std::optional<int64_t> _shape_slot_if_soma_joinid_dim();

private:
//===================================================================
//= private non-static
Expand Down
6 changes: 6 additions & 0 deletions libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,10 @@ uint64_t SOMADataFrame::count() {
return this->nnz();
}

std::vector<int64_t> SOMADataFrame::shape() {
std::optional<int64_t> attempt = _shape_slot_if_soma_joinid_dim();
int64_t max = attempt.has_value() ? attempt.value() : this->nnz();
return std::vector<int64_t>({max});
}

} // namespace tiledbsoma
17 changes: 17 additions & 0 deletions libtiledbsoma/src/soma/soma_dataframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,24 @@ class SOMADataFrame : public SOMAArray {
* @return int64_t
*/
uint64_t count();

/**
* For DataFrame with default indexing, namely, a single int64_t
* soma_joinid, returns the same as SOMAArray. For DataFrame with
* soma_joinid being a dim along with other dims (optional behavior), return
* the slot along that dim. For DataFrame with soma_joinid being an attr,
* not a dim at all, returns nnz().
*
* Note that the SOMA spec for SOMADataFrame mandates a .domain() accessor,
* which is distinct, and type-polymorphic. This shape accessor exists
* because people can and do call .shape() on SOMA DataFrames, and we have
* to keep letting them do that.
*
* @return int64_t
*/
std::vector<int64_t> shape();
};

} // namespace tiledbsoma

#endif // SOMA_DATAFRAME
70 changes: 62 additions & 8 deletions libtiledbsoma/test/unit_soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,10 @@ struct VariouslyIndexedDataFrameFixture {
}
};

TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") {
TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: basic",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
// TODO this could be formatted with fmt::format which is part of internal
// header spd/log/fmt/fmt.h and should not be used. In C++20, this can be
Expand Down Expand Up @@ -250,7 +253,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") {
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") {
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: platform_config",
"[SOMADataFrame]") {
std::pair<std::string, tiledb_filter_type_t> filter = GENERATE(
std::make_pair(
R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP),
Expand Down Expand Up @@ -354,7 +359,10 @@ TEST_CASE_METHOD(
}
}

TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") {
TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: metadata",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
// TODO this could be formatted with fmt::format which is part of internal
// header spd/log/fmt/fmt.h and should not be used. In C++20, this can be
Expand Down Expand Up @@ -425,7 +433,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") {
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") {
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: bounds-checking",
"[SOMADataFrame]") {
bool use_current_domain = true;
int old_max = 100;
int new_max = 200;
Expand Down Expand Up @@ -464,7 +474,8 @@ TEST_CASE_METHOD(

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32") {
"SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -498,15 +509,26 @@ TEST_CASE_METHOD(
REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max);
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str") {
"SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -545,16 +567,27 @@ TEST_CASE_METHOD(
REQUIRE(u32_range[1] == (uint32_t)dim_infos[0].dim_max);
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32") {
"SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -597,16 +630,27 @@ TEST_CASE_METHOD(
REQUIRE(str_range[1] > "~");
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid") {
"SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -649,9 +693,19 @@ TEST_CASE_METHOD(
REQUIRE(u32_range[1] == (uint32_t)dim_infos[1].dim_max);
}

// Check shape before write
int64_t expect = 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}
9 changes: 4 additions & 5 deletions libtiledbsoma/test/unit_soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,10 @@ TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") {
REQUIRE(soma_sparse->ndim() == 1);
REQUIRE(soma_sparse->nnz() == 0);

if (use_current_domain) {
REQUIRE(soma_sparse->shape() == std::vector<int64_t>{dim_max + 1});
} else {
REQUIRE(
soma_sparse->maxshape() == std::vector<int64_t>{dim_max + 1});
auto expect = std::vector<int64_t>({dim_max + 1});
REQUIRE(soma_sparse->shape() == expect);
if (!use_current_domain) {
REQUIRE(soma_sparse->maxshape() == expect);
}

soma_sparse->close();
Expand Down

0 comments on commit f5ae258

Please sign in to comment.