Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Performant DataFrame.shape #2916

Merged
merged 8 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,30 @@ std::vector<int64_t> SOMAArray::maxshape() {
return _tiledb_domain();
}

std::optional<int64_t> SOMAArray::_shape_slot_if_soma_joinid_dim() {
const std::string dim_name = "soma_joinid";

if (!arr_->schema().domain().has_dimension(dim_name)) {
return std::nullopt;
}

auto current_domain = _get_current_domain();
if (current_domain.is_empty()) {
return std::nullopt;
}

auto t = current_domain.type();
if (t != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError("current_domain type is not NDRECTANGLE");
}

NDRectangle ndrect = current_domain.ndrectangle();

auto range = ndrect.range<int64_t>(dim_name);
auto max = range[1] + 1;
return std::optional<int64_t>(max);
}

std::vector<int64_t> SOMAArray::_tiledb_domain() {
std::vector<int64_t> result;
auto dimensions = mq_->schema()->domain().dimensions();
Expand Down
8 changes: 8 additions & 0 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,14 @@ class SOMAArray : public SOMAObject {
return _get_current_domain();
}

protected:
// For use nominally by SOMADataFrame. This could be moved in its entirety
// to SOMADataFrame, but it would entail moving several SOMAArray attributes
// from private to protected, which has knock-on effects on the order of
// constructor initializers, etc.: in total it's simplest to place this
// here and have SOMADataFrame invoke it.
std::optional<int64_t> _shape_slot_if_soma_joinid_dim();

private:
//===================================================================
//= private non-static
Expand Down
6 changes: 6 additions & 0 deletions libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,10 @@ uint64_t SOMADataFrame::count() {
return this->nnz();
}

std::vector<int64_t> SOMADataFrame::shape() {
std::optional<int64_t> attempt = _shape_slot_if_soma_joinid_dim();
int64_t max = attempt.has_value() ? attempt.value() : this->nnz();
return std::vector<int64_t>({max});
}

} // namespace tiledbsoma
17 changes: 17 additions & 0 deletions libtiledbsoma/src/soma/soma_dataframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,24 @@ class SOMADataFrame : public SOMAArray {
* @return int64_t
*/
uint64_t count();

/**
* For DataFrame with default indexing, namely, a single int64_t
* soma_joinid, returns the same as SOMAArray. For DataFrame with
* soma_joinid being a dim along with other dims (optional behavior), return
* the slot along that dim. For DataFrame with soma_joinid being an attr,
* not a dim at all, returns nnz().
*
* Note that the SOMA spec for SOMADataFrame mandates a .domain() accessor,
* which is distinct, and type-polymorphic. This shape accessor exists
* because people can and do call .shape() on SOMA DataFrames, and we have
* to keep letting them do that.
*
* @return int64_t
*/
std::vector<int64_t> shape();
};

} // namespace tiledbsoma

#endif // SOMA_DATAFRAME
70 changes: 62 additions & 8 deletions libtiledbsoma/test/unit_soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,10 @@ struct VariouslyIndexedDataFrameFixture {
}
};

TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") {
TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: basic",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
// TODO this could be formatted with fmt::format which is part of internal
// header spd/log/fmt/fmt.h and should not be used. In C++20, this can be
Expand Down Expand Up @@ -250,7 +253,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") {
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") {
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: platform_config",
"[SOMADataFrame]") {
std::pair<std::string, tiledb_filter_type_t> filter = GENERATE(
std::make_pair(
R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP),
Expand Down Expand Up @@ -354,7 +359,10 @@ TEST_CASE_METHOD(
}
}

TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") {
TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: metadata",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
// TODO this could be formatted with fmt::format which is part of internal
// header spd/log/fmt/fmt.h and should not be used. In C++20, this can be
Expand Down Expand Up @@ -425,7 +433,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") {
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") {
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: bounds-checking",
"[SOMADataFrame]") {
bool use_current_domain = true;
int old_max = 100;
int new_max = 200;
Expand Down Expand Up @@ -464,7 +474,8 @@ TEST_CASE_METHOD(

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32") {
"SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -498,15 +509,26 @@ TEST_CASE_METHOD(
REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max);
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str") {
"SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -545,16 +567,27 @@ TEST_CASE_METHOD(
REQUIRE(u32_range[1] == (uint32_t)dim_infos[0].dim_max);
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32") {
"SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -597,16 +630,27 @@ TEST_CASE_METHOD(
REQUIRE(str_range[1] > "~");
}

// Check shape before write
int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}

TEST_CASE_METHOD(
VariouslyIndexedDataFrameFixture,
"SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid") {
"SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
Expand Down Expand Up @@ -649,9 +693,19 @@ TEST_CASE_METHOD(
REQUIRE(u32_range[1] == (uint32_t)dim_infos[1].dim_max);
}

// Check shape before write
int64_t expect = 0;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));

soma_dataframe->close();

// Write
write_generic_data();

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = 2;
REQUIRE(soma_dataframe->shape() == std::vector<int64_t>({expect}));
soma_dataframe->close();
}
}
9 changes: 4 additions & 5 deletions libtiledbsoma/test/unit_soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,10 @@ TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") {
REQUIRE(soma_sparse->ndim() == 1);
REQUIRE(soma_sparse->nnz() == 0);

if (use_current_domain) {
REQUIRE(soma_sparse->shape() == std::vector<int64_t>{dim_max + 1});
} else {
REQUIRE(
soma_sparse->maxshape() == std::vector<int64_t>{dim_max + 1});
auto expect = std::vector<int64_t>({dim_max + 1});
REQUIRE(soma_sparse->shape() == expect);
if (!use_current_domain) {
REQUIRE(soma_sparse->maxshape() == expect);
}

soma_sparse->close();
Expand Down
Loading