diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index ff55a360f0..092af65253 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1191,6 +1191,30 @@ std::vector SOMAArray::maxshape() { return _tiledb_domain(); } +std::optional SOMAArray::_shape_slot_if_soma_joinid_dim() { + const std::string dim_name = "soma_joinid"; + + if (!arr_->schema().domain().has_dimension(dim_name)) { + return std::nullopt; + } + + auto current_domain = _get_current_domain(); + if (current_domain.is_empty()) { + return std::nullopt; + } + + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); + } + + NDRectangle ndrect = current_domain.ndrectangle(); + + auto range = ndrect.range(dim_name); + auto max = range[1] + 1; + return std::optional(max); +} + std::vector SOMAArray::_tiledb_domain() { std::vector result; auto dimensions = mq_->schema()->domain().dimensions(); diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 790c8594bf..47e272bd6e 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -803,6 +803,14 @@ class SOMAArray : public SOMAObject { return _get_current_domain(); } + protected: + // For use nominally by SOMADataFrame. This could be moved in its entrety to + // SOMADataFrame, but it would entail moving several SOMAArray attributes + // from private to protected, which has knock-on effects on the order of + // constructor initializers, etc.: in total it's simplest to place this + // here, and have SOMADataFrame invoke it. + std::optional _shape_slot_if_soma_joinid_dim(); + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index b25d4aab21..431b843c08 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -94,4 +94,10 @@ uint64_t SOMADataFrame::count() { return this->nnz(); } +std::vector SOMADataFrame::shape() { + std::optional attempt = _shape_slot_if_soma_joinid_dim(); + int64_t max = attempt.has_value() ? attempt.value() : this->nnz(); + return std::vector({max}); +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index bc353649d0..6bf50ac97a 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -163,7 +163,24 @@ class SOMADataFrame : public SOMAArray { * @return int64_t */ uint64_t count(); + + /** + * For DataFrame with default indexing, namely, a single int64_t + * soma_joinid, returns the same as SOMAArray. For DataFrame with + * soma_joinid being a dim along with other dims (optional behavior), return + * the slot along that dim. For DataFrame with soma_joinid being an attr, + * not a dim at all, returns nnz(). + * + * Note that the SOMA spec for SOMADataFrame mandates a .domain() accessor, + * which is distinct, and type-polymorphic. This shape accessor exists + * because people can and do call .shape() on SOMA DataFrames, and we have + * to keep letting them do that. + * + * @return int64_t + */ + std::vector shape(); }; + } // namespace tiledbsoma #endif // SOMA_DATAFRAME diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index b4a32474e3..7df120e908 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -184,7 +184,10 @@ struct VariouslyIndexedDataFrameFixture { } }; -TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: basic", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -250,7 +253,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { } TEST_CASE_METHOD( - VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") { + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: platform_config", + "[SOMADataFrame]") { std::pair filter = GENERATE( std::make_pair( R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP), @@ -354,7 +359,10 @@ TEST_CASE_METHOD( } } -TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: metadata", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -425,7 +433,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { } TEST_CASE_METHOD( - VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") { + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: bounds-checking", + "[SOMADataFrame]") { bool use_current_domain = true; int old_max = 100; int new_max = 200; @@ -464,7 +474,8 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32") { + "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32", + "[SOMADataFrame]") { // LOG_SET_LEVEL("debug"); auto use_current_domain = GENERATE(false, true); std::ostringstream section; @@ -499,15 +510,35 @@ TEST_CASE_METHOD( REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max); } + // Check shape before write + int64_t expect = 0; + if (!use_current_domain) { + expect = 0; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + if (!use_current_domain) { + expect = 2; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str") { + "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str", + "[SOMADataFrame]") { // LOG_SET_LEVEL("debug"); auto use_current_domain = GENERATE(false, true); std::ostringstream section; @@ -547,16 +578,36 @@ TEST_CASE_METHOD( REQUIRE(u32_range[1] == (uint32_t)dim_infos[0].dim_max); } + // Check shape before write + int64_t expect = 0; + if (!use_current_domain) { + expect = 0; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + if (!use_current_domain) { + expect = 2; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32") { + "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32", + "[SOMADataFrame]") { // LOG_SET_LEVEL("debug"); auto use_current_domain = GENERATE(false, true); std::ostringstream section; @@ -600,16 +651,36 @@ TEST_CASE_METHOD( REQUIRE(str_range[1] > "~"); } + // Check shape before write + int64_t expect = 0; + if (!use_current_domain) { + expect = 0; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + if (!use_current_domain) { + expect = 2; // nnz + } else { + expect = dim_infos[0].dim_max + 1; + } + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid") { + "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid", + "[SOMADataFrame]") { // LOG_SET_LEVEL("debug"); auto use_current_domain = GENERATE(false, true); std::ostringstream section; @@ -653,9 +724,19 @@ TEST_CASE_METHOD( REQUIRE(u32_range[1] == (uint32_t)dim_infos[1].dim_max); } + // Check shape before write + int64_t expect = 0; // nnz + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = 2; // nnz + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index added6b5b4..09defb9c73 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -85,11 +85,10 @@ TEST_CASE("SOMASparseNDArray: basic") { REQUIRE(soma_sparse->ndim() == 1); REQUIRE(soma_sparse->nnz() == 0); - if (use_current_domain) { - REQUIRE(soma_sparse->shape() == std::vector{dim_max + 1}); - } else { - REQUIRE( - soma_sparse->maxshape() == std::vector{dim_max + 1}); + auto expect = std::vector({dim_max + 1}); + REQUIRE(soma_sparse->shape() == expect); + if (!use_current_domain) { + REQUIRE(soma_sparse->maxshape() == expect); } soma_sparse->close();