From 39e9ab346703dc5927bcac6640ce3966c54720cb Mon Sep 17 00:00:00 2001 From: nguyenv Date: Mon, 7 Oct 2024 11:17:35 -0500 Subject: [PATCH] [c++,python] Fix default reads on unwritten dense arrays (#3136) * Found issue while working on spatial's `MultiscaleImage` and erroring out when applying `read_spatial_region` to a level that had not been written to yet * Add new `non_empty_domain_slot_opt` alongside `non_empty_domain_slot` that returns `std::nullopt` if it is empty rather than (0, 0) * Fix logic in C++ `ManagedQuery::setup_read` to check if the array has been written to yet. If it hasn't, set the subarray to use soma_dim_0's full domain. Otherwise, use the non-empty domain as before * Also fix logic in Python `DenseNDArray.read` to use the new `non_empty_domain_slot_opt` and correctly set the `data_shape` to the full domain if the array has not been written to yet * Add unit test in pytest * Update all tests to reflect fix --- apis/python/src/tiledbsoma/_dense_nd_array.py | 16 ++++-- apis/python/src/tiledbsoma/soma_array.cc | 56 +++++++++++++++++++ apis/python/tests/test_basic_anndata_io.py | 8 +-- apis/python/tests/test_dense_nd_array.py | 15 +++++ libtiledbsoma/src/soma/managed_query.cc | 42 +++++++++----- libtiledbsoma/src/soma/soma_array.h | 38 ++++++++++++- 6 files changed, 150 insertions(+), 25 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 7d689646fa..ba646efa0f 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -189,10 +189,18 @@ def read( # all, in which case the best we can do is use the schema shape. handle: clib.SOMADenseNDArray = self._handle._handle - data_shape = handle.shape - ned = self.non_empty_domain() - if ned is not None: - data_shape = tuple(slot[1] + 1 for slot in ned) + ned = [] + for dim_name in handle.dimension_names: + dtype = np.dtype(self.schema.field(dim_name).type.to_pandas_dtype()) + slot = handle.non_empty_domain_slot_opt(dim_name, dtype) + if slot is None: + use_shape = True + break + ned.append(slot[1] + 1) + else: + use_shape = False + + data_shape = tuple(handle.shape if use_shape else ned) target_shape = dense_indices_to_shape(coords, data_shape, result_order) context = handle.context() diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 40c53ae34e..07b12ea39c 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -780,6 +780,62 @@ void load_soma_array(py::module& m) { } }) + .def( + "non_empty_domain_slot_opt", + [](SOMAArray& array, std::string name, py::dtype dtype) { + switch (np_to_tdb_dtype(dtype)) { + case TILEDB_UINT64: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_UINT32: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_INT32: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_UINT16: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_INT16: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_UINT8: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_INT8: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_FLOAT64: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_FLOAT32: + return py::cast( + array.non_empty_domain_slot_opt(name)); + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: + return py::cast(array.non_empty_domain_slot_var(name)); + default: + throw TileDBSOMAError( + "Unsupported dtype for nonempty domain."); + } + }) + .def( "soma_domain_slot", [](SOMAArray& array, std::string name, py::dtype dtype) { diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index b110bdc0e0..67cfeca562 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -176,12 +176,8 @@ def test_import_anndata(conftest_pbmc_small, ingest_modes, X_kind): assert X.used_shape() == tuple([(0, e - 1) for e in orig.shape]) else: assert X.metadata.get(metakey) == "SOMADenseNDArray" - if have_ingested: - matrix = X.read(coords=all2d) - assert matrix.size == orig.X.size - else: - with pytest.raises(ValueError): - X.read(coords=all2d) + matrix = X.read(coords=all2d) + assert matrix.size == orig.X.size # Check raw/X/data (sparse) assert exp.ms["raw"].X["data"].metadata.get(metakey) == "SOMASparseNDArray" diff --git a/apis/python/tests/test_dense_nd_array.py b/apis/python/tests/test_dense_nd_array.py index 3e4ffb5161..325463c609 100644 --- a/apis/python/tests/test_dense_nd_array.py +++ b/apis/python/tests/test_dense_nd_array.py @@ -491,3 +491,18 @@ def test_fixed_timestamp(tmp_path: pathlib.Path): with pytest.raises(soma.SOMAError): soma.open(tmp_path.as_posix(), context=fixed_time, tiledb_timestamp=111) + + +@pytest.mark.parametrize("shape", [(10,), (10, 20), (10, 20, 2), (2, 4, 6, 8)]) +def test_read_to_unwritten_array(tmp_path, shape): + uri = tmp_path.as_posix() + + soma.DenseNDArray.create(uri, type=pa.uint8(), shape=shape) + + with tiledb.open(uri, "r") as A: + expected = A[:]["soma_data"] + + with soma.DenseNDArray.open(uri, "r") as A: + actual = A.read().to_numpy() + + assert np.array_equal(expected, actual) diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 304cd63f2a..f3b6991eb9 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -100,22 +100,36 @@ void ManagedQuery::setup_read() { return; } + auto schema = array_->schema(); + // If the query is uninitialized, set the subarray for the query if (status == Query::Status::UNINITIALIZED) { // Dense array must have a subarray set. If the array is dense and no // ranges have been set, add a range for the array's entire non-empty - // domain on dimension 0. - if (array_->schema().array_type() == TILEDB_DENSE && - !subarray_range_set_) { - auto non_empty_domain = array_->non_empty_domain(0); - subarray_->add_range( - 0, non_empty_domain.first, non_empty_domain.second); + // domain on dimension 0. In the case that the non-empty domain does not + // exist (when the array has not been written to yet), use dimension 0's + // full domain + if (schema.array_type() == TILEDB_DENSE && !subarray_range_set_) { + // Check if the array has been written to by using the C API as + // there is no way to to check for an empty domain using the current + // CPP API + int32_t is_empty; + int64_t ned[2]; + ctx_->handle_error(tiledb_array_get_non_empty_domain_from_index( + ctx_->ptr().get(), array_->ptr().get(), 0, &ned, &is_empty)); + + std::pair array_shape; + if (is_empty == 1) { + array_shape = schema.domain().dimension(0).domain(); + } else { + array_shape = std::make_pair(ned[0], ned[1]); + } + subarray_->add_range(0, array_shape.first, array_shape.second); LOG_DEBUG(fmt::format( - "[ManagedQuery] Add full NED range to dense subarray = (0, {}, " - "{})", - non_empty_domain.first, - non_empty_domain.second)); + "[ManagedQuery] Add full range to dense subarray = (0, {}, {})", + array_shape.first, + array_shape.second)); } // Set the subarray for range slicing @@ -125,14 +139,14 @@ void ManagedQuery::setup_read() { // If no columns were selected, select all columns. // Add dims and attrs in the same order as specified in the schema if (columns_.empty()) { - if (array_->schema().array_type() == TILEDB_SPARSE) { - for (const auto& dim : array_->schema().domain().dimensions()) { + if (schema.array_type() == TILEDB_SPARSE) { + for (const auto& dim : schema.domain().dimensions()) { columns_.push_back(dim.name()); } } - int attribute_num = array_->schema().attribute_num(); + int attribute_num = schema.attribute_num(); for (int i = 0; i < attribute_num; i++) { - columns_.push_back(array_->schema().attribute(i).name()); + columns_.push_back(schema.attribute(i).name()); } } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 67833277cf..d80f64a573 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -722,7 +722,8 @@ class SOMAArray : public SOMAObject { /** * Retrieves the non-empty domain from the array. This is the union of the - * non-empty domains of the array fragments. + * non-empty domains of the array fragments. Returns (0, 0) for empty + * domains. */ template std::pair non_empty_domain_slot(const std::string& name) const { @@ -733,6 +734,41 @@ class SOMAArray : public SOMAObject { } } + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. Return std::nullopt for empty + * domains. + */ + template + std::optional> non_empty_domain_slot_opt( + const std::string& name) const { + try { + int32_t is_empty; + T ned[2]; + + // TODO currently we need to use the TileDB C API in order to check + // if the domain is empty or not. The C++ API returns (0, 0) + // currently which could also represent a single point at coordinate + // 0. Replace this when the C++ API supports correct checking for + // empty domains + ctx_->tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_from_name( + ctx_->tiledb_ctx()->ptr().get(), + arr_->ptr().get(), + name.c_str(), + &ned, + &is_empty)); + + if (is_empty == 1) { + return std::nullopt; + } else { + return std::make_pair(ned[0], ned[1]); + } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + } + /** * Retrieves the non-empty domain from the array on the given dimension. * This is the union of the non-empty domains of the array fragments.