From 073f3b4269557cea0fd7309c6c762cf1849312df Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 17 Aug 2024 12:18:19 -0400 Subject: [PATCH] new-shape testing [skip ci] --- .../python/src/tiledbsoma/_common_nd_array.py | 20 ++ apis/python/src/tiledbsoma/_dataframe.py | 76 +++- apis/python/src/tiledbsoma/_dense_nd_array.py | 46 +++ .../python/src/tiledbsoma/_sparse_nd_array.py | 48 ++- apis/python/src/tiledbsoma/_tdb_handles.py | 15 +- apis/python/src/tiledbsoma/io/ingest.py | 6 +- apis/python/src/tiledbsoma/soma_array.cc | 27 +- apis/python/src/tiledbsoma/soma_dataframe.cc | 25 ++ .../src/tiledbsoma/soma_dense_ndarray.cc | 21 ++ .../src/tiledbsoma/soma_sparse_ndarray.cc | 24 +- apis/python/tests/test_basic_anndata_io.py | 13 + .../tests/test_dataframe_index_columns.py | 4 +- .../tests/test_registration_mappings.py | 11 + apis/python/tests/test_shape.py | 328 ++++++++++++++++++ apis/python/tests/test_sparse_nd_array.py | 5 +- apis/r/tests/testthat/helper-test-data.R | 2 +- libtiledbsoma/src/soma/managed_query.h | 56 ++- libtiledbsoma/src/soma/soma_array.cc | 273 +++++++++++++-- libtiledbsoma/src/soma/soma_array.h | 27 ++ libtiledbsoma/src/soma/soma_collection.h | 2 +- libtiledbsoma/src/utils/arrow_adapter.cc | 188 ++++++++++ libtiledbsoma/src/utils/arrow_adapter.h | 6 + libtiledbsoma/src/utils/common.h | 17 + libtiledbsoma/test/common.cc | 25 +- libtiledbsoma/test/common.h | 6 +- libtiledbsoma/test/unit_soma_collection.cc | 14 +- libtiledbsoma/test/unit_soma_dataframe.cc | 6 +- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 6 +- .../test/unit_soma_sparse_ndarray.cc | 8 +- 29 files changed, 1201 insertions(+), 104 deletions(-) create mode 100644 apis/python/tests/test_shape.py diff --git a/apis/python/src/tiledbsoma/_common_nd_array.py b/apis/python/src/tiledbsoma/_common_nd_array.py index 5525220012..ee49eac275 100644 --- a/apis/python/src/tiledbsoma/_common_nd_array.py +++ b/apis/python/src/tiledbsoma/_common_nd_array.py @@ -95,6 +95,26 @@ def shape(self) -> Tuple[int, ...]: """ return cast(Tuple[int, ...], tuple(self._handle.shape)) + @property + def maxshape(self) -> Tuple[int, ...]: + """XXX write me please thank you + Lifecycle: + Experimental. + """ + # For core 2.26 we'll implement this for sparse and dense. + # For core 2.25 we'll implement this only for dense. + # We'll leave this common accessor here, but raise + # NotImplementedError in DenseNDArray until 2.26. + return cast(Tuple[int, ...], tuple(self._handle.maxshape)) + + def resize(self, newshape: Sequence[Union[int, None]]) -> None: + """Comment me please thx""" + # For core 2.26 we'll implement this for sparse and dense. + # For core 2.25 we'll implement this only for dense. + # We'll leave this common accessor here, but raise + # NotImplementedError in DenseNDArray until 2.26. + self._handle.resize(newshape) + @classmethod def _dim_capacity_and_extent( cls, diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 84e0aa34e0..21cc6d156c 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -215,10 +215,22 @@ def create( """ context = _validate_soma_tiledb_context(context) schema = _canonicalize_schema(schema, index_column_names) - if domain is None: - domain = tuple(None for _ in index_column_names) + + # XXX comment re mapping: + # * core current_domain <-> SOMA domain + # * core domain <-> SOMA max_domain + # + # As far as the user is concerned, the SOMA domain (core current_domain) + # is the _only_ thing they see and care about. It's resizeable (up to + # max_domain anyway), reads and writes are bounds-checked against it, + # etc. + + soma_domain = domain + + if soma_domain is None: + soma_domain = tuple(None for _ in index_column_names) else: - ndom = len(domain) + ndom = len(soma_domain) nidx = len(index_column_names) if ndom != nidx: raise ValueError( @@ -228,30 +240,52 @@ def create( index_column_schema = [] index_column_data = {} - for index_column_name, slot_domain in zip(index_column_names, domain): + for index_column_name, slot_current_domain in zip( + index_column_names, soma_domain + ): pa_field = schema.field(index_column_name) dtype = _arrow_types.tiledb_type_from_arrow_type( pa_field.type, is_indexed_column=True ) - slot_domain = _fill_out_slot_domain( - slot_domain, index_column_name, pa_field.type, dtype + slot_current_domain = _fill_out_slot_domain( + slot_current_domain, index_column_name, pa_field.type, dtype + ) + slot_max_domain = _fill_out_slot_domain( + None, index_column_name, pa_field.type, dtype ) extent = _find_extent_for_domain( index_column_name, TileDBCreateOptions.from_platform_config(platform_config), dtype, - slot_domain, + slot_max_domain, ) + # XXX COMMENT + # XXX emphasize: + # [0] core max domain lo + # [1] core max domain hi + # [2] core extent parameter + # [3] core current domain lo + # [4] core current domain hi + index_column_schema.append(pa_field) - index_column_data[pa_field.name] = [*slot_domain, extent] + index_column_data[pa_field.name] = [ + *slot_max_domain, + extent, + *slot_current_domain, + ] index_column_info = pa.RecordBatch.from_pydict( index_column_data, schema=pa.schema(index_column_schema) ) + # print() + # print("INDEX_COLUMN_INFO") + # print(index_column_info.to_pandas()) + # print() + plt_cfg = _util.build_clib_platform_config(platform_config) timestamp_ms = context._open_timestamp_ms(tiledb_timestamp) try: @@ -259,6 +293,7 @@ def create( uri, schema=schema, index_column_info=index_column_info, + # XXX domain=domain, ctx=context.native_context, platform_config=plt_cfg, timestamp=(0, timestamp_ms), @@ -317,6 +352,31 @@ def count(self) -> int: # if is it in read open mode, then it is a DataFrameWrapper return cast(DataFrameWrapper, self._handle).count + @property + def shape(self) -> Tuple[int, ...]: + """Returns capacity of each dimension, always a list of length ``ndim``. + This will not necessarily match the bounds of occupied cells within the array. + Rather, it is the bounds outside of which no data may be written. + + Lifecycle: + Experimental. + """ + # XXX COMMENT ME + return cast(Tuple[int, ...], (self._handle.shape[0],)) + + @property + def maxshape(self) -> Tuple[int, ...]: + """XXX write me please thank you + Lifecycle: + Experimental. + """ + # XXX COMMENT ME + return cast(Tuple[int, ...], (self._handle.maxshape[0],)) + + def resize(self, newshape: Sequence[Union[int, None]]) -> None: + """Comment me please thx""" + self._handle.resize(newshape) + def __len__(self) -> int: """Returns the number of rows in the dataframe. Same as ``df.count``.""" return self.count diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index cdec0ed135..6743622f7c 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -95,6 +95,17 @@ def create( ) -> Self: context = _validate_soma_tiledb_context(context) + # XXX comment re mapping: + # * core current_domain <-> (0, SOMA shape minus 1) + # * core domain <-> (0, SOMA max_shape minus 1) + # this is also known as capacity + # + # As far as the user is concerned, the SOMA domain (core current_domain) + # is the _only_ thing they see and care about. It's resizeable (up to max_domain + # anyway), reads and writes are bounds-checked against it, etc. + + # XXX note: don't set current_domain for dense arrays until core 2.26 + index_column_schema = [] index_column_data = {} for dim_idx, dim_shape in enumerate(shape): @@ -105,6 +116,16 @@ def create( dim_shape, TileDBCreateOptions.from_platform_config(platform_config), ) + + # XXX COMMENT + # XXX emphasize: + # [0] core max domain lo + # [1] core max domain hi + # [2] core extent parameter + # [3] core current domain lo + # [4] core current domain hi + # XXX note: don't set current_domain for dense arrays until core 2.26 + index_column_schema.append(pa_field) index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent] @@ -314,3 +335,28 @@ def _dim_capacity_and_extent( dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048)) return (dim_capacity, dim_extent) + + @property + def maxshape(self) -> Tuple[int, ...]: + """XXX write me please thank you + Lifecycle: + Experimental. + """ + # For core 2.26 we'll implement this for sparse and dense. + # For core 2.25 we'll implement this only for dense. + # This suppression overrides the parent class. + raise NotImplementedError( + "DenseNDArray maxshape support is scheduled for TileDBSOMA 1.14" + ) + + def resize(self, newshape: Sequence[Union[int, None]]) -> None: + """XXX write me please thank you + Lifecycle: + Experimental. + """ + # For core 2.26 we'll implement this for sparse and dense. + # For core 2.25 we'll implement this only for dense. + # This suppression overrides the parent class. + raise NotImplementedError( + "DenseNDArray resize support is scheduled for TileDBSOMA 1.14" + ) diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 3cb2ccf967..69d8b2309d 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -124,21 +124,65 @@ def create( index_column_schema = [] index_column_data = {} + + # XXX comment re mapping: + # * core current_domain <-> (0, SOMA shape minus 1) + # * core domain <-> (0, SOMA max_shape minus 1) + # this is also known as capacity + # + # As far as the user is concerned, the SOMA domain (core current_domain) + # is the _only_ thing they see and care about. It's resizeable (up to max_domain + # anyway), reads and writes are bounds-checked against it, etc. + + # XXX COMMENT for dim_idx, dim_shape in enumerate(shape): dim_name = f"soma_dim_{dim_idx}" + pa_field = pa.field(dim_name, pa.int64()) dim_capacity, dim_extent = cls._dim_capacity_and_extent( dim_name, - dim_shape, + None, # XXX COMMENT TileDBCreateOptions.from_platform_config(platform_config), ) + + if dim_shape == 0: + raise ValueError("Write this message please") + # XXX comment + if dim_shape is None: + dim_shape = dim_capacity + # XXX different comment + # if dim_shape == 0: + # dim_shape = 1 + + # XXX COMMENT + # XXX emphasize: + # [0] core max domain lo + # [1] core max domain hi + # [2] core extent parameter + # [3] core current domain lo + # [4] core current domain hi + index_column_schema.append(pa_field) - index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent] + # XXX COMMENT + index_column_data[pa_field.name] = [ + 0, + dim_capacity - 1, + dim_extent, + 0, + dim_shape - 1, + ] index_column_info = pa.RecordBatch.from_pydict( index_column_data, schema=pa.schema(index_column_schema) ) + # print() + # print("INDEX_COLUMN_SCHEMA") + # print(index_column_info.schema) + # print("INDEX_COLUMN_INFO") + # print(index_column_info.to_pandas()) + # print() + carrow_type = pyarrow_to_carrow_type(type) plt_cfg = _util.build_clib_platform_config(platform_config) timestamp_ms = context._open_timestamp_ms(tiledb_timestamp) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index edc2f4ed76..a7c24c4678 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -19,6 +19,7 @@ Mapping, MutableMapping, Optional, + Sequence, Tuple, Type, TypeVar, @@ -408,6 +409,13 @@ def dim_names(self) -> Tuple[str, ...]: def shape(self) -> Tuple[int, ...]: return tuple(self._handle.shape) + @property + def maxshape(self) -> Tuple[int, ...]: + return tuple(self._handle.maxshape) + + def resize(self, newshape: Sequence[Union[int, None]]) -> None: + self._handle.resize(newshape) + class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): """Wrapper around a Pybind11 SOMADataFrame handle.""" @@ -423,8 +431,11 @@ def write(self, values: pa.RecordBatch) -> None: @property def shape(self) -> Tuple[int, ...]: - # Shape is not implemented for DataFrames - raise NotImplementedError + return tuple(self._handle.shape) + + @property + def maxshape(self) -> Tuple[int, ...]: + return tuple(self._handle.maxshape) class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 600bad859b..c927d7ea15 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1205,6 +1205,8 @@ def _write_dataframe_impl( schema=arrow_table.schema, platform_config=platform_config, context=context, + domain=((0, df.shape[0] - 1),), + # XXX DOMAIN ) except (AlreadyExistsError, NotCreateableError): if ingestion_params.error_if_already_exists: @@ -1302,7 +1304,9 @@ def _create_from_matrix( try: # A SparseNDArray must be appendable in soma.io. - shape = [None for _ in matrix.shape] if cls.is_sparse else matrix.shape + # XXX this can be numpy.int64 -- this is for the type-checker + # shape = matrix.shape + shape = tuple([int(e) for e in matrix.shape]) soma_ndarray = cls.create( uri, type=pa.from_numpy_dtype(matrix.dtype), diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index f89f7414f3..8c60b08533 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -500,14 +500,23 @@ void load_soma_array(py::module& m) { py::gil_scoped_release release; // Try to read more data - auto buffers = array.read_next(); - - // If more data was read, convert it to an arrow table and - // return - if (buffers.has_value()) { - // Acquire python GIL before accessing python objects - py::gil_scoped_acquire acquire; - return to_table(*buffers); + try { + auto buffers = array.read_next(); + + // If more data was read, convert it to an arrow table and + // return + if (buffers.has_value()) { + // Acquire python GIL before accessing python objects + py::gil_scoped_acquire acquire; + return to_table(*buffers); + } + + } catch (const TileDBSOMAIndexError& e) { + // Re-raise as ValueError to preserve index-out-of-bounds + // reporting semantics in the current-domain/new-shape era. + throw py::value_error(e.what()); + } catch (const std::exception& e) { + throw e; } // No data was read, the query is complete, return nullopt @@ -520,8 +529,6 @@ void load_soma_array(py::module& m) { .def("nnz", &SOMAArray::nnz, py::call_guard()) - .def_property_readonly("shape", &SOMAArray::shape) - .def_property_readonly("uri", &SOMAArray::uri) .def_property_readonly("column_names", &SOMAArray::column_names) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 30ba59541e..d91ddb6d4e 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -54,6 +54,7 @@ void load_soma_dataframe(py::module& m) { [](std::string_view uri, py::object py_schema, py::object index_column_info, + // XXX py::object domain, std::shared_ptr context, PlatformConfig platform_config, std::optional> timestamp) { @@ -142,8 +143,32 @@ void load_soma_dataframe(py::module& m) { "timestamp"_a = py::none()) .def_static("exists", &SOMADataFrame::exists) + + // XXX TEMP + .def( + "resize", + [](SOMAArray& array, const std::vector& newshape) { + try { + array.resize1(newshape); + } catch (const TileDBSOMAIndexError& e) { + // Re-raise as ValueError to preserve index-out-of-bounds + // reporting semantics in the current-domain/new-shape era. + throw py::value_error(e.what()); + } catch (const std::exception& e) { + throw e; + } + }, + "newshape"_a) + + // XXX TEMP + .def_property_readonly("shape", &SOMADataFrame::shape1) + + // XXX TEMP + .def_property_readonly("maxshape", &SOMADataFrame::maxshape1) + .def_property_readonly( "index_column_names", &SOMADataFrame::index_column_names) + .def_property_readonly( "count", &SOMADataFrame::count, diff --git a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc index 0a8d8c3555..5bca5693bd 100644 --- a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc @@ -124,6 +124,27 @@ void load_soma_dense_ndarray(py::module& m) { .def_static("exists", &SOMADenseNDArray::exists) + .def( + "resize", + [](SOMAArray& array, const std::vector& newshape) { + try { + array.resize(newshape); + } catch (const TileDBSOMAIndexError& e) { + // Re-raise as ValueError to preserve index-out-of-bounds + // reporting semantics in the current-domain/new-shape era. + throw py::value_error(e.what()); + } catch (const std::exception& e) { + throw e; + } + }, + "newshape"_a) + + // XXX TEMP + .def_property_readonly("shape", &SOMAArray::shape) + + // XXX TEMP + .def_property_readonly("maxshape", &SOMAArray::maxshape) + .def("write", write); } } // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc index 40bd4c8b43..58e8750dcf 100644 --- a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc @@ -110,6 +110,28 @@ void load_soma_sparse_ndarray(py::module& m) { "result_order"_a = ResultOrder::automatic, "timestamp"_a = py::none()) - .def_static("exists", &SOMASparseNDArray::exists); + .def_static("exists", &SOMASparseNDArray::exists) + + // XXX TEMP + .def( + "resize", + [](SOMAArray& array, const std::vector& newshape) { + try { + array.resize(newshape); + } catch (const TileDBSOMAIndexError& e) { + // Re-raise as ValueError to preserve index-out-of-bounds + // reporting semantics in the current-domain/new-shape era. + throw py::value_error(e.what()); + } catch (const std::exception& e) { + throw e; + } + }, + "newshape"_a) + + // XXX TEMP + .def_property_readonly("shape", &SOMAArray::shape) + + // XXX TEMP + .def_property_readonly("maxshape", &SOMAArray::maxshape); } } // namespace libtiledbsomacpp diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 42de74644d..faa5d35c5d 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -1333,6 +1333,19 @@ def test_nan_append(conftest_pbmc_small, dtype, nans, new_obs_ids): var_field_name="var_id", ) + # XXX TO DO: NOW NEEDS A RESIZE AS OF 2.26 + # XXX TEMP -- needs an all-in-one experiment-level mutator ... + with tiledbsoma.Experiment.open(SOMA_URI, "w") as exp: + nobs2 = len(rd.obs_axis.data) + new_obs_shape = (nobs2,) + exp.obs.resize(new_obs_shape) + + new_X_shape = (nobs2, len(adata2.var)) + exp.ms["RNA"].X["data"].resize(new_X_shape) + + new_X_shape = (nobs2, len(adata2.raw.var)) + exp.ms["raw"].X["data"].resize(new_X_shape) + # Append the second anndata object tiledbsoma.io.from_anndata( experiment_uri=SOMA_URI, diff --git a/apis/python/tests/test_dataframe_index_columns.py b/apis/python/tests/test_dataframe_index_columns.py index 7629840603..7cb721c467 100644 --- a/apis/python/tests/test_dataframe_index_columns.py +++ b/apis/python/tests/test_dataframe_index_columns.py @@ -3,7 +3,6 @@ import pytest import tiledbsoma as soma -import tiledb @pytest.fixture @@ -1899,6 +1898,7 @@ def test_types_read_errors( with soma.DataFrame.open(uri, "w") as sdf: sdf.write(arrow_table) - with pytest.raises((RuntimeError, tiledb.cc.TileDBError)): + # XXX TO DO + with pytest.raises((soma.SOMAError)): with soma.DataFrame.open(uri, "r") as sdf: sdf.read(coords=coords).concat() diff --git a/apis/python/tests/test_registration_mappings.py b/apis/python/tests/test_registration_mappings.py index 45065b6a68..0708c3ee9d 100644 --- a/apis/python/tests/test_registration_mappings.py +++ b/apis/python/tests/test_registration_mappings.py @@ -430,6 +430,8 @@ def test_multiples_without_experiment( "ZZZ3": 9, } + # XXX TO DO: RESIZE + # Now do the ingestion per se. Note that once registration is done sequentially, ingest order # mustn't matter, and in fact, can be done in parallel. This is why we test various permutations # of the ordering of the h5ad file names. @@ -818,6 +820,8 @@ def test_append_with_disjoint_measurements( var_field_name=var_field_name, ) + # XXX TO DO: RESIZE + tiledbsoma.io.from_anndata( soma_uri, anndata2, @@ -1166,6 +1170,9 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b): tiledbsoma.io.from_anndata( soma_uri, adata, measurement_name=measurement_name, registration_mapping=rd ) + + # XXX TO DO: RESIZE + tiledbsoma.io.from_anndata( soma_uri, bdata, measurement_name=measurement_name, registration_mapping=rd ) @@ -1181,6 +1188,8 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b): var_field_name=var_field_name, ) + # XXX TO DO: RESIZE + tiledbsoma.io.from_anndata( soma_uri, bdata, measurement_name=measurement_name, registration_mapping=rd ) @@ -1256,6 +1265,8 @@ def test_multimodal_names(tmp_path, conftest_pbmc3k_adata): var_field_name=adata_protein.var.index.name, ) + # XXX TO DO: RESIZE + # Ingest the second anndata object into the protein measurement tiledbsoma.io.from_anndata( experiment_uri=uri, diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py new file mode 100644 index 0000000000..47adc0e83e --- /dev/null +++ b/apis/python/tests/test_shape.py @@ -0,0 +1,328 @@ +from __future__ import annotations + +import pyarrow as pa +import pytest + +import tiledbsoma + +from tests._util import maybe_raises + +# ================================================================ +# SHORT LIST: +# +# k snda creation: with shape and maxshape +# +# k snda accessor: shape +# o try fallback on old data (check in to repo) +# k snda accessor: maxshape +# k snda accessor: used_shape +# o deprecation notice ... +# k snda accessor: non_empty_domain +# +# k snda bounds-checking on reads +# k snda bounds-checking on writes +# +# k snda mutator: resize +# o raise NotImplementedError for old arrays +# +# TODO: non-2D SNDA cases +# +# ---------------------------------------------------------------- +# * sdf creation: with domain and ... ? also shape or maxshape? implicit? +# +# * sdf accessor: shape +# o try fallback on old data (check in to repo) +# k sdf accessor: maxshape +# k sdf accessor: used_shape -- does not exist anyway +# k deprecation notice -- b/c it does not exist anyway +# k sdf accessor: non_empty_domain +# k sdf accessor: domain +# +# k sdf bounds-checking on reads +# k sdf bounds-checking on writes +# +# * sdf mutator: resize +# o raise NotImplementedError for old arrays +# * sdf mutator: tiledbsoma_upgrade_shape +# o no-op for new arrays -- ? +# +# k all: partials w/ extra dims +# +# ---------------------------------------------------------------- +# * both: raise IndexError rather than SOMAError for OOB accesses?? +# +# * both mutator: tiledbsoma_upgrade_shape +# o array.schema.version to see if needed +# o no-op for new arrays -- ? +# o use core storage-version-update logic +# o fail if outside domain +# +# ---------------------------------------------------------------- +# * experiment mutator: tiledbsoma.io.resize +# o do-it-all w/ new nobs/nvar -- ? +# ================================================================ + + +@pytest.mark.parametrize( + "element_dtype", + [ + pa.float64(), + pa.float32(), + pa.int64(), + pa.uint16(), + ], +) +@pytest.mark.parametrize( + "shape_exc", + [ + [(100,), None], + [(100,), None], + [(100, 200), None], + [(100, 200, 300), None], + [(100, 200), None], + [(100, 200), None], + [(100, 200), None], + [(100, 200), None], + [(100, 200), None], + ], +) +def test_sparse_nd_array_basics( + tmp_path, + element_dtype, + shape_exc, +): + uri = tmp_path.as_posix() + arg_shape, arg_create_exc = shape_exc + ndim = len(arg_shape) + + # Create the array + with maybe_raises(arg_create_exc): + snda = tiledbsoma.SparseNDArray.create( + uri, + type=element_dtype, + shape=arg_shape, + ) + if arg_create_exc is not None: + return + + assert tiledbsoma.SparseNDArray.exists(uri) + + # Test the various accessors + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + + # TODO: need a saved-off array in UT-data land + + # We expect XXX to be set to a big signed int32. (There are details on the exact value of + # that number, involving R compatibility, and leaving room for a single tile capacity, etc + # ... we could check for some magic value but it suffices to check that it's over 2 + # billion.) + for e in snda.maxshape: + assert e > 2_000_000_000 + + # TODO: used_shape + # o as-is + # o deprecation notice ... + + # No data have been written for this test case + assert snda.non_empty_domain() == tuple([(0, 0)] * ndim) + + # soma_dim_0: (0,1) + # soma_dim_1: (2,3) + # soma_dim_2: (4,5) + coords = [] + dim_names = [] + for i in range(ndim): + dim_names.append(f"soma_dim_{i}") + coords.append((2 * i, 2 * i + 1)) + coords = tuple(coords) + + # Write some data + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + dikt = {"soma_data": [4, 5]} + for i in range(ndim): + dikt[dim_names[i]] = coords[i] + table = pa.Table.from_pydict(dikt) + snda.write(table) + + # Test the various accessors + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + for e in snda.maxshape: + assert e > 2_000_000_000 + assert snda.non_empty_domain() == coords + + # Test reads out of bounds + with tiledbsoma.SparseNDArray.open(uri) as snda: + with pytest.raises(ValueError): + coords = tuple([arg_shape[i] + 10 for i in range(ndim)]) + snda.read(coords).tables().concat() + + # Test writes out of bounds + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + with pytest.raises(tiledbsoma.SOMAError): + dikt = {"soma_data": [30]} + for i in range(ndim): + dikt[dim_names[i]] = [arg_shape[i] + 20] + table = pa.Table.from_pydict(dikt) + snda.write(table) + + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + + # Test resize down + new_shape = tuple([arg_shape[i] - 50 for i in range(ndim)]) + # TODO: why union with tiledb.cc.TileDBError -- needed in sandbox + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + with pytest.raises(ValueError): + snda.resize(new_shape) + + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + + # Test resize too big + new_shape = tuple([4_000_000_000 for i in range(ndim)]) + with pytest.raises(ValueError): + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + snda.resize(new_shape) + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + + # Test reasonable resize + new_shape = tuple([arg_shape[i] + 50 for i in range(ndim)]) + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + snda.resize(new_shape) + + dikt = {} + for i in range(ndim): + dikt[dim_names[i]] = [arg_shape[i] + 20] + dikt["soma_data"] = pa.array([34.5], type=element_dtype) + table = pa.Table.from_pydict(dikt) + + # Re-test writes out of old bounds, within new bounds + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + # Implicitly checking there's no raise + snda.write(table) + + # Re-test reads out of old bounds, within new bounds + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == new_shape + + coords = tuple([(arg_shape[i] + 20,) for i in range(ndim)]) + # Implicitly checking there's no raise + readback = snda.read(coords).tables().concat() + assert readback == table + + +# Pending 2.26 timeframe for dense support +# TODO: mark these with a linked GitHub tracking issue +def test_dense_nd_array_basics(tmp_path): + uri = tmp_path.as_posix() + shape = (100, 200) + tiledbsoma.DenseNDArray.create(uri, type=pa.float64(), shape=shape) + + with tiledbsoma.DenseNDArray.open(uri) as dnda: + assert dnda.shape == (100, 200) + + with pytest.raises(NotImplementedError): + assert dnda.maxshape == (100, 200) + + with pytest.raises(NotImplementedError): + dnda.resize((300, 400)) + + assert dnda.non_empty_domain() == ((0, 0), (0, 0)) + + assert dnda.shape == (100, 200) + + +@pytest.mark.parametrize( + "domain0", + [ + None, + (0, 1), + (0, 3), + (0, 100), + ], +) +@pytest.mark.parametrize( + "index_column_names", + [ + ["soma_joinid"], + ["soma_joinid", "myint"], + ["soma_joinid", "mystring"], + ], +) +def test_dataframe_basics(tmp_path, domain0, index_column_names): + uri = tmp_path.as_posix() + + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("mystring", pa.string()), + ("myint", pa.int16()), + ("myfloat", pa.float32()), + ] + ) + + data = pa.Table.from_pydict( + { + "soma_joinid": [0, 1, 2, 3], + "mystring": ["a", "b", "a", "b"], + "myint": [20, 30, 40, 50], + "myfloat": [1.0, 2.5, 4.0, 5.5], + } + ) + + if domain0 is None: + shape0 = None + else: + shape0 = domain0[1] + 1 + domain = [None] * len(index_column_names) + domain[0] = domain0 + domain = tuple(domain) + + with tiledbsoma.DataFrame.create( + uri, + schema=schema, + index_column_names=index_column_names, + domain=domain, + ) as sdf: + if shape0 is not None and len(data) > shape0: + with pytest.raises(tiledbsoma.SOMAError): + sdf.write(data) + else: + sdf.write(data) + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert len(sdf.shape) == 1 + if shape0 is not None: + assert sdf.shape[0] == shape0 + assert len(sdf.maxshape) == 1 + assert sdf.maxshape[0] > 2_000_000_000 # XXX COMMENT + assert len(sdf.non_empty_domain()) == len(index_column_names) + + # XXX guard against ... not just the type-checker ... + if domain0 is not None: + new_size = 10000 + with tiledbsoma.DataFrame.open(uri, "r") as sdf: + # Must be open for write. + # XXX TO DO fix this + # with pytest.raises(tiledbsoma.SOMAError): + with pytest.raises(ValueError): + sdf.resize([new_size]) + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + sdf.resize([new_size]) + + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + sdf.write(data) + + with tiledbsoma.DataFrame.open(uri) as sdf: + assert len(sdf.shape) == 1 + assert sdf.shape[0] == new_size + assert len(sdf.maxshape) == 1 + assert sdf.maxshape[0] > 2_000_000_000 # XXX COMMENT + assert len(sdf.non_empty_domain()) == len(index_column_names) + + # XXX MORE + # XXX have here too a saved-off old array + # XXX new OOB test diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index 1ccc597020..989914f7cf 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -589,7 +589,9 @@ def test_csr_csc_2d_read(tmp_path, shape): "dims": { "soma_dim_0": [0, 1, 2, 3], }, - "throws": None, + # XXX + # "throws": None, + "throws": soma.SOMAError, }, { "name": "coords=[4]", @@ -619,6 +621,7 @@ def test_csr_csc_2d_read(tmp_path, shape): "throws": ( RuntimeError, tiledb.cc.TileDBError, + soma.SOMAError, ), }, { diff --git a/apis/r/tests/testthat/helper-test-data.R b/apis/r/tests/testthat/helper-test-data.R index f0596fc08f..6a2324acdc 100644 --- a/apis/r/tests/testthat/helper-test-data.R +++ b/apis/r/tests/testthat/helper-test-data.R @@ -75,6 +75,6 @@ create_arrow_table <- function(nrows = 10L, factors = FALSE) { soma_joinid = bit64::seq.integer64(from = 0L, to = nrows - 1L), bar = seq(nrows) + 0.1, baz = as.character(seq.int(nrows) + 1000L) - # schema = create_arrow_schema() + # schema = create_arrow_schema(false) ) } diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index df27157f0a..1d4dc75298 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -151,11 +151,16 @@ class ManagedQuery { template void select_ranges( const std::string& dim, const std::vector>& ranges) { - subarray_range_set_ = true; - subarray_range_empty_[dim] = true; - for (auto& [start, stop] : ranges) { - subarray_->add_range(dim, start, stop); - subarray_range_empty_[dim] = false; + try { + // This can throw with current-domain support + subarray_range_set_ = true; + subarray_range_empty_[dim] = true; + for (auto& [start, stop] : ranges) { + subarray_->add_range(dim, start, stop); + subarray_range_empty_[dim] = false; + } + } catch (const std::exception& e) { + throw TileDBSOMAIndexError(e.what()); } } @@ -168,11 +173,16 @@ class ManagedQuery { */ template void select_points(const std::string& dim, const std::vector& points) { - subarray_range_set_ = true; - subarray_range_empty_[dim] = true; - for (auto& point : points) { - subarray_->add_range(dim, point, point); - subarray_range_empty_[dim] = false; + try { + // This can throw with current-domain support + subarray_range_set_ = true; + subarray_range_empty_[dim] = true; + for (auto& point : points) { + subarray_->add_range(dim, point, point); + subarray_range_empty_[dim] = false; + } + } catch (const std::exception& e) { + throw TileDBSOMAIndexError(e.what()); } } @@ -185,11 +195,16 @@ class ManagedQuery { */ template void select_points(const std::string& dim, const tcb::span points) { - subarray_range_set_ = true; - subarray_range_empty_[dim] = true; - for (auto& point : points) { - subarray_->add_range(dim, point, point); - subarray_range_empty_[dim] = false; + try { + // This can throw with current-domain support + subarray_range_set_ = true; + subarray_range_empty_[dim] = true; + for (auto& point : points) { + subarray_->add_range(dim, point, point); + subarray_range_empty_[dim] = false; + } + } catch (const std::exception& e) { + throw TileDBSOMAIndexError(e.what()); } } @@ -202,9 +217,14 @@ class ManagedQuery { */ template void select_point(const std::string& dim, const T& point) { - subarray_->add_range(dim, point, point); - subarray_range_set_ = true; - subarray_range_empty_[dim] = false; + try { + // This can throw with current-domain support + subarray_->add_range(dim, point, point); + subarray_range_set_ = true; + subarray_range_empty_[dim] = false; + } catch (const std::exception& e) { + throw TileDBSOMAIndexError(e.what()); + } } /** diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index ccda688494..da22c3750a 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -32,6 +32,7 @@ #include "soma_array.h" #include +#include #include "../utils/logger.h" #include "../utils/util.h" namespace tiledbsoma { @@ -279,30 +280,37 @@ void SOMAArray::reset( } std::optional> SOMAArray::read_next() { - // If the query is complete, return `std::nullopt` - if (mq_->is_complete(true)) { - return std::nullopt; - } - - // Configure query and allocate result buffers - mq_->setup_read(); + try { + // This can throw with current-domain support - // Continue to submit the empty query on first read to return empty results - if (mq_->is_empty_query()) { - if (first_read_next_) { - first_read_next_ = false; - return mq_->results(); - } else { + // If the query is complete, return `std::nullopt` + if (mq_->is_complete(true)) { return std::nullopt; } - } - first_read_next_ = false; + // Configure query and allocate result buffers + mq_->setup_read(); - mq_->submit_read(); + // Continue to submit the empty query on first read to return empty + // results + if (mq_->is_empty_query()) { + if (first_read_next_) { + first_read_next_ = false; + return mq_->results(); + } else { + return std::nullopt; + } + } - // Return the results, possibly incomplete - return mq_->results(); + first_read_next_ = false; + + mq_->submit_read(); + + // Return the results, possibly incomplete + return mq_->results(); + } catch (const std::exception& e) { + throw TileDBSOMAIndexError(e.what()); + } } bool SOMAArray::_extend_enumeration( @@ -471,6 +479,18 @@ uint64_t SOMAArray::_get_max_capacity(tiledb_datatype_t index_type) { } } +ArraySchemaEvolution SOMAArray::_make_se() { + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); + if (timestamp_.has_value()) { + // ArraySchemaEvolution requires us to pair (t2, t2) even if our range + // is (t1, t2). + auto v = timestamp_.value(); + TimestampRange tr(v.second, v.second); + se.set_timestamp_range(tr); + } + return se; +} + void SOMAArray::set_column_data( std::string_view name, uint64_t num_elems, @@ -738,14 +758,7 @@ ArrowTable SOMAArray::_cast_table( // Go through all columns in the ArrowTable and cast the values to what is // in the ArraySchema on disk - ArraySchemaEvolution se(*ctx_->tiledb_ctx()); - if (timestamp_.has_value()) { - // ArraySchemaEvolution requires us to pair (t2, t2) even if our range - // is (t1, t2). - auto v = timestamp_.value(); - TimestampRange tr(v.second, v.second); - se.set_timestamp_range(tr); - } + ArraySchemaEvolution se = _make_se(); bool evolve_schema = false; for (auto i = 0; i < arrow_schema->n_children; ++i) { auto orig_arrow_sch_ = arrow_schema->children[i]; @@ -1177,8 +1190,66 @@ uint64_t SOMAArray::nnz_slow() { return total_cell_num; } +// XXX comment more std::vector SOMAArray::shape() { std::vector result; + + auto current_domain = tiledb::ArraySchemaExperimental::current_domain( + *ctx_->tiledb_ctx(), arr_->schema()); + if (current_domain.is_empty()) { + // XXX comment + return maxshape(); + } + + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); + } + + NDRectangle ndrect = current_domain.ndrectangle(); + + for (auto dimension_name : dimension_names()) { + // TODO: non-int64 types for SOMADataFrame extra dims. + // This simply needs to be integrated with switch statements as in the + // legacy code below. + auto range = ndrect.range(dimension_name); + result.push_back(range[1] + 1); + } + return result; +} + +// XXX comment more +std::vector SOMAArray::shape1() { + std::vector result; + + auto current_domain = tiledb::ArraySchemaExperimental::current_domain( + *ctx_->tiledb_ctx(), arr_->schema()); + if (current_domain.is_empty()) { + // XXX comment + return maxshape1(); + } + + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); + } + + NDRectangle ndrect = current_domain.ndrectangle(); + + // XXX temp + // XXX assert ndim >= 1 + // XXX assert dim[0].name is "soma_joinid" + + auto dimension_name = "soma_joinid"; + auto range = ndrect.range(dimension_name); + result.push_back(range[1] + 1); + + return result; +} + +// XXX comment more +std::vector SOMAArray::maxshape() { + std::vector result; auto dimensions = mq_->schema()->domain().dimensions(); for (const auto& dim : dimensions) { @@ -1253,6 +1324,156 @@ std::vector SOMAArray::shape() { return result; } +// XXX comment more +std::vector SOMAArray::maxshape1() { + std::vector result; + auto dimensions = mq_->schema()->domain().dimensions(); + + // XXX temp + // XXX assert ndim >= 1 + // XXX assert dim[0].name is "soma_joinid" + + const auto& dim = dimensions[0]; + + // XXX extract method for code dedupe + switch (dim.type()) { + case TILEDB_UINT8: + result.push_back( + dim.domain().second - dim.domain().first + 1); + break; + case TILEDB_INT8: + result.push_back( + dim.domain().second - dim.domain().first + 1); + break; + case TILEDB_UINT16: + result.push_back( + dim.domain().second - dim.domain().first + + 1); + break; + case TILEDB_INT16: + result.push_back( + dim.domain().second - dim.domain().first + 1); + break; + case TILEDB_UINT32: + result.push_back( + dim.domain().second - dim.domain().first + + 1); + break; + case TILEDB_INT32: + result.push_back( + dim.domain().second - dim.domain().first + 1); + break; + case TILEDB_UINT64: + result.push_back( + dim.domain().second - dim.domain().first + + 1); + break; + case TILEDB_INT64: + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_TIME_HR: + case TILEDB_TIME_MIN: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_TIME_PS: + case TILEDB_TIME_FS: + case TILEDB_TIME_AS: + result.push_back( + dim.domain().second - dim.domain().first + 1); + break; + default: + throw TileDBSOMAError("Dimension must be integer type."); + } + + return result; +} + +void SOMAArray::resize(const std::vector& newshape) { + if (mq_->query_type() != TILEDB_WRITE) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must be opened in write mode"); + } + + auto tctx = ctx_->tiledb_ctx(); + ArraySchema schema = arr_->schema(); + Domain domain = schema.domain(); + ArraySchemaEvolution schema_evolution(*tctx); + CurrentDomain new_current_domain(*tctx); + + NDRectangle ndrect(*tctx, domain); + + // TODO: non-int64 for DataFrame when it has extra index dims -- ? + + unsigned n = domain.ndim(); + if ((unsigned)newshape.size() != n) { + throw TileDBSOMAError(fmt::format( + "[SOMAArray::resize]: newshape has dimension count {}; array has " + "{} ", + newshape.size(), + n)); + } + + for (unsigned i = 0; i < n; i++) { + ndrect.set_range( + domain.dimension(i).name(), 0, newshape[i] - 1); + } + + new_current_domain.set_ndrectangle(ndrect); + schema_evolution.expand_current_domain(new_current_domain); + schema_evolution.array_evolve(uri_); +} + +void SOMAArray::resize1(const std::vector& newshape) { + if (mq_->query_type() != TILEDB_WRITE) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must be opened in write mode"); + } + + ArraySchema schema = arr_->schema(); + Domain domain = schema.domain(); + unsigned ndim = domain.ndim(); + if (newshape.size() != 1) { + throw TileDBSOMAError(fmt::format( + "[SOMAArray::resize]: newshape has dimension count {}; needed 1", + newshape.size(), + ndim)); + } + + // XXX TEMP + + auto tctx = ctx_->tiledb_ctx(); + CurrentDomain old_current_domain = ArraySchemaExperimental::current_domain( + *tctx, schema); + NDRectangle ndrect = old_current_domain.ndrectangle(); + + CurrentDomain new_current_domain(*tctx); + ArraySchemaEvolution schema_evolution(*tctx); + + for (unsigned i = 0; i < ndim; i++) { + if (domain.dimension(i).name() == "soma_joinid") { + ndrect.set_range( + domain.dimension(i).name(), 0, newshape[0] - 1); + } + } + + new_current_domain.set_ndrectangle(ndrect); + schema_evolution.expand_current_domain(new_current_domain); + schema_evolution.array_evolve(uri_); +} + uint64_t SOMAArray::ndim() const { return tiledb_schema()->domain().ndim(); } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index d3a57987f2..90db6c0cb6 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -577,6 +577,26 @@ class SOMAArray : public SOMAObject { * value in the vector is the capcity of each dimension. */ std::vector shape(); + // XXX TEMP + std::vector shape1(); + + /** + * @brief XXX write me please thx + * + * @return XXX write me please thx + */ + std::vector maxshape(); + // XXX TEMP + std::vector maxshape1(); + + /** + * @brief XXX write me please thx + * + * @return XXX write me please thx + */ + void resize(const std::vector& newshape); + // XXX TEMP + void resize1(const std::vector& newshape); /** * @brief Get the number of dimensions. @@ -762,6 +782,13 @@ class SOMAArray : public SOMAObject { uint64_t _get_max_capacity(tiledb_datatype_t index_type); + /** + * Convenience function for creating an ArraySchemaEvolution object + * referencing this array's context pointer, along with its open-at + * timestamp (if any). + */ + ArraySchemaEvolution _make_se(); + bool _extend_enumeration( ArrowSchema* value_schema, ArrowArray* value_array, diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 1c4274556b..60dc170fd2 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -281,4 +281,4 @@ class SOMACollection : public SOMAGroup { }; } // namespace tiledbsoma -#endif // SOMA_COLLECTION \ No newline at end of file +#endif // SOMA_COLLECTION diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 3e94cf6c4e..f8086852a6 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -447,6 +447,139 @@ Dimension ArrowAdapter::_create_dim( } } +void ArrowAdapter::_set_current_domain_slot( + tiledb_datatype_t type, + const void* buff, + NDRectangle& ndrect, + std::string name) { + switch (type) { + case TILEDB_STRING_ASCII: + // XXX THROW + break; + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: { + uint64_t lo = ((uint64_t*)buff)[3]; + uint64_t hi = ((uint64_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain uint64_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_INT8: { + int8_t lo = ((int8_t*)buff)[3]; + int8_t hi = ((int8_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain int8_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_UINT8: { + uint8_t lo = ((uint8_t*)buff)[3]; + uint8_t hi = ((uint8_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain uint8_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_INT16: { + int16_t lo = ((int16_t*)buff)[3]; + int16_t hi = ((int16_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain int16_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_UINT16: { + uint16_t lo = ((uint16_t*)buff)[3]; + uint16_t hi = ((uint16_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain uint16_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_INT32: { + int32_t lo = ((int32_t*)buff)[3]; + int32_t hi = ((int32_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain int32_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_UINT32: { + uint32_t lo = ((uint32_t*)buff)[3]; + uint32_t hi = ((uint32_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain uint32_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_INT64: { + int64_t lo = ((int64_t*)buff)[3]; + int64_t hi = ((int64_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain int64_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_UINT64: { + uint64_t lo = ((uint64_t*)buff)[3]; + uint64_t hi = ((uint64_t*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain uint64_t {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_FLOAT32: { + float lo = ((float*)buff)[3]; + float hi = ((float*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain float {} to {}", + name, + lo, + hi)); + } break; + case TILEDB_FLOAT64: { + double lo = ((double*)buff)[3]; + double hi = ((double*)buff)[4]; + ndrect.set_range(name, lo, hi); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] {} current_domain double {} to {}", + name, + lo, + hi)); + } break; + default: + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB dimension: {} ", + tiledb::impl::type_to_str(type))); + } +} + tiledb_layout_t ArrowAdapter::_get_order(std::string order) { std::transform( order.begin(), order.end(), order.begin(), [](unsigned char c) { @@ -516,6 +649,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( auto child = arrow_schema->children[sch_idx]; auto type = ArrowAdapter::to_tiledb_format(child->format); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] schema pass for {}", std::string(child->name))); + bool isattr = true; for (int64_t i = 0; i < index_column_schema->n_children; ++i) { @@ -531,6 +667,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( const void* buff = index_column_array->children[i]->buffers[1]; auto dim = ArrowAdapter::_create_dim( type, child->name, buff, ctx); + dim.set_filter_list(filter_list); dims.insert({child->name, dim}); isattr = false; @@ -586,6 +723,57 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( LOG_DEBUG(fmt::format("[ArrowAdapter] set_domain")); schema.set_domain(domain); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] index_column_info length {}", + index_column_array->length)); + + // XXX COMMENT + // Note: this must be done after we've got the domain + bool have_current_domain_info = index_column_array->length == 5; + if (have_current_domain_info) { + CurrentDomain current_domain(*ctx); + NDRectangle ndrect(*ctx, domain); + + for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; + ++sch_idx) { + auto child = arrow_schema->children[sch_idx]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + + for (int64_t i = 0; i < index_column_schema->n_children; ++i) { + auto col_name = index_column_schema->children[i]->name; + if (strcmp(child->name, col_name) != 0) { + continue; + } + + if (ArrowAdapter::_isvar(child->format)) { + // In the core API: + // + // * domain for strings must be set as (nullptr, nullptr) + // * current_domain for strings cannot be set as (nullptr, + // nullptr) + // + // Fortunately, these are ASCII dims and we can range + // these accordingly. + ndrect.set_range(col_name, "\x01", "\x7f"); + } else { + const void* buff = index_column_array->children[i] + ->buffers[1]; + _set_current_domain_slot(type, buff, ndrect, col_name); + } + break; + } + } + + current_domain.set_ndrectangle(ndrect); + + LOG_DEBUG(fmt::format( + "[ArrowAdapter] before setting current_domain from ndrect")); + ArraySchemaExperimental::set_current_domain( + *ctx, schema, current_domain); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] after setting current_domain from ndrect")); + } + LOG_DEBUG(fmt::format("[ArrowAdapter] check")); schema.check(); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 34c130439e..20e09afbc1 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -238,6 +238,12 @@ class ArrowAdapter { const void* buff, std::shared_ptr ctx); + static void _set_current_domain_slot( + tiledb_datatype_t type, + const void* buff, + NDRectangle& ndrect, + std::string name); + template static Dimension _create_dim_aux( std::shared_ptr ctx, std::string name, T* b) { diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index 87b41d5cd0..c9b4fa30bb 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -61,6 +61,23 @@ class TileDBSOMAError : public std::runtime_error { } }; +// The following allow us to demultiplex/remultiplex various +// libtiledbsoma errors involved in current-domain AKA "new shape", at the +// pybind11 boundary. E.g. at the pybind11 we can try/catch +// TileDBSOMAIndexError and re-throw py::value_error, etc. +class TileDBSOMAIndexError : public TileDBSOMAError { + public: + explicit TileDBSOMAIndexError(const char* m) + : TileDBSOMAError(m){}; + explicit TileDBSOMAIndexError(std::string m) + : TileDBSOMAError(m.c_str()){}; + + public: + virtual const char* what() const noexcept override { + return TileDBSOMAError::what(); + } +}; + }; // namespace tiledbsoma #endif // TILEDBSOMA_COMMON_H diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index adbe66f866..f3b7f731e1 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -51,7 +51,7 @@ ArraySchema create_schema(Context& ctx, bool allow_duplicates) { return schema; } -std::pair, ArrowTable> create_arrow_schema() { +std::pair, ArrowTable> create_arrow_schema(bool use_current_domain) { // Create ArrowSchema for SOMAArray auto arrow_schema = std::make_unique(); arrow_schema->format = "+s"; @@ -97,25 +97,27 @@ std::pair, ArrowTable> create_arrow_schema() { col_info_array->n_children = 2; col_info_array->release = &ArrowAdapter::release_array; col_info_array->children = new ArrowArray*[1]; + + int n = use_current_domain ? 5 : 3; auto d0_info = col_info_array->children[0] = new ArrowArray; - d0_info->length = 3; + d0_info->length = n; d0_info->null_count = 0; d0_info->offset = 0; d0_info->n_buffers = 2; d0_info->release = &ArrowAdapter::release_array; d0_info->buffers = new const void*[2]; d0_info->buffers[0] = nullptr; - d0_info->buffers[1] = malloc(sizeof(int64_t) * 3); + d0_info->buffers[1] = malloc(sizeof(int64_t) * n); d0_info->n_children = 0; - int64_t dom[] = {0, 1000, 1}; - std::memcpy((void*)d0_info->buffers[1], &dom, sizeof(int64_t) * 3); + int64_t dom[] = {0, 1000, 1, 0, 2147483646}; + std::memcpy((void*)d0_info->buffers[1], &dom, sizeof(int64_t) * n); return std::pair( std::move(arrow_schema), ArrowTable(std::move(col_info_array), std::move(col_info_schema))); } -ArrowTable create_column_index_info() { +ArrowTable create_column_index_info(bool use_current_domain) { // Create ArrowSchema for IndexColumnInfo auto col_info_schema = std::make_unique(); col_info_schema->format = "+s"; @@ -140,19 +142,20 @@ ArrowTable create_column_index_info() { col_info_array->n_children = 2; col_info_array->release = &ArrowAdapter::release_array; col_info_array->children = new ArrowArray*[1]; + int n = use_current_domain ? 5 : 3; auto d0_info = col_info_array->children[0] = new ArrowArray; - d0_info->length = 3; + d0_info->length = n; d0_info->null_count = 0; d0_info->offset = 0; d0_info->n_buffers = 2; d0_info->release = &ArrowAdapter::release_array; d0_info->buffers = new const void*[2]; d0_info->buffers[0] = nullptr; - d0_info->buffers[1] = malloc(sizeof(int64_t) * 3); + d0_info->buffers[1] = malloc(sizeof(int64_t) * n); d0_info->n_children = 0; - int64_t dom[] = {0, 1000, 1}; - std::memcpy((void*)d0_info->buffers[1], &dom, sizeof(int64_t) * 3); + int64_t dom[] = {0, 1000, 1, 0, 2147483646}; + std::memcpy((void*)d0_info->buffers[1], &dom, sizeof(int64_t) * n); return ArrowTable(std::move(col_info_array), std::move(col_info_schema)); } -} // namespace helper \ No newline at end of file +} // namespace helper diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h index cfe660c90c..8ff740801d 100644 --- a/libtiledbsoma/test/common.h +++ b/libtiledbsoma/test/common.h @@ -61,7 +61,7 @@ static const std::string src_path = TILEDBSOMA_SOURCE_ROOT; namespace helper { ArraySchema create_schema(Context& ctx, bool allow_duplicates = false); -std::pair, ArrowTable> create_arrow_schema(); -ArrowTable create_column_index_info(); +std::pair, ArrowTable> create_arrow_schema(bool use_current_domain); +ArrowTable create_column_index_info(bool use_current_domain); } // namespace helper -#endif \ No newline at end of file +#endif diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index 9c700f4a63..b3bd9561af 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -53,7 +53,7 @@ TEST_CASE("SOMACollection: add SOMASparseNDArray") { std::string sub_uri = "mem://unit-test-add-sparse-ndarray/sub"; SOMACollection::create(base_uri, ctx, ts); - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); auto schema = helper::create_schema(*ctx->tiledb_ctx(), true); std::map expected_map{ @@ -94,7 +94,7 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") { std::string sub_uri = "mem://unit-test-add-dense-ndarray/sub"; SOMACollection::create(base_uri, ctx, ts); - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); auto schema = helper::create_schema(*ctx->tiledb_ctx(), true); std::map expected_map{ @@ -134,7 +134,7 @@ TEST_CASE("SOMACollection: add SOMADataFrame") { std::string sub_uri = "mem://unit-test-add-dataframe/sub"; SOMACollection::create(base_uri, ctx, ts); - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); std::map expected_map{ {"dataframe", SOMAGroupEntry(sub_uri, "SOMAArray")}}; @@ -198,7 +198,7 @@ TEST_CASE("SOMACollection: add SOMAExperiment") { std::string sub_uri = "mem://unit-test-add-experiment/sub"; SOMACollection::create(base_uri, ctx); - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); std::map expected_map{ {"experiment", SOMAGroupEntry(sub_uri, "SOMAGroup")}}; @@ -230,7 +230,7 @@ TEST_CASE("SOMACollection: add SOMAMeasurement") { std::string sub_uri = "mem://unit-test-add-measurement/sub"; SOMACollection::create(base_uri, ctx); - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); std::map expected_map{ {"measurement", SOMAGroupEntry(sub_uri, "SOMAGroup")}}; @@ -313,7 +313,7 @@ TEST_CASE("SOMAExperiment: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-experiment"; - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); SOMAExperiment::create( uri, std::move(schema), @@ -380,7 +380,7 @@ TEST_CASE("SOMAExperiment: metadata") { TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-measurement"; - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); SOMAMeasurement::create( uri, std::move(schema), diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index edfe71ba8e..aeafc11b50 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -38,7 +38,7 @@ TEST_CASE("SOMADataFrame: basic") { REQUIRE(!SOMADataFrame::exists(uri, ctx)); - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); SOMADataFrame::create( uri, std::move(schema), @@ -148,7 +148,7 @@ TEST_CASE("SOMADataFrame: platform_config") { R"(]}})"; } - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); SOMADataFrame::create( uri, std::move(schema), @@ -184,7 +184,7 @@ TEST_CASE("SOMADataFrame: platform_config") { TEST_CASE("SOMADataFrame: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; - auto [schema, index_columns] = helper::create_arrow_schema(); + auto [schema, index_columns] = helper::create_arrow_schema(false); SOMADataFrame::create( uri, std::move(schema), diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index 09db1ec5bc..e3520e609f 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -38,7 +38,7 @@ TEST_CASE("SOMADenseNDArray: basic") { REQUIRE(!SOMADenseNDArray::exists(uri, ctx)); - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMADenseNDArray::create( uri, "l", @@ -91,7 +91,7 @@ TEST_CASE("SOMADenseNDArray: platform_config") { PlatformConfig platform_config; platform_config.dense_nd_array_dim_zstd_level = 6; - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMADenseNDArray::create( uri, "l", @@ -117,7 +117,7 @@ TEST_CASE("SOMADenseNDArray: metadata") { std::string uri = "mem://unit-test-dense-ndarray"; - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMASparseNDArray::create( uri, "l", diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 2fff13c525..65493590a9 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -38,7 +38,7 @@ TEST_CASE("SOMASparseNDArray: basic") { REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMASparseNDArray::create( uri, "l", @@ -95,7 +95,7 @@ TEST_CASE("SOMASparseNDArray: platform_config") { PlatformConfig platform_config; platform_config.sparse_nd_array_dim_zstd_level = 6; - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMASparseNDArray::create( uri, "l", @@ -121,7 +121,7 @@ TEST_CASE("SOMASparseNDArray: metadata") { std::string uri = "mem://unit-test-sparse-ndarray"; - auto index_columns = helper::create_column_index_info(); + auto index_columns = helper::create_column_index_info(false); SOMASparseNDArray::create( uri, "l", @@ -182,4 +182,4 @@ TEST_CASE("SOMASparseNDArray: metadata") { soma_sparse->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(!soma_sparse->has_metadata("md")); REQUIRE(soma_sparse->metadata_num() == 2); -} \ No newline at end of file +}