diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py
index 951234aeb5..979e947cf5 100644
--- a/apis/python/src/tiledbsoma/_dataframe.py
+++ b/apis/python/src/tiledbsoma/_dataframe.py
@@ -138,6 +138,7 @@ def create(
         schema: pa.Schema,
         index_column_names: Sequence[str] = (SOMA_JOINID,),
         domain: Optional[Domain] = None,
+        # TODO: optional maxshape?
         platform_config: Optional[options.PlatformConfig] = None,
         context: Optional[SOMATileDBContext] = None,
         tiledb_timestamp: Optional[OpenTimestamp] = None,
@@ -319,6 +320,46 @@ def count(self) -> int:
         # if is it in read open mode, then it is a DataFrameWrapper
         return cast(DataFrameWrapper, self._handle).count
 
+    @property
+    def shape(self) -> Tuple[int, ...]:
+        """Returns the current capacity of the first dimension of the
+        dataframe, as a one-element tuple.
+
+        This will not necessarily match the bounds of occupied cells within the
+        array. Rather, it is the bounds outside of which no data may be written.
+
+        Lifecycle:
+            Experimental.
+        """
+        # Only element 0 of the handle's shape is exposed, so the result has
+        # length one regardless of how many index columns the dataframe has.
+        return cast(Tuple[int, ...], (self._handle.shape[0],))
+
+    @property
+    def maxshape(self) -> Tuple[int, ...]:
+        """Returns the maximum capacity of the first dimension of the
+        dataframe, as a one-element tuple.
+
+        NOTE(review): presumably this is the upper limit that ``resize`` can
+        grow ``shape`` to -- confirm against the handle implementation.
+
+        Lifecycle:
+            Experimental.
+        """
+        # As with ``shape``, only element 0 of the handle's maxshape is exposed.
+        return cast(Tuple[int, ...], (self._handle.maxshape[0],))
+
+    def resize(self, newshape: Sequence[Union[int, None]]) -> None:
+        """Forwards ``newshape`` to the underlying handle's ``resize``.
+
+        Args:
+            newshape: The requested new shape, passed through unchanged.
+
+        Lifecycle:
+            Experimental.
+        """
+        self._handle.resize(newshape)
+
     def __len__(self) -> int:
         """Returns the number of rows in the dataframe. Same as ``df.count``."""
         return self.count
diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py
index 86cb7174bc..557c8b1957 100644
--- a/apis/python/src/tiledbsoma/_dense_nd_array.py
+++ b/apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -324,7 +324,9 @@ def maxshape(self) -> Tuple[int, ...]:
         # For core 2.26 we'll implement this for sparse and dense.
         # For core 2.25 we'll implement this only for dense.
         # This suppression overrides the parent class.
-        raise NotImplementedError("DenseNDArray maxshape support is scheduled for TileDBSOMA 1.14")
+        raise NotImplementedError(
+            "DenseNDArray maxshape support is scheduled for TileDBSOMA 1.14"
+        )
 
     def resize(self, newshape: Sequence[Union[int, None]]):
         """XXX write me please thank you
@@ -334,4 +336,6 @@ def resize(self, newshape: Sequence[Union[int, None]]):
         # For core 2.26 we'll implement this for sparse and dense.
         # For core 2.25 we'll implement this only for dense.
         # This suppression overrides the parent class.
-        raise NotImplementedError("DenseNDArray resize support is scheduled for TileDBSOMA 1.14")
+        raise NotImplementedError(
+            "DenseNDArray resize support is scheduled for TileDBSOMA 1.14"
+        )
diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py
index 706cf0b48c..c79e6a3454 100644
--- a/apis/python/src/tiledbsoma/_tdb_handles.py
+++ b/apis/python/src/tiledbsoma/_tdb_handles.py
@@ -421,13 +421,13 @@ def write(self, values: pa.RecordBatch) -> None:
 
     @property
     def shape(self) -> Tuple[int, ...]:
-        # Shape is not implemented for DataFrames
-        raise NotImplementedError
+        # Expose the wrapped handle's shape as a tuple.
+        return tuple(self._handle.shape)
 
     @property
     def maxshape(self) -> Tuple[int, ...]:
-        # Shape is not implemented for DataFrames -- XXX ?!?
-        raise NotImplementedError
+        # Expose the wrapped handle's maxshape as a tuple.
+        return tuple(self._handle.maxshape)
 
 
 class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]):
diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py
index dc761103f6..06f5e7194d 100644
--- a/apis/python/tests/test_shape.py
+++ b/apis/python/tests/test_shape.py
@@ -3,62 +3,80 @@
 import pyarrow as pa
 import pytest
 
-import tiledbsoma as soma
+import tiledbsoma
 import tiledb
 
 from tests._util import maybe_raises
 
 # ================================================================
-# TODO:
-
-# SNDA:
-# o resize mutator
-#   - NotImplementedError for old arrays
-
-# SDF:
-# o UT shape <, ==, > maxshape
-# o UT partials w/ extra dims
-# o UT OOB writes
-# o UT OOB reads
-# o UT resize
-#   - especially for extra index-dims
-#   - NotImplementedError for old arrays
-# o check used_shape
-# o check non_empty_domain
-
-# SNDA and SDF both:
-# o deprecation notice ...
-# o try fallback on old data (check in to repo)
-
-# tiledbsoma_upgrade_shape for snda
-# o array.schema.version to see if needed
-# o use core storage-version-update logic ...
-# o fail if outside domain
-
-# tiledbsoma_upgrade_shape for sdf
-# o mostly ditto
-# o arg name is domain not shape
-
-# tiledbsoma.io.resize ...
-# o per array
-# o do-it-all w/ new nobs/nvar -- ?
-# ================================================================
-
+# SHORT LIST:
+#
+# k snda creation: with shape and maxshape
+#
+# k snda accessor: shape
+#   o try fallback on old data (check in to repo)
+# k snda accessor: maxshape
+# k snda accessor: used_shape
+#   o deprecation notice ...
+# k snda accessor: non_empty_domain
+#
+# * snda bounds-checking on reads
+# * snda bounds-checking on writes
+#
+# k snda mutator: resize
+#   o raise NotImplementedError for old arrays
+#
 # TODO: non-2D SNDA cases
+#
+# ----------------------------------------------------------------
+# * sdf creation: with domain and ... ? also shape or maxshape? implicit?
+#
+# * sdf accessor: shape
+#   o try fallback on old data (check in to repo)
+# * sdf accessor: maxshape
+# * sdf accessor: used_shape
+#   o deprecation notice ...
+# * sdf accessor: non_empty_domain
+# * sdf accessor: domain
+#
+# * sdf bounds-checking on reads
+# * sdf bounds-checking on writes
+#
+# * sdf mutator: resize
+#   o raise NotImplementedError for old arrays
+# * sdf mutator: tiledbsoma_upgrade_shape
+#   o no-op for new arrays -- ?
+#
+# * all: partials w/ extra dims
+#
+# ----------------------------------------------------------------
+# * both: raise IndexError rather than SOMAError for OOB accesses??
+# * both mutator: tiledbsoma_upgrade_shape
+#   o array.schema.version to see if needed
+#   o no-op for new arrays -- ?
+#   o use core storage-version-update logic
+#   o fail if outside domain
+#
+# ----------------------------------------------------------------
+# * experiment mutator: tiledbsoma.io.resize
+#   o do-it-all w/ new nobs/nvar -- ?
+# ================================================================
 
 
 @pytest.mark.parametrize(
     "shape_maxshape_exc",
     [
+        [(100,), None, None],
         [(100, 200), None, None],
+        [(100, 200, 300), None, None],
         [(100, 200), (None, None), None],
         [(100, 200), (1000, 2000), None],
-        # TO DO: reads on these throw tiledb.cc.TileDBError not soma.SOMAError -- need to debug:
+        # TO DO: reads on these throw tiledb.cc.TileDBError not tiledbsoma.SOMAError -- need to debug:
         [(100, 200), (100, 200), None],
         [(100, 200), (1000, 200), None],
         [(100, 200), (100, 2000), None],
-        [(100, 200), (10, 200), soma.SOMAError],
-        [(100, 200), (10, 200), soma.SOMAError],
+        [(100, 200), (10, 200), tiledbsoma.SOMAError],
+        [(100, 200), (10, 200), tiledbsoma.SOMAError],
         [(100, 200), (100,), ValueError],
         [(100, 200), (100, 200, 300), ValueError],
     ],
@@ -67,24 +85,26 @@ def test_sparse_nd_array_basics(
     tmp_path,
     shape_maxshape_exc,
 ):
-    arg_shape, arg_maxshape, arg_create_exc = shape_maxshape_exc
     uri = tmp_path.as_posix()
+    arg_shape, arg_maxshape, arg_create_exc = shape_maxshape_exc
+    ndim = len(arg_shape)
+
     # XXX FIX ME AND THEN PARAMETERIZE ME
     # element_type = pa.float32()
     element_type = pa.float64()
 
     # Create the array
     with maybe_raises(arg_create_exc):
-        snda = soma.SparseNDArray.create(
+        snda = tiledbsoma.SparseNDArray.create(
             uri, type=element_type, shape=arg_shape, maxshape=arg_maxshape
         )
     if arg_create_exc is not None:
         return
 
-    assert soma.SparseNDArray.exists(uri)
+    assert tiledbsoma.SparseNDArray.exists(uri)
 
     # Test the various accessors
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
 
         # TODO: need a saved-off array in UT-data land
@@ -109,21 +129,28 @@ def test_sparse_nd_array_basics(
         #   o deprecation notice ...
 
         # No data have been written for this test case
-        assert snda.non_empty_domain() == ((0, 0), (0, 0))
+        assert snda.non_empty_domain() == tuple([(0, 0)] * ndim)
+
+    # soma_dim_0: (0,1)
+    # soma_dim_1: (2,3)
+    # soma_dim_2: (4,5)
+    coords = []
+    dim_names = []
+    for i in range(ndim):
+        dim_names.append(f"soma_dim_{i}")
+        coords.append((2 * i, 2 * i + 1))
+    coords = tuple(coords)
 
     # Write some data
-    with soma.SparseNDArray.open(uri, "w") as snda:
-        table = pa.Table.from_pydict(
-            {
-                "soma_dim_0": [0, 1],
-                "soma_dim_1": [2, 3],
-                "soma_data": [4, 5],
-            }
-        )
+    with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
+        dikt = {"soma_data": [4, 5]}
+        for i in range(ndim):
+            dikt[dim_names[i]] = coords[i]
+        table = pa.Table.from_pydict(dikt)
         snda.write(table)
 
     # Test the various accessors
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
         if arg_maxshape is None:
             for e in snda.maxshape:
@@ -134,84 +161,71 @@ def test_sparse_nd_array_basics(
                     assert snda.maxshape[i] > 2_000_000_000
                 else:
                     assert snda.maxshape[i] == arg_maxshape[i]
-        assert snda.non_empty_domain() == ((0, 1), (2, 3))
+        assert snda.non_empty_domain() == coords
 
     # Test reads out of bounds
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         # AS NOTED ABOVE
-        # with pytest.raises(soma.SOMAError):
-        with pytest.raises((soma.SOMAError, tiledb.cc.TileDBError)):
-            coords = ((arg_shape[0] + 10,), (arg_shape[1] + 20,))
+        # with pytest.raises(tiledbsoma.SOMAError):
+        with pytest.raises((tiledbsoma.SOMAError, tiledb.cc.TileDBError)):
+            coords = tuple([arg_shape[i] + 10 for i in range(ndim)])
             snda.read(coords).tables().concat()
 
     # Test writes out of bounds
-    with soma.SparseNDArray.open(uri, "w") as snda:
-        with pytest.raises(soma.SOMAError):
-            table = pa.Table.from_pydict(
-                {
-                    "soma_dim_0": [arg_shape[0] + 10],
-                    "soma_dim_1": [arg_shape[1] + 20],
-                    "soma_data": [30],
-                }
-            )
+    with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
+        with pytest.raises(tiledbsoma.SOMAError):
+            dikt = {"soma_data": [30]}
+            for i in range(ndim):
+                dikt[dim_names[i]] = [arg_shape[i] + 20]
+            table = pa.Table.from_pydict(dikt)
             snda.write(table)
 
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
 
     # Test resize down
+    new_shape = tuple([arg_shape[i] - 50 for i in range(ndim)])
     # TODO: why union with tiledb.cc.TileDBError -- needed in sandbox
-    new_shape = (arg_shape[0] - 50, arg_shape[1] + 50)
-    with soma.SparseNDArray.open(uri, "w") as snda:
-        with pytest.raises((soma.SOMAError, tiledb.cc.TileDBError)):
+    with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
+        with pytest.raises((tiledbsoma.SOMAError, tiledb.cc.TileDBError)):
             snda.resize(new_shape)
 
-    with soma.SparseNDArray.open(uri) as snda:
-        assert snda.shape == arg_shape
-
-    # TODO: why union with tiledb.cc.TileDBError -- needed in sandbox
-    new_shape = (arg_shape[0] + 50, arg_shape[1] - 50)
-    with soma.SparseNDArray.open(uri, "w") as snda:
-        with pytest.raises((soma.SOMAError, tiledb.cc.TileDBError)):
-            snda.resize(new_shape)
-
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
 
     # Test resize up
-    with soma.SparseNDArray.open(uri) as snda:
+    with tiledbsoma.SparseNDArray.open(uri) as snda:
         snda_maxshape = snda.maxshape
 
-    new_shape = (arg_shape[0] + 50, arg_shape[1] + 50)
-    if new_shape[0] > snda_maxshape[0] or new_shape[1] > snda_maxshape[1]:
-        with pytest.raises((soma.SOMAError, tiledb.cc.TileDBError)):
-            with soma.SparseNDArray.open(uri, "w") as snda:
+    new_shape = tuple([arg_shape[i] + 50 for i in range(ndim)])
+    overs = [new_shape[i] > snda_maxshape[i] for i in range(ndim)]
+    if any(overs):
+        with pytest.raises((tiledbsoma.SOMAError, tiledb.cc.TileDBError)):
+            with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
                 snda.resize(new_shape)
 
-        with soma.SparseNDArray.open(uri) as snda:
+        with tiledbsoma.SparseNDArray.open(uri) as snda:
             assert snda.shape == arg_shape
     else:
-        with soma.SparseNDArray.open(uri, "w") as snda:
+        with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
             snda.resize(new_shape)
 
-        table = pa.Table.from_pydict(
-            {
-                "soma_dim_0": [arg_shape[0] + 10],
-                "soma_dim_1": [arg_shape[1] + 20],
-                "soma_data": [34.5],
-            }
-        )
+        dikt = {}
+        for i in range(ndim):
+            dikt[dim_names[i]] = [arg_shape[i] + 20]
+        dikt["soma_data"] = [34.5]
+        table = pa.Table.from_pydict(dikt)
 
         # Re-test writes out of old bounds, within new bounds
-        with soma.SparseNDArray.open(uri, "w") as snda:
+        with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
             # Implicitly checking there's no raise
             snda.write(table)
 
         # Re-test reads out of old bounds, within new bounds
-        with soma.SparseNDArray.open(uri) as snda:
+        with tiledbsoma.SparseNDArray.open(uri) as snda:
             assert snda.shape == new_shape
 
-            coords = ((arg_shape[0] + 10,), (arg_shape[1] + 20,))
+            coords = tuple([(arg_shape[i] + 20,) for i in range(ndim)])
             # Implicitly checking there's no raise
             readback = snda.read(coords).tables().concat()
             assert readback == table
@@ -222,9 +236,9 @@ def test_sparse_nd_array_basics(
 def test_dense_nd_array_basics(tmp_path):
     uri = tmp_path.as_posix()
     shape = (100, 200)
-    soma.DenseNDArray.create(uri, type=pa.float64(), shape=shape)
+    tiledbsoma.DenseNDArray.create(uri, type=pa.float64(), shape=shape)
 
-    with soma.DenseNDArray.open(uri) as dnda:
+    with tiledbsoma.DenseNDArray.open(uri) as dnda:
         assert dnda.shape == (100, 200)
 
         # Pending 2.26
@@ -238,3 +252,50 @@ def test_dense_nd_array_basics(tmp_path):
         assert dnda.non_empty_domain() == ((0, 0), (0, 0))
 
         assert dnda.shape == (100, 200)
+
+
+def _check_dataframe_shape_accessors(uri, index_column_names):
+    """Creates and populates a small dataframe indexed on the given columns,
+    then checks that each shape-related accessor reports a single dimension."""
+    schema = pa.schema(
+        [
+            ("soma_joinid", pa.int64()),
+            ("mystring", pa.string()),
+            ("myfloat", pa.float32()),
+        ]
+    )
+
+    data = pa.Table.from_pydict(
+        {
+            "soma_joinid": [0, 1, 2, 3],
+            "mystring": ["a", "b", "a", "b"],
+            "myfloat": [1.0, 2.5, 4.0, 5.5],
+        }
+    )
+
+    with tiledbsoma.DataFrame.create(
+        uri,
+        schema=schema,
+        index_column_names=index_column_names,
+        domain=None,
+    ) as sdf:
+        sdf.write(data)
+
+    with tiledbsoma.DataFrame.open(uri) as sdf:
+        assert len(sdf.shape) == 1
+        assert len(sdf.maxshape) == 1
+        # NOTE(review): this holds for one index column; confirm it is also
+        # the intended result when there are two.
+        assert len(sdf.non_empty_domain()) == 1
+
+
+def test_dataframe_foo_1(tmp_path):
+    """Checks the shape accessors with ``soma_joinid`` as the only index column."""
+    _check_dataframe_shape_accessors(tmp_path.as_posix(), ["soma_joinid"])
+
+
+def test_dataframe_foo_2(tmp_path):
+    """Checks the shape accessors with a second, string-valued index column."""
+    _check_dataframe_shape_accessors(
+        tmp_path.as_posix(), ["soma_joinid", "mystring"]
+    )