Skip to content

Commit

Permalink
[python] Support default-coord reads of corner-written dense ND arrays (
Browse files Browse the repository at this point in the history
#1810)

* [python] Support default-coord reads of corner-written dense ND arrays

* code-review feedback

* remove one open
  • Loading branch information
johnkerl authored Oct 20, 2023
1 parent a4718dd commit bbe7d68
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 13 deletions.
19 changes: 17 additions & 2 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,23 @@ def read(
self._check_open_read()
result_order = somacore.ResultOrder(result_order)

schema = self._handle.schema
target_shape = dense_indices_to_shape(coords, schema.shape, result_order)
# The dense_indices_to_shape includes, as one of its roles, how to handle default
# coordinates -- e.g. `dnda.read()`. The default for a DenseNDArray should be "all the data"
# -- but what is that? If the schema shape matches the non-empty domain -- e.g. at create,
# shape was 100x200, and at write, 100x200 cells were written, those are both the same. But
# if the array was written with room for growth -- e.g. created with shape
# 1,000,000x1,000,000 but only 100x200 cells were written -- then we need the non-empty
# domain.
#
# The non-empty domain is the correct choice in either case.
#
# The only exception is if the array has been created but no data have been written at
# all, in which case the best we can do is use the schema shape.
data_shape = self._handle.schema.shape
ned = self.non_empty_domain()
if ned is not None:
data_shape = tuple(slot[1] + 1 for slot in ned)
target_shape = dense_indices_to_shape(coords, data_shape, result_order)

sr = self._soma_reader(result_order=result_order)

Expand Down
11 changes: 0 additions & 11 deletions apis/python/src/tiledbsoma/_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,17 +342,6 @@ def used_shape(self) -> Tuple[Tuple[int, int], ...]:
retval[i] = (min(ned_lower, bbox_lower), max(ned_upper, bbox_upper))
return tuple(retval)

def non_empty_domain(self) -> Tuple[Tuple[int, int], ...]:
    """
    Returns the non-empty domain of the array: for each dimension, the
    smallest and largest indices at which the sparse array has data occupied.

    This is nominally the same as ``used_shape``, but it can be a tighter
    range -- for example when the leading/trailing rows/columns of the sparse
    array are entirely unoccupied.
    """
    uri = self.uri
    tiledb_ctx = self.context.tiledb_ctx
    # Open the array only long enough to query its non-empty domain.
    with tiledb.open(uri, ctx=tiledb_ctx) as array:
        ned = array.nonempty_domain()
    return ned  # type: ignore

def _compute_bounding_box_metadata(
self,
maxes: Sequence[int],
Expand Down
12 changes: 12 additions & 0 deletions apis/python/src/tiledbsoma/_tdb_handles.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Mapping,
MutableMapping,
Optional,
Tuple,
Type,
TypeVar,
Union,
Expand Down Expand Up @@ -193,6 +194,17 @@ def _opener(
def schema(self) -> tiledb.ArraySchema:
    """Returns the ``schema`` attribute of the wrapped TileDB handle."""
    return self._handle.schema

def non_empty_domain(self) -> Optional[Tuple[Tuple[int, int], ...]]:
    """
    Retrieves the non-empty domain for each dimension, namely the smallest
    and largest indices in each dimension for which the array/dataframe has
    data occupied. This is nominally the same as the domain used at
    creation time, but if for example only a portion of the available domain
    has actually had data written, this function will return a tighter
    range.

    Returns:
        One ``(lower, upper)`` pair per dimension, or ``None`` when the
        underlying TileDB handle reports no non-empty domain (for example,
        the array has been created but no data have been written yet).
        Callers must handle the ``None`` case.
    """
    # The wrapped TileDB call is untyped, hence the ignore.
    return self._handle.nonempty_domain()  # type: ignore


@attrs.define(frozen=True)
class GroupEntry:
Expand Down
11 changes: 11 additions & 0 deletions apis/python/src/tiledbsoma/_tiledb_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@ def schema(self) -> pa.Schema:
"""
return tiledb_schema_to_arrow(self._tiledb_array_schema(), self.uri, self._ctx)

def non_empty_domain(self) -> Tuple[Tuple[int, int], ...]:
    """
    Reports, per dimension, the smallest and largest indices at which the
    array/dataframe actually holds data.

    Nominally this matches the domain used at creation time; when only part
    of the available domain has been written, the range returned here is
    tighter. The query is delegated to the underlying handle.
    """
    handle = self._handle
    return handle.non_empty_domain()

def _tiledb_array_schema(self) -> tiledb.ArraySchema:
    """Returns the TileDB array schema, for internal use.

    This is the handle's ``schema`` as-is; the public ``schema`` accessor
    converts this result to an Arrow schema via ``tiledb_schema_to_arrow``.
    """
    return self._handle.schema
Expand Down
17 changes: 17 additions & 0 deletions apis/python/tests/test_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,23 @@ def test_dense_nd_array_requires_shape(tmp_path, shape_is_numeric):
soma.DenseNDArray.create(uri, type=pa.float32(), shape=(None, None)).close()


def test_dense_nd_array_ned_write(tmp_path):
    """A default-coordinate read returns only the cells that were written.

    The array is created with far more room (1,000,000 cells) than is
    written (4 cells at the low corner); ``read()`` with no coordinates must
    return exactly those 4 values.
    """
    uri = tmp_path.as_posix()
    written = np.asarray([100, 101, 102, 103])

    # Create with room for growth, but write only the first four cells.
    with soma.DenseNDArray.create(uri=uri, type=pa.int32(), shape=[1000000]) as dnda:
        dnda.write((slice(0, 4),), pa.Tensor.from_numpy(written))

    # Re-open for read and compare against a freshly built expectation.
    with soma.DenseNDArray.open(uri) as dnda:
        actual = dnda.read().to_numpy()
        expected = np.asarray([100, 101, 102, 103])
        assert (actual == expected).all()


@pytest.mark.parametrize(
"io",
[
Expand Down

0 comments on commit bbe7d68

Please sign in to comment.