Skip to content

Commit

Permalink
[python/r] Expose shape-related accessors from C++ to bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 4, 2024
1 parent 0025ed7 commit 8ef6cf4
Show file tree
Hide file tree
Showing 20 changed files with 572 additions and 12 deletions.
11 changes: 11 additions & 0 deletions apis/python/src/tiledbsoma/_common_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,17 @@ def shape(self) -> Tuple[int, ...]:
"""
return cast(Tuple[int, ...], tuple(self._handle.shape))

@property
def maxshape(self) -> Tuple[int, ...]:
    """Returns the maximum resizable capacity of each dimension, always a tuple of
    length ``ndim``.

    This will not necessarily match the bounds of occupied cells within the array.
    It is the upper limit for ``resize`` on the array.

    Lifecycle:
        Maturing.
    """
    # The handle reports maxshape as some iterable; freeze it as a tuple.
    capacity = tuple(self._handle.maxshape)
    return cast(Tuple[int, ...], capacity)

@classmethod
def _dim_capacity_and_extent(
cls,
Expand Down
23 changes: 23 additions & 0 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,29 @@ def count(self) -> int:
# if it is in read open mode, then it is a DataFrameWrapper
return cast(DataFrameWrapper, self._handle).count

@property
def _maybe_soma_joinid_shape(self) -> Optional[int]:
    """An internal helper that returns the shape value along the ``soma_joinid``
    index column, if the ``DataFrame`` has one, else ``None``.

    Lifecycle:
        Experimental.
    """
    # Pure delegation: the underlying handle exposes the value.
    handle = self._handle
    return handle.maybe_soma_joinid_shape

@property
def _maybe_soma_joinid_maxshape(self) -> Optional[int]:
    """An internal helper that returns the maxshape value along the ``soma_joinid``
    index column, if the ``DataFrame`` has one, else ``None``.

    Lifecycle:
        Experimental.
    """
    # Pure delegation: the underlying handle exposes the value.
    handle = self._handle
    return handle.maybe_soma_joinid_maxshape

def __len__(self) -> int:
    """Returns the number of rows in the dataframe; identical to ``df.count``."""
    row_count = self.count
    return row_count
Expand Down
41 changes: 37 additions & 4 deletions apis/python/src/tiledbsoma/_tdb_handles.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,23 @@ def dim_names(self) -> Tuple[str, ...]:

@property
def shape(self) -> Tuple[int, ...]:
return tuple(self._handle.shape)
"""Not implemented for DataFrame."""
return cast(Tuple[int, ...], tuple(self._handle.shape))

@property
def maxshape(self) -> Tuple[int, ...]:
    """Return the wrapped handle's ``maxshape`` as a tuple.

    Not implemented for DataFrame.
    """
    upper_bounds = tuple(self._handle.maxshape)
    return cast(Tuple[int, ...], upper_bounds)

@property
def maybe_soma_joinid_shape(self) -> Optional[int]:
    """Only implemented for DataFrame.

    Raises:
        NotImplementedError: always, in this implementation.
    """
    raise NotImplementedError()

@property
def maybe_soma_joinid_maxshape(self) -> Optional[int]:
    """Only implemented for DataFrame.

    Raises:
        NotImplementedError: always, in this implementation.
    """
    raise NotImplementedError()


class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]):
Expand All @@ -422,9 +438,26 @@ def write(self, values: pa.RecordBatch) -> None:
self._handle.write(values)

@property
def shape(self) -> Tuple[int, ...]:
# Shape is not implemented for DataFrames
raise NotImplementedError
def maybe_soma_joinid_shape(self) -> Optional[int]:
    """Return the shape slot for the soma_joinid dim, if the array has one.

    This is an important test-point and dev-internal access-point, in
    particular for the tiledbsoma-io experiment-level resizer.

    Lifecycle:
        Maturing.
    """
    slot = self._handle.maybe_soma_joinid_shape
    return cast(Optional[int], slot)

@property
def maybe_soma_joinid_maxshape(self) -> Optional[int]:
    """Return the maxshape slot for the soma_joinid dim, if the array has one.

    This is an important test-point and dev-internal access-point, in
    particular for the tiledbsoma-io experiment-level resizer.

    Lifecycle:
        Maturing.
    """
    slot = self._handle.maybe_soma_joinid_maxshape
    return cast(Optional[int], slot)


class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]):
Expand Down
6 changes: 6 additions & 0 deletions apis/python/src/tiledbsoma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ void load_soma_dataframe(py::module& m) {
.def_static("exists", &SOMADataFrame::exists)
.def_property_readonly(
"index_column_names", &SOMADataFrame::index_column_names)

.def_property_readonly(
"maybe_soma_joinid_shape", &SOMADataFrame::maybe_soma_joinid_shape)
.def_property_readonly(
"maybe_soma_joinid_maxshape",
&SOMADataFrame::maybe_soma_joinid_maxshape)
.def_property_readonly(
"count",
&SOMADataFrame::count,
Expand Down
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsoma/soma_dense_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ void load_soma_dense_ndarray(py::module& m) {

.def_static("exists", &SOMADenseNDArray::exists)

.def("write", write);
.def("write", write)

.def_property_readonly("shape", &SOMADenseNDArray::shape)
.def_property_readonly("maxshape", &SOMADenseNDArray::maxshape);
}
} // namespace libtiledbsomacpp
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsoma/soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ void load_soma_sparse_ndarray(py::module& m) {
"result_order"_a = ResultOrder::automatic,
"timestamp"_a = py::none())

.def_static("exists", &SOMASparseNDArray::exists);
.def_static("exists", &SOMASparseNDArray::exists)

.def_property_readonly("shape", &SOMASparseNDArray::shape)
.def_property_readonly("maxshape", &SOMASparseNDArray::maxshape);
}
} // namespace libtiledbsomacpp
4 changes: 4 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ def test_dataframe(tmp_path, arrow_schema):
with pytest.raises(AttributeError):
assert sdf.shape is None

# soma_joinid is not a dim here
assert sdf._maybe_soma_joinid_shape is None
assert sdf._maybe_soma_joinid_maxshape is None

# Read all
table = sdf.read().concat()
assert table.num_rows == 5
Expand Down
207 changes: 207 additions & 0 deletions apis/python/tests/test_shape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
from __future__ import annotations

import pyarrow as pa
import pytest

import tiledbsoma

from tests._util import maybe_raises


@pytest.mark.parametrize(
    "element_dtype",
    [
        pa.float64(),
        pa.float32(),
        pa.int64(),
        pa.uint16(),
    ],
)
@pytest.mark.parametrize(
    "shape_exc",
    [
        # Note: non-None exceptions are coming on https://github.com/single-cell-data/TileDB-SOMA/issues/2407
        [(100,), None],
        [(100, 200), None],
        [(100, 200, 300), None],
    ],
)
def test_sparse_nd_array_basics(
    tmp_path,
    element_dtype,
    shape_exc,
):
    """Exercise ``shape``/``maxshape``/``non_empty_domain`` on a sparse array.

    The array is created with the parametrized shape, checked while empty,
    checked again after an in-bounds write, and finally probed with an
    out-of-bounds read and an out-of-bounds write, both of which must fail
    without changing the reported shape.
    """
    uri = tmp_path.as_posix()
    arg_shape, arg_create_exc = shape_exc
    ndim = len(arg_shape)

    # Create the array. The returned handle is closed right away so that no
    # write handle lingers while the array is reopened below.
    with maybe_raises(arg_create_exc):
        tiledbsoma.SparseNDArray.create(
            uri,
            type=element_dtype,
            shape=arg_shape,
        ).close()
    if arg_create_exc is not None:
        return

    assert tiledbsoma.SparseNDArray.exists(uri)

    # Test the various accessors
    with tiledbsoma.SparseNDArray.open(uri) as snda:
        assert snda.shape == arg_shape

        # Before current-domain support: shape is maxshape.
        #
        # With current-domain support:
        # We expect maxshape to be set to a big signed int32. (There are details on the exact
        # value of that number, involving R compatibility, and leaving room for a single tile
        # capacity, etc ... we could check for some magic value but it suffices to check that
        # it's over 2 billion.)
        assert snda.shape == snda.maxshape
        # for e in snda.maxshape:
        #    assert e > 2_000_000_000

        # No data have been written for this test case
        assert snda.non_empty_domain() == tuple([(0, 0)] * ndim)

    # soma_dim_0: (0,1)
    # soma_dim_1: (2,3)
    # soma_dim_2: (4,5)
    dim_names = [f"soma_dim_{i}" for i in range(ndim)]
    coords = tuple((2 * i, 2 * i + 1) for i in range(ndim))

    # Write some data
    with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
        dikt = {"soma_data": [4, 5]}
        dikt.update(zip(dim_names, coords))
        table = pa.Table.from_pydict(dikt)
        snda.write(table)

    # Test the various accessors
    with tiledbsoma.SparseNDArray.open(uri) as snda:
        assert snda.shape == arg_shape
        # This will change with current-domain support
        assert snda.shape == snda.maxshape
        # for e in snda.maxshape:
        #    assert e > 2_000_000_000
        assert snda.non_empty_domain() == coords

    # Test reads out of bounds
    with tiledbsoma.SparseNDArray.open(uri) as snda:
        oob_coords = tuple(extent + 10 for extent in arg_shape)
        # TODO: make sure this doesn't come through as RuntimeError
        # https://github.com/single-cell-data/TileDB-SOMA/issues/2407
        with pytest.raises((RuntimeError, ValueError)):
            snda.read(oob_coords).tables().concat()

    # Test writes out of bounds
    with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
        # Keep the soma_data column alongside the out-of-bounds coordinates:
        # the failure under test must come from the coordinates, not from a
        # table that is missing its value column.
        dikt = {"soma_data": [30]}
        dikt.update(
            (name, [extent + 20]) for name, extent in zip(dim_names, arg_shape)
        )
        table = pa.Table.from_pydict(dikt)
        with pytest.raises(tiledbsoma.SOMAError):
            snda.write(table)

    # The failed operations must not have altered the shape.
    with tiledbsoma.SparseNDArray.open(uri) as snda:
        assert snda.shape == arg_shape
        assert snda.shape == snda.maxshape


## Pending 2.27 timeframe for dense support for current domain, including resize
## TODO: mark these with a linked GitHub tracking issue
def test_dense_nd_array_basics(tmp_path):
    """Check ``shape``, ``maxshape`` and ``non_empty_domain`` on a fresh dense array.

    No data are written, so shape and maxshape are both expected to equal the
    creation shape.
    """
    uri = tmp_path.as_posix()
    shape = (100, 200)
    # Close the handle returned by create() so it does not stay open while the
    # array is reopened for reading.
    tiledbsoma.DenseNDArray.create(uri, type=pa.float64(), shape=shape).close()

    with tiledbsoma.DenseNDArray.open(uri) as dnda:
        assert dnda.shape == shape
        assert dnda.maxshape == shape

        assert dnda.non_empty_domain() == ((0, 0), (0, 0))


@pytest.mark.parametrize(
    "soma_joinid_domain",
    [
        # TODO: https://github.com/single-cell-data/TileDB-SOMA/issues/2407
        # None,
        (0, 1),
        (0, 3),
        (0, 100),
    ],
)
@pytest.mark.parametrize(
    "index_column_names",
    [
        ["soma_joinid"],
        ["soma_joinid", "myint"],
        ["soma_joinid", "mystring"],
        ["mystring", "myint"],
    ],
)
def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
    """Check the ``soma_joinid`` shape/maxshape helpers on a ``DataFrame``.

    For each combination of index columns and ``soma_joinid`` domain, create
    the dataframe, write four rows (expecting a ``SOMAError`` when a row falls
    outside the ``soma_joinid`` domain and ``soma_joinid`` is indexed), then
    verify the helpers report ``1 + domain upper bound`` when ``soma_joinid``
    is an index column and ``None`` otherwise.
    """
    uri = tmp_path.as_posix()

    schema = pa.schema(
        [
            ("soma_joinid", pa.int64()),
            ("mystring", pa.string()),
            ("myint", pa.int16()),
            ("myfloat", pa.float32()),
        ]
    )

    data = pa.Table.from_pydict(
        {
            "soma_joinid": [0, 1, 2, 3],
            "mystring": ["a", "b", "a", "b"],
            "myint": [20, 30, 40, 50],
            "myfloat": [1.0, 2.5, 4.0, 5.5],
        }
    )

    domain_slots = {
        "soma_joinid": soma_joinid_domain,
        "mystring": None,
        "myint": (-1000, 1000),
        "myfloat": (-999.5, 999.5),
    }
    domain = tuple(domain_slots[column] for column in index_column_names)

    # A write is out of bounds only if soma_joinid is actually a dim and at
    # least one soma_joinid value lies outside the requested domain.
    joinid_is_outside = any(
        joinid.as_py() < soma_joinid_domain[0]
        or joinid.as_py() > soma_joinid_domain[1]
        for joinid in data["soma_joinid"]
    )
    oob_write = joinid_is_outside and "soma_joinid" in index_column_names

    with tiledbsoma.DataFrame.create(
        uri,
        schema=schema,
        index_column_names=index_column_names,
        domain=domain,
    ) as sdf:
        if not oob_write:
            sdf.write(data)
        else:
            with pytest.raises(tiledbsoma.SOMAError):
                sdf.write(data)

    with tiledbsoma.DataFrame.open(uri) as sdf:
        if "soma_joinid" not in index_column_names:
            assert sdf._maybe_soma_joinid_shape is None
            assert sdf._maybe_soma_joinid_maxshape is None
        else:
            assert sdf._maybe_soma_joinid_shape == 1 + soma_joinid_domain[1]
            assert sdf._maybe_soma_joinid_maxshape == 1 + soma_joinid_domain[1]

        assert len(sdf.non_empty_domain()) == len(index_column_names)
8 changes: 8 additions & 0 deletions apis/python/tests/test_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ def test_sparse_nd_array_create_ok(
assert a.uri == tmp_path.as_posix()
assert a.ndim == len(shape)
assert a.shape == tuple(shape)

# TODO: more testing with current-domain feature integrated
# https://github.com/single-cell-data/TileDB-SOMA/issues/2407
assert isinstance(a.maxshape, tuple)
assert len(a.maxshape) == len(a.shape)
for maxshape, shape in zip(a.maxshape, a.shape):
assert maxshape >= shape

assert a.is_sparse is True

assert a.schema is not None
Expand Down
12 changes: 12 additions & 0 deletions apis/r/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,18 @@ shape <- function(uri, config = NULL) {
.Call(`_tiledbsoma_shape`, uri, config)
}

# Thin R shim: forwards `uri` and `config` to the native routine
# `_tiledbsoma_maxshape` and returns whatever that routine returns.
# NOTE(review): per the diff header this lives in RcppExports.R, which is
# normally generated -- confirm before hand-editing.
maxshape <- function(uri, config = NULL) {
.Call(`_tiledbsoma_maxshape`, uri, config)
}

# Thin R shim: forwards `uri` and `config` to the native routine
# `_tiledbsoma_maybe_soma_joinid_shape` and returns its result.
# NOTE(review): per the diff header this lives in RcppExports.R, which is
# normally generated -- confirm before hand-editing.
maybe_soma_joinid_shape <- function(uri, config = NULL) {
.Call(`_tiledbsoma_maybe_soma_joinid_shape`, uri, config)
}

# Thin R shim: forwards `uri` and `config` to the native routine
# `_tiledbsoma_maybe_soma_joinid_maxshape` and returns its result.
# NOTE(review): per the diff header this lives in RcppExports.R, which is
# normally generated -- confirm before hand-editing.
maybe_soma_joinid_maxshape <- function(uri, config = NULL) {
.Call(`_tiledbsoma_maybe_soma_joinid_maxshape`, uri, config)
}

#' Iterator-Style Access to SOMA Array via SOMAArray
#'
#' The `sr_*` functions provide low-level access to an instance of the SOMAArray
Expand Down
Loading

0 comments on commit 8ef6cf4

Please sign in to comment.