Skip to content

Commit

Permalink
new-shape testing [skip ci]
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Aug 17, 2024
1 parent bdba3c3 commit db05c39
Show file tree
Hide file tree
Showing 29 changed files with 1,224 additions and 104 deletions.
20 changes: 20 additions & 0 deletions apis/python/src/tiledbsoma/_common_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,26 @@ def shape(self) -> Tuple[int, ...]:
"""
return cast(Tuple[int, ...], tuple(self._handle.shape))

@property
def maxshape(self) -> Tuple[int, ...]:
"""Returns the ``maxshape`` reported by the underlying array handle, as a tuple.
NOTE(review): presumably the per-dimension upper bound that ``shape`` can be
resized up to -- confirm against the core domain/current-domain mapping.
Lifecycle:
Experimental.
"""
# Core 2.26: planned for both sparse and dense arrays.
# Core 2.25: dense is not supported yet, so DenseNDArray is expected to
# raise NotImplementedError for this accessor until 2.26.
# The common accessor is kept here for that future shared implementation.
# ``cast`` only informs the type checker; ``tuple`` does the conversion.
return cast(Tuple[int, ...], tuple(self._handle.maxshape))

def resize(self, newshape: Sequence[Union[int, None]]) -> None:
"""Forwards ``newshape`` unchanged to the underlying array handle's ``resize``.
Args:
newshape: Sequence of ``int`` or ``None`` entries.
NOTE(review): presumably one entry per dimension; the meaning of a
``None`` entry is decided by the handle -- confirm.
"""
# Core 2.26: planned for both sparse and dense arrays.
# Core 2.25: dense is not supported yet, so DenseNDArray is expected to
# raise NotImplementedError for this method until 2.26.
# The common method is kept here for that future shared implementation.
self._handle.resize(newshape)

@classmethod
def _dim_capacity_and_extent(
cls,
Expand Down
76 changes: 68 additions & 8 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,22 @@ def create(
"""
context = _validate_soma_tiledb_context(context)
schema = _canonicalize_schema(schema, index_column_names)
if domain is None:
domain = tuple(None for _ in index_column_names)

# XXX comment re mapping:
# * core current_domain <-> SOMA domain
# * core domain <-> SOMA max_domain
#
# As far as the user is concerned, the SOMA domain (core current_domain)
# is the _only_ thing they see and care about. It's resizeable (up to
# max_domain anyway), reads and writes are bounds-checked against it,
# etc.

soma_domain = domain

if soma_domain is None:
soma_domain = tuple(None for _ in index_column_names)
else:
ndom = len(domain)
ndom = len(soma_domain)
nidx = len(index_column_names)
if ndom != nidx:
raise ValueError(
Expand All @@ -228,37 +240,60 @@ def create(
index_column_schema = []
index_column_data = {}

for index_column_name, slot_domain in zip(index_column_names, domain):
for index_column_name, slot_current_domain in zip(
index_column_names, soma_domain
):
pa_field = schema.field(index_column_name)
dtype = _arrow_types.tiledb_type_from_arrow_type(
pa_field.type, is_indexed_column=True
)

slot_domain = _fill_out_slot_domain(
slot_domain, index_column_name, pa_field.type, dtype
slot_current_domain = _fill_out_slot_domain(
slot_current_domain, index_column_name, pa_field.type, dtype
)
slot_max_domain = _fill_out_slot_domain(
None, index_column_name, pa_field.type, dtype
)

extent = _find_extent_for_domain(
index_column_name,
TileDBCreateOptions.from_platform_config(platform_config),
dtype,
slot_domain,
slot_max_domain,
)

# XXX COMMENT
# XXX emphasize:
# [0] core max domain lo
# [1] core max domain hi
# [2] core extent parameter
# [3] core current domain lo
# [4] core current domain hi

index_column_schema.append(pa_field)
index_column_data[pa_field.name] = [*slot_domain, extent]
index_column_data[pa_field.name] = [
*slot_max_domain,
extent,
*slot_current_domain,
]

index_column_info = pa.RecordBatch.from_pydict(
index_column_data, schema=pa.schema(index_column_schema)
)

# print()
# print("INDEX_COLUMN_INFO")
# print(index_column_info.to_pandas())
# print()

plt_cfg = _util.build_clib_platform_config(platform_config)
timestamp_ms = context._open_timestamp_ms(tiledb_timestamp)
try:
clib.SOMADataFrame.create(
uri,
schema=schema,
index_column_info=index_column_info,
# XXX domain=domain,
ctx=context.native_context,
platform_config=plt_cfg,
timestamp=(0, timestamp_ms),
Expand Down Expand Up @@ -317,6 +352,31 @@ def count(self) -> int:
# if is it in read open mode, then it is a DataFrameWrapper
return cast(DataFrameWrapper, self._handle).count

@property
def shape(self) -> Tuple[int, ...]:
"""Returns a one-element tuple holding the first entry of the handle's ``shape``.
This will not necessarily match the bounds of occupied cells within the array.
Rather, it is the bounds outside of which no data may be written.
Lifecycle:
Experimental.
"""
# Only element 0 of the handle's shape is exposed, so the result always has
# length 1 regardless of how many index columns the dataframe has.
# NOTE(review): presumably element 0 corresponds to the first index column
# -- confirm.
return cast(Tuple[int, ...], (self._handle.shape[0],))

@property
def maxshape(self) -> Tuple[int, ...]:
"""Returns a one-element tuple holding the first entry of the handle's ``maxshape``.
NOTE(review): presumably the upper bound that ``shape`` can be resized up
to -- confirm against the core domain/current-domain mapping.
Lifecycle:
Experimental.
"""
# Only element 0 of the handle's maxshape is exposed, so the result always
# has length 1.
return cast(Tuple[int, ...], (self._handle.maxshape[0],))

def resize(self, newshape: Sequence[Union[int, None]]) -> None:
"""Forwards ``newshape`` unchanged to the underlying dataframe handle's ``resize``.
Args:
newshape: Sequence of ``int`` or ``None`` entries.
NOTE(review): how ``None`` entries and sequence length are interpreted
is decided by the handle -- confirm.
"""
self._handle.resize(newshape)

def __len__(self) -> int:
"""Returns the number of rows in the dataframe. Same as ``df.count``."""
return self.count
Expand Down
46 changes: 46 additions & 0 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,17 @@ def create(
) -> Self:
context = _validate_soma_tiledb_context(context)

# XXX comment re mapping:
# * core current_domain <-> (0, SOMA shape minus 1)
# * core domain <-> (0, SOMA max_shape minus 1)
# this is also known as capacity
#
# As far as the user is concerned, the SOMA domain (core current_domain)
# is the _only_ thing they see and care about. It's resizeable (up to max_domain
# anyway), reads and writes are bounds-checked against it, etc.

# XXX note: don't set current_domain for dense arrays until core 2.26

index_column_schema = []
index_column_data = {}
for dim_idx, dim_shape in enumerate(shape):
Expand All @@ -105,6 +116,16 @@ def create(
dim_shape,
TileDBCreateOptions.from_platform_config(platform_config),
)

# XXX COMMENT
# XXX emphasize:
# [0] core max domain lo
# [1] core max domain hi
# [2] core extent parameter
# [3] core current domain lo
# [4] core current domain hi
# XXX note: don't set current_domain for dense arrays until core 2.26

index_column_schema.append(pa_field)
index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent]

Expand Down Expand Up @@ -314,3 +335,28 @@ def _dim_capacity_and_extent(
dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048))

return (dim_capacity, dim_extent)

@property
def maxshape(self) -> Tuple[int, ...]:
"""Not yet supported for dense arrays: always raises ``NotImplementedError``.
Raises:
NotImplementedError: Unconditionally.
Lifecycle:
Experimental.
"""
# Core 2.26: planned for both sparse and dense arrays.
# Core 2.25: dense is not supported, hence the unconditional raise below.
# This override replaces the parent-class accessor until dense support lands.
raise NotImplementedError(
"DenseNDArray maxshape support is scheduled for TileDBSOMA 1.14"
)

def resize(self, newshape: Sequence[Union[int, None]]) -> None:
"""Not yet supported for dense arrays: always raises ``NotImplementedError``.
Args:
newshape: Ignored; the method raises before using it.
Raises:
NotImplementedError: Unconditionally.
Lifecycle:
Experimental.
"""
# Core 2.26: planned for both sparse and dense arrays.
# Core 2.25: dense is not supported, hence the unconditional raise below.
# This override replaces the parent-class method until dense support lands.
raise NotImplementedError(
"DenseNDArray resize support is scheduled for TileDBSOMA 1.14"
)
48 changes: 46 additions & 2 deletions apis/python/src/tiledbsoma/_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,21 +124,65 @@ def create(

index_column_schema = []
index_column_data = {}

# XXX comment re mapping:
# * core current_domain <-> (0, SOMA shape minus 1)
# * core domain <-> (0, SOMA max_shape minus 1)
# this is also known as capacity
#
# As far as the user is concerned, the SOMA domain (core current_domain)
# is the _only_ thing they see and care about. It's resizeable (up to max_domain
# anyway), reads and writes are bounds-checked against it, etc.

# XXX COMMENT
for dim_idx, dim_shape in enumerate(shape):
dim_name = f"soma_dim_{dim_idx}"

pa_field = pa.field(dim_name, pa.int64())
dim_capacity, dim_extent = cls._dim_capacity_and_extent(
dim_name,
dim_shape,
None, # XXX COMMENT
TileDBCreateOptions.from_platform_config(platform_config),
)

if dim_shape == 0:
raise ValueError("Write this message please")
# XXX comment
if dim_shape is None:
dim_shape = dim_capacity
# XXX different comment
# if dim_shape == 0:
# dim_shape = 1

# XXX COMMENT
# XXX emphasize:
# [0] core max domain lo
# [1] core max domain hi
# [2] core extent parameter
# [3] core current domain lo
# [4] core current domain hi

index_column_schema.append(pa_field)
index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent]
# XXX COMMENT
index_column_data[pa_field.name] = [
0,
dim_capacity - 1,
dim_extent,
0,
dim_shape - 1,
]

index_column_info = pa.RecordBatch.from_pydict(
index_column_data, schema=pa.schema(index_column_schema)
)

# print()
# print("INDEX_COLUMN_SCHEMA")
# print(index_column_info.schema)
# print("INDEX_COLUMN_INFO")
# print(index_column_info.to_pandas())
# print()

carrow_type = pyarrow_to_carrow_type(type)
plt_cfg = _util.build_clib_platform_config(platform_config)
timestamp_ms = context._open_timestamp_ms(tiledb_timestamp)
Expand Down
15 changes: 13 additions & 2 deletions apis/python/src/tiledbsoma/_tdb_handles.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Mapping,
MutableMapping,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Expand Down Expand Up @@ -408,6 +409,13 @@ def dim_names(self) -> Tuple[str, ...]:
def shape(self) -> Tuple[int, ...]:
return tuple(self._handle.shape)

@property
def maxshape(self) -> Tuple[int, ...]:
"""Returns the wrapped handle's ``maxshape`` converted to a tuple."""
return tuple(self._handle.maxshape)

def resize(self, newshape: Sequence[Union[int, None]]) -> None:
"""Forwards ``newshape`` unchanged to the wrapped handle's ``resize``."""
self._handle.resize(newshape)


class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]):
"""Wrapper around a Pybind11 SOMADataFrame handle."""
Expand All @@ -423,8 +431,11 @@ def write(self, values: pa.RecordBatch) -> None:

@property
def shape(self) -> Tuple[int, ...]:
    """Returns the wrapped dataframe handle's ``shape`` converted to a tuple.

    The earlier unconditional ``raise NotImplementedError`` (and its
    "not implemented for DataFrames" comment) is dropped: it made the
    ``return`` unreachable.
    """
    return tuple(self._handle.shape)

@property
def maxshape(self) -> Tuple[int, ...]:
"""Returns the wrapped dataframe handle's ``maxshape`` converted to a tuple."""
return tuple(self._handle.maxshape)


class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]):
Expand Down
6 changes: 5 additions & 1 deletion apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,8 @@ def _write_dataframe_impl(
schema=arrow_table.schema,
platform_config=platform_config,
context=context,
domain=((0, df.shape[0] - 1),),
# XXX DOMAIN
)
except (AlreadyExistsError, NotCreateableError):
if ingestion_params.error_if_already_exists:
Expand Down Expand Up @@ -1302,7 +1304,9 @@ def _create_from_matrix(

try:
# A SparseNDArray must be appendable in soma.io.
shape = [None for _ in matrix.shape] if cls.is_sparse else matrix.shape
# XXX this can be numpy.int64 -- this is for the type-checker
# shape = matrix.shape
shape = tuple([int(e) for e in matrix.shape])
soma_ndarray = cls.create(
uri,
type=pa.from_numpy_dtype(matrix.dtype),
Expand Down
27 changes: 17 additions & 10 deletions apis/python/src/tiledbsoma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -500,14 +500,23 @@ void load_soma_array(py::module& m) {
py::gil_scoped_release release;

// Try to read more data
auto buffers = array.read_next();

// If more data was read, convert it to an arrow table and
// return
if (buffers.has_value()) {
// Acquire python GIL before accessing python objects
py::gil_scoped_acquire acquire;
return to_table(*buffers);
try {
auto buffers = array.read_next();

// If more data was read, convert it to an arrow table and
// return
if (buffers.has_value()) {
// Acquire python GIL before accessing python objects
py::gil_scoped_acquire acquire;
return to_table(*buffers);
}

} catch (const TileDBSOMAIndexError& e) {
// Re-raise as ValueError to preserve index-out-of-bounds
// reporting semantics in the current-domain/new-shape era.
throw py::value_error(e.what());
} catch (const std::exception& e) {
throw e;
}

// No data was read, the query is complete, return nullopt
Expand All @@ -520,8 +529,6 @@ void load_soma_array(py::module& m) {

.def("nnz", &SOMAArray::nnz, py::call_guard<py::gil_scoped_release>())

.def_property_readonly("shape", &SOMAArray::shape)

.def_property_readonly("uri", &SOMAArray::uri)

.def_property_readonly("column_names", &SOMAArray::column_names)
Expand Down
Loading

0 comments on commit db05c39

Please sign in to comment.